TODO: Move to an ssl subpackage when we have other classes other than
+ * just this one.
+ *
+ * @author stack
+ * @version $Id$
+ */
+public class ConfigurableX509TrustManager implements X509TrustManager
+{
+    /**
+     * Logging instance.
+     */
+    protected static Logger logger = Logger.getLogger(
+        "org.archive.httpclient.ConfigurableX509TrustManager");
+
+    /**
+     * Levels of trust this manager can be configured to apply.
+     */
+    public static enum TrustLevel {
+        /**
+         * Trust anything given us.
+         *
+         * Default setting.
+         *
+         * See "e502. Disabling Certificate Validation in an HTTPS
+         * Connection" from the java almanac for how to trust all.
+         */
+        OPEN,
+
+        /**
+         * Trust any valid cert including self-signed certificates.
+         */
+        LOOSE,
+
+        /**
+         * Normal jsse behavior.
+         *
+         * Seemingly any certificate that supplies valid chain of trust.
+         */
+        NORMAL,
+
+        /**
+         * Strict trust.
+         *
+         * Ensure server has same name as cert DN.
+         */
+        STRICT,
+    }
+
+    /**
+     * Default setting for trust level.
+     */
+    public final static TrustLevel DEFAULT = TrustLevel.OPEN;
+
+    /**
+     * Trust level.
+     */
+    private TrustLevel trustLevel = DEFAULT;
+
+
+    /**
+     * An instance of the SUNX509TrustManager that we adapt variously
+     * depending upon passed configuration.
+     *
+     * We have it do all the work we don't want to.
+     */
+    private X509TrustManager standardTrustManager = null;
+
+
+    /**
+     * Creates a trust manager operating at the {@link #DEFAULT} trust level.
+     *
+     * @throws NoSuchAlgorithmException
+     * @throws KeyStoreException
+     */
+    public ConfigurableX509TrustManager()
+    throws NoSuchAlgorithmException, KeyStoreException {
+        this(DEFAULT);
+    }
+
+    /**
+     * Constructor.
+     *
+     * @param level Level of trust to effect.
+     *
+     * @throws NoSuchAlgorithmException If the default trust manager
+     * algorithm is unavailable or yields no X509 trust manager.
+     * @throws KeyStoreException If initializing the factory with the
+     * default trust keystore fails.
+     */
+    public ConfigurableX509TrustManager(TrustLevel level)
+    throws NoSuchAlgorithmException, KeyStoreException {
+        super();
+        TrustManagerFactory factory = TrustManagerFactory.
+            getInstance(TrustManagerFactory.getDefaultAlgorithm());
+
+        // Pass in a null (Trust) KeyStore.  Null says use the 'default'
+        // 'trust' keystore (KeyStore class is used to hold keys and to hold
+        // 'trusts' (certs)). See 'X509TrustManager Interface' in this doc:
+        // http://java.sun.com
+        // /j2se/1.4.2/docs/guide/security/jsse/JSSERefGuide.html#Introduction
+        factory.init((KeyStore)null);
+        this.standardTrustManager =
+            findX509TrustManager(factory.getTrustManagers());
+
+        this.trustLevel = level;
+    }
+
+    /**
+     * Picks the first X509-capable trust manager out of those supplied.
+     *
+     * The factory is free to return managers of other types, so the
+     * array is scanned rather than blindly casting its first element.
+     *
+     * @param candidates Trust managers returned by the factory.
+     * @return First candidate that is an {@link X509TrustManager}.
+     * @throws NoSuchAlgorithmException If there is no such candidate
+     * (including when the array is empty).
+     */
+    private static X509TrustManager findX509TrustManager(
+            TrustManager[] candidates) throws NoSuchAlgorithmException {
+        for (TrustManager candidate : candidates) {
+            if (candidate instanceof X509TrustManager) {
+                return (X509TrustManager)candidate;
+            }
+        }
+        throw new NoSuchAlgorithmException(TrustManagerFactory.
+            getDefaultAlgorithm() + " trust manager not supported");
+    }
+
+    /**
+     * Accepts any client chain at {@link TrustLevel#OPEN}; at every other
+     * level the decision is delegated to the standard trust manager.
+     *
+     * @param certificates Peer certificate chain.
+     * @param type Authentication type.
+     * @throws CertificateException If the standard trust manager rejects
+     * the chain.
+     */
+    public void checkClientTrusted(X509Certificate[] certificates, String type)
+    throws CertificateException {
+        if (this.trustLevel.equals(TrustLevel.OPEN)) {
+            return;
+        }
+
+        this.standardTrustManager.checkClientTrusted(certificates, type);
+    }
+
+    /**
+     * Checks a server chain according to the configured trust level.
+     *
+     * OPEN accepts everything.  Otherwise the standard trust manager is
+     * consulted first.  If it rejects the chain and the level is LOOSE
+     * and the chain holds exactly one certificate, that certificate is
+     * accepted as long as it is currently within its validity period.
+     * STRICT is not implemented: it logs at severe level and otherwise
+     * behaves like NORMAL.
+     *
+     * @param certificates Peer certificate chain.
+     * @param type Key exchange algorithm used.
+     * @throws CertificateException If the chain is not trusted at the
+     * configured level.
+     */
+    public void checkServerTrusted(X509Certificate[] certificates, String type)
+    throws CertificateException {
+        if (this.trustLevel.equals(TrustLevel.OPEN)) {
+            return;
+        }
+
+        try {
+            this.standardTrustManager.checkServerTrusted(certificates, type);
+            if (this.trustLevel.equals(TrustLevel.STRICT)) {
+                logger.severe(TrustLevel.STRICT + " not implemented.");
+            }
+        } catch (CertificateException e) {
+            if (this.trustLevel.equals(TrustLevel.LOOSE) &&
+                certificates != null && certificates.length == 1)
+            {
+                // If only one cert and its valid and it caused a
+                // CertificateException, assume its selfsigned.
+                // checkValidity() itself throws if the cert is expired
+                // or not yet valid.
+                X509Certificate certificate = certificates[0];
+                certificate.checkValidity();
+            } else {
+                // If we got to here, then we're probably NORMAL. Rethrow.
+                throw e;
+            }
+        }
+    }
+
+    /**
+     * @return The accepted issuers reported by the standard trust manager.
+     */
+    public X509Certificate[] getAcceptedIssuers() {
+        return this.standardTrustManager.getAcceptedIssuers();
+    }
+}
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java
new file mode 100644
index 00000000..105c4f7e
--- /dev/null
+++ b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java
@@ -0,0 +1,120 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.httpclient;
+
+import java.io.IOException;
+import java.util.logging.Logger;
+
+import org.apache.commons.httpclient.HttpConnection;
+import org.apache.commons.httpclient.HttpException;
+import org.apache.commons.httpclient.HttpState;
+import org.apache.commons.httpclient.methods.GetMethod;
+import org.archive.util.Recorder;
+
+
+/**
+ * Override of GetMethod that marks the passed HttpRecorder w/ the transition
+ * from HTTP head to body and that forces a close on the http connection.
+ *
+ * The actions done in this subclass used to be done by copying
+ * org.apache.commons.HttpMethodBase, overlaying our version in place of the
+ * one that came w/ httpclient. Here is the patch of the difference between
+ * shipped httpclient code and our mods:
+ *
+ * We're not supposed to have access to the underlying connection object;
+ * we only violate that contract because we see cases where httpclient skips
+ * out w/o cleaning up after itself.
+ *
+ * @author stack
+ * @version $Revision$, $Date$
+ */
+public class HttpRecorderGetMethod extends GetMethod {
+
+    protected static Logger logger =
+        Logger.getLogger(HttpRecorderGetMethod.class.getName());
+
+    /**
+     * Helper holding the recorder and connection on behalf of this method.
+     */
+    protected HttpRecorderMethod httpRecorderMethod;
+
+
+    /**
+     * @param uri URI to GET.
+     * @param recorder Recorder handed to the wrapped
+     * {@link HttpRecorderMethod}.
+     */
+    public HttpRecorderGetMethod(String uri, Recorder recorder) {
+        super(uri);
+        httpRecorderMethod = new HttpRecorderMethod(recorder);
+    }
+
+    /**
+     * Flags the head-to-body transition on the recorder helper, then lets
+     * the superclass read the response body.
+     */
+    protected void readResponseBody(HttpState state, HttpConnection connection)
+    throws IOException, HttpException {
+        httpRecorderMethod.markContentBegin(connection);
+        super.readResponseBody(state, connection);
+    }
+
+    /**
+     * Unconditionally asks for the connection to be closed after the
+     * request.  Probably redundant given the client is set to HTTP/1.0;
+     * kept out of paranoia.
+     */
+    protected boolean shouldCloseConnection(HttpConnection conn) {
+        return true;
+    }
+
+    /**
+     * Remembers the connection on the recorder helper before delegating
+     * execution to the superclass.  We are not meant to hold on to the
+     * underlying connection; this is done only because httpclient has
+     * been seen skipping out w/o cleaning up after itself.
+     */
+    public int execute(HttpState state, HttpConnection conn)
+    throws HttpException, IOException {
+        httpRecorderMethod.setConnection(conn);
+        return super.execute(state, conn);
+    }
+
+    /**
+     * Lets the superclass add any Proxy-Connection header, then has the
+     * recorder helper post-process it.
+     */
+    protected void addProxyConnectionHeader(HttpState state, HttpConnection conn)
+    throws IOException, HttpException {
+        super.addProxyConnectionHeader(state, conn);
+        httpRecorderMethod.handleAddProxyConnectionHeader(this);
+    }
+}
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java
new file mode 100644
index 00000000..932e7e98
--- /dev/null
+++ b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java
@@ -0,0 +1,107 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.httpclient;
+
+import java.util.logging.Logger;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpConnection;
+import org.apache.commons.httpclient.HttpMethod;
+import org.archive.util.Recorder;
+
+
+/**
+ * This class encapsulates the specializations supplied by the
+ * overrides {@link HttpRecorderGetMethod} and {@link HttpRecorderPostMethod}.
+ *
+ * It keeps instance of HttpRecorder and HttpConnection.
+ *
+ * @author stack
+ * @version $Revision$, $Date$
+ */
+public class HttpRecorderMethod {
+    protected static Logger logger =
+        Logger.getLogger(HttpRecorderMethod.class.getName());
+
+    /**
+     * Recorder that gets told when the response content begins.
+     */
+    private Recorder httpRecorder;
+
+    /**
+     * Connection kept around so a close can be forced.
+     *
+     * See [ 922080 ] IllegalArgumentException (size is wrong).
+     * https://sourceforge.net/tracker/?func=detail&aid=922080&group_id=73833&atid=539099
+     */
+    private HttpConnection connection;
+
+
+    /**
+     * @param recorder Recorder to notify of the content start.
+     */
+    public HttpRecorderMethod(Recorder recorder) {
+        httpRecorder = recorder;
+    }
+
+    /**
+     * Marks the start of content on the recorder, provided the passed
+     * connection is the very one previously stored via
+     * {@link #setConnection(HttpConnection)}.
+     *
+     * @param c Connection the body is about to be read from.
+     * @throws IllegalArgumentException If <code>c</code> is not the
+     * stored connection instance.
+     */
+    public void markContentBegin(HttpConnection c) {
+        if (c == connection) {
+            httpRecorder.markContentBegin();
+            return;
+        }
+        // Asked to work on a connection other than the one this
+        // method#execute started with.
+        throw new IllegalArgumentException("Connections differ: " +
+            connection + " " + c + " " +
+            Thread.currentThread().getName());
+    }
+
+    /**
+     * @return Returns the connection.
+     */
+    public HttpConnection getConnection() {
+        return connection;
+    }
+
+    /**
+     * @param connection The connection to set.
+     */
+    public void setConnection(HttpConnection connection) {
+        this.connection = connection;
+    }
+
+    /**
+     * @return Returns the httpRecorder.
+     */
+    public Recorder getHttpRecorder() {
+        return httpRecorder;
+    }
+
+    /**
+     * Rewrites an existing 'Proxy-Connection' request header to 'close'.
+     *
+     * Such a header, when present, will be of the 'keep-alive' type;
+     * until 'keep-alives' are supported every request would otherwise
+     * have to time out before end-of-document is noticed.  Nothing is
+     * done when the header is absent.
+     *
+     * @param method Method to find proxy-connection header in.
+     */
+    public void handleAddProxyConnectionHeader(HttpMethod method) {
+        Header proxyConnection = method.getRequestHeader("Proxy-Connection");
+        if (proxyConnection == null) {
+            return;
+        }
+        proxyConnection.setValue("close");
+        method.setRequestHeader(proxyConnection);
+    }
+}
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java
new file mode 100644
index 00000000..20f1bfd1
--- /dev/null
+++ b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java
@@ -0,0 +1,82 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.httpclient;
+
+import java.io.IOException;
+
+import org.apache.commons.httpclient.HttpConnection;
+import org.apache.commons.httpclient.HttpException;
+import org.apache.commons.httpclient.HttpState;
+import org.apache.commons.httpclient.methods.PostMethod;
+import org.archive.util.Recorder;
+
+
+/**
+ * Override of PostMethod that marks the passed HttpRecorder w/ the transition
+ * from HTTP head to body and that forces a close on the responseConnection.
+ *
+ * This is a copy of {@link HttpRecorderGetMethod}. Only difference is the
+ * parent subclass.
+ *
+ * @author stack
+ * @version $Date$ $Revision$
+ */
+public class HttpRecorderPostMethod extends PostMethod {
+    /**
+     * Helper holding the recorder and connection on behalf of this method.
+     */
+    protected HttpRecorderMethod httpRecorderMethod;
+
+
+    /**
+     * @param uri URI to POST to.
+     * @param recorder Recorder handed to the wrapped
+     * {@link HttpRecorderMethod}.
+     */
+    public HttpRecorderPostMethod(String uri, Recorder recorder) {
+        super(uri);
+        httpRecorderMethod = new HttpRecorderMethod(recorder);
+    }
+
+    /**
+     * Flags the head-to-body transition on the recorder helper, then lets
+     * the superclass read the response body.
+     */
+    protected void readResponseBody(HttpState state, HttpConnection connection)
+    throws IOException, HttpException {
+        httpRecorderMethod.markContentBegin(connection);
+        super.readResponseBody(state, connection);
+    }
+
+    /**
+     * Unconditionally asks for the connection to be closed after the
+     * request.  Probably redundant given the client is set to HTTP/1.0;
+     * kept out of paranoia.
+     */
+    protected boolean shouldCloseConnection(HttpConnection conn) {
+        return true;
+    }
+
+    /**
+     * Remembers the connection on the recorder helper before delegating
+     * execution to the superclass.  We are not meant to hold on to the
+     * underlying connection; this is done only because httpclient has
+     * been seen skipping out w/o cleaning up after itself.
+     */
+    public int execute(HttpState state, HttpConnection conn)
+    throws HttpException, IOException {
+        httpRecorderMethod.setConnection(conn);
+        return super.execute(state, conn);
+    }
+
+    /**
+     * Lets the superclass add any Proxy-Connection header, then has the
+     * recorder helper post-process it.
+     */
+    protected void addProxyConnectionHeader(HttpState state, HttpConnection conn)
+    throws IOException, HttpException {
+        super.addProxyConnectionHeader(state, conn);
+        httpRecorderMethod.handleAddProxyConnectionHeader(this);
+    }
+}
diff --git a/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java
new file mode 100644
index 00000000..4ba6a837
--- /dev/null
+++ b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java
@@ -0,0 +1,70 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.httpclient;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.httpclient.HostConfiguration;
+import org.apache.commons.httpclient.HttpConnection;
+import org.apache.commons.httpclient.SimpleHttpConnectionManager;
+
+/**
+ * An HttpClient-compatible HttpConnection "manager" that actually
+ * just gives out a new connection each time -- skipping the overhead
+ * of connection management, since we already throttle our crawler
+ * with external mechanisms.
+ *
+ * @author gojomo
+ */
+public class SingleHttpConnectionManager extends SimpleHttpConnectionManager {
+
+    public SingleHttpConnectionManager() {
+        super();
+    }
+
+    /**
+     * Builds a brand-new connection for the given host configuration on
+     * every call; the <code>timeout</code> argument is not consulted.
+     *
+     * @param hostConfiguration Host the connection is for.
+     * @param timeout Ignored.
+     * @return A new connection managed by this instance and defaulting
+     * to this manager's params.
+     */
+    public HttpConnection getConnectionWithTimeout(
+            HostConfiguration hostConfiguration, long timeout) {
+        HttpConnection fresh = new HttpConnection(hostConfiguration);
+        fresh.setHttpConnectionManager(this);
+        fresh.getParams().setDefaults(getParams());
+        return fresh;
+    }
+
+    /**
+     * Closes the connection, then disposes of any last response stream
+     * still attached to it.
+     *
+     * @param conn Connection being handed back.
+     */
+    public void releaseConnection(HttpConnection conn) {
+        conn.close();
+        finishLast(conn);
+    }
+
+    /**
+     * Detaches and closes the connection's last response stream, if any.
+     * Copied from the superclass because it wasn't made available to
+     * subclasses.
+     *
+     * @param conn Connection whose last response stream is to be closed.
+     */
+    protected static void finishLast(HttpConnection conn) {
+        InputStream pending = conn.getLastResponseInputStream();
+        if (pending == null) {
+            return;
+        }
+        conn.setLastResponseInputStream(null);
+        try {
+            pending.close();
+        } catch (IOException ioe) {
+            //FIXME: badness - close to force reconnect.
+            conn.close();
+        }
+    }
+}
diff --git a/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java
new file mode 100644
index 00000000..91e850ea
--- /dev/null
+++ b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java
@@ -0,0 +1,291 @@
+/**
+ * ====================================================================
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ====================================================================
+ *
+ */
+package org.archive.httpclient;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.commons.httpclient.HostConfiguration;
+import org.apache.commons.httpclient.HttpConnection;
+import org.apache.commons.httpclient.HttpConnectionManager;
+import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
+
+/**
+ * A simple, but thread-safe HttpClient {@link HttpConnectionManager}.
+ * Based on {@link org.apache.commons.httpclient.SimpleHttpConnectionManager}.
+ *
+ * Java >= 1.4 is recommended.
+ *
+ * @author Christian Kohlschuetter
+ */
+public final class ThreadLocalHttpConnectionManager implements
+    HttpConnectionManager {
+
+    // Declared before the logger: the closer thread starts itself on
+    // construction but sleeps before its first use of the logger.
+    private static final CloserThread closer = new CloserThread();
+    private static final Logger logger = Logger
+        .getLogger(ThreadLocalHttpConnectionManager.class.getName());
+
+    /**
+     * Per-thread holder of the connection handed out to that thread.
+     */
+    private final ThreadLocal<ConnectionInfo> tl =
+        new ThreadLocal<ConnectionInfo>() {
+            protected synchronized ConnectionInfo initialValue() {
+                return new ConnectionInfo();
+            }
+        };
+
+    /**
+     * @return The calling thread's connection info, created on first use.
+     */
+    private ConnectionInfo getConnectionInfo() {
+        return tl.get();
+    }
+
+    private static final class ConnectionInfo {
+        /** The http connection */
+        private HttpConnection conn = null;
+
+        /**
+         * The time the connection was made idle.
+         * <code>Long.MAX_VALUE</code> while the connection is not idle.
+         */
+        private long idleStartTime = Long.MAX_VALUE;
+    }
+
+    public ThreadLocalHttpConnectionManager() {
+    }
+
+    /**
+     * Since the same connection is about to be reused, make sure the
+     * previous request was completely processed, and if not
+     * consume it now.
+     *
+     * NOTE(review): when the connection has no pending last response
+     * stream this returns false, which makes
+     * {@link #getConnectionWithTimeout(HostConfiguration, long)} build a
+     * new connection; confirm that this is intended.
+     *
+     * @param conn The connection
+     * @return true only if a pending last response stream existed and
+     * was closed without an IOException
+     */
+    private static boolean finishLastResponse(final HttpConnection conn) {
+        InputStream lastResponse = conn.getLastResponseInputStream();
+        if(lastResponse != null) {
+            conn.setLastResponseInputStream(null);
+            try {
+                lastResponse.close();
+                return true;
+            } catch (IOException ioe) {
+                // force reconnect.
+                return false;
+            }
+        } else {
+            return false;
+        }
+    }
+
+    /**
+     * Collection of parameters associated with this connection manager.
+     */
+    private HttpConnectionManagerParams params = new HttpConnectionManagerParams();
+
+    /**
+     * @see HttpConnectionManager#getConnection(HostConfiguration)
+     */
+    public HttpConnection getConnection(
+        final HostConfiguration hostConfiguration) {
+        return getConnection(hostConfiguration, 0);
+    }
+
+    /**
+     * Gets the staleCheckingEnabled value to be set on HttpConnections that are created.
+     *
+     * @return true if stale checking will be enabled on HttpConnections
+     *
+     * @see HttpConnection#isStaleCheckingEnabled()
+     *
+     * @deprecated Use {@link HttpConnectionManagerParams#isStaleCheckingEnabled()},
+     * {@link HttpConnectionManager#getParams()}.
+     */
+    public boolean isConnectionStaleCheckingEnabled() {
+        return this.params.isStaleCheckingEnabled();
+    }
+
+    /**
+     * Sets the staleCheckingEnabled value to be set on HttpConnections that are created.
+     *
+     * @param connectionStaleCheckingEnabled true if stale checking will be enabled
+     * on HttpConnections
+     *
+     * @see HttpConnection#setStaleCheckingEnabled(boolean)
+     *
+     * @deprecated Use {@link HttpConnectionManagerParams#setStaleCheckingEnabled(boolean)},
+     * {@link HttpConnectionManager#getParams()}.
+     */
+    public void setConnectionStaleCheckingEnabled(
+        final boolean connectionStaleCheckingEnabled) {
+        this.params.setStaleCheckingEnabled(connectionStaleCheckingEnabled);
+    }
+
+    /**
+     * Returns the calling thread's connection, replacing it with a new
+     * one when there is none yet, when its last response could not be
+     * finished, or when its host or proxy differ from the requested
+     * configuration.  A replaced connection that is still open is handed
+     * to the closer thread.  The <code>timeout</code> argument is not
+     * consulted.
+     *
+     * @see HttpConnectionManager#getConnectionWithTimeout(HostConfiguration, long)
+     *
+     * @since 3.0
+     */
+    public HttpConnection getConnectionWithTimeout(
+        final HostConfiguration hostConfiguration, final long timeout) {
+
+        final ConnectionInfo ci = getConnectionInfo();
+        HttpConnection httpConnection = ci.conn;
+
+        // make sure the host and proxy are correct for this connection
+        // close it and set the values if they are not
+        if(httpConnection == null || !finishLastResponse(httpConnection)
+            || !hostConfiguration.hostEquals(httpConnection)
+            || !hostConfiguration.proxyEquals(httpConnection)) {
+
+            if(httpConnection != null && httpConnection.isOpen()) {
+                closer.closeConnection(httpConnection);
+            }
+
+            httpConnection = new HttpConnection(hostConfiguration);
+            httpConnection.setHttpConnectionManager(this);
+            httpConnection.getParams().setDefaults(this.params);
+            ci.conn = httpConnection;
+
+            httpConnection.setHost(hostConfiguration.getHost());
+            httpConnection.setPort(hostConfiguration.getPort());
+            httpConnection.setProtocol(hostConfiguration.getProtocol());
+            httpConnection.setLocalAddress(hostConfiguration.getLocalAddress());
+
+            httpConnection.setProxyHost(hostConfiguration.getProxyHost());
+            httpConnection.setProxyPort(hostConfiguration.getProxyPort());
+        }
+
+        // remove the connection from the timeout handler
+        ci.idleStartTime = Long.MAX_VALUE;
+
+        return httpConnection;
+    }
+
+    /**
+     * @see HttpConnectionManager#getConnection(HostConfiguration, long)
+     *
+     * @deprecated Use #getConnectionWithTimeout(HostConfiguration, long)
+     */
+    public HttpConnection getConnection(
+        final HostConfiguration hostConfiguration, final long timeout) {
+        return getConnectionWithTimeout(hostConfiguration, timeout);
+    }
+
+    /**
+     * Finishes the last response on the calling thread's connection and
+     * records the moment it became idle.
+     *
+     * @throws IllegalStateException if <code>conn</code> is not the
+     * connection held for the calling thread
+     *
+     * @see HttpConnectionManager#releaseConnection(org.apache.commons.httpclient.HttpConnection)
+     */
+    public void releaseConnection(final HttpConnection conn) {
+        final ConnectionInfo ci = getConnectionInfo();
+        HttpConnection httpConnection = ci.conn;
+
+        if(conn != httpConnection) {
+            throw new IllegalStateException(
+                "Unexpected release of an unknown connection.");
+        }
+
+        finishLastResponse(httpConnection);
+
+        // track the time the connection was made idle
+        ci.idleStartTime = System.currentTimeMillis();
+    }
+
+    /**
+     * Returns {@link HttpConnectionManagerParams parameters} associated
+     * with this connection manager.
+     *
+     * @since 2.1
+     *
+     * @see HttpConnectionManagerParams
+     */
+    public HttpConnectionManagerParams getParams() {
+        return this.params;
+    }
+
+    /**
+     * Assigns {@link HttpConnectionManagerParams parameters} for this
+     * connection manager.
+     *
+     * @throws IllegalArgumentException if <code>p</code> is null
+     *
+     * @since 2.1
+     *
+     * @see HttpConnectionManagerParams
+     */
+    public void setParams(final HttpConnectionManagerParams p) {
+        if(p == null) {
+            throw new IllegalArgumentException("Parameters may not be null");
+        }
+        this.params = p;
+    }
+
+    /**
+     * Closes the calling thread's connection if it has been idle for at
+     * least <code>idleTimeout</code> milliseconds.  Only the calling
+     * thread's connection is examined.
+     *
+     * @since 3.0
+     */
+    public void closeIdleConnections(final long idleTimeout) {
+        long maxIdleTime = System.currentTimeMillis() - idleTimeout;
+
+        final ConnectionInfo ci = getConnectionInfo();
+
+        if(ci.idleStartTime <= maxIdleTime) {
+            ci.conn.close();
+        }
+    }
+
+    /**
+     * Daemon thread that wakes up every {@link #SLEEP_INTERVAL}
+     * milliseconds and closes the connections queued via
+     * {@link #closeConnection(HttpConnection)}.
+     */
+    private static final class CloserThread extends Thread {
+        private List<HttpConnection> connections
+            = new ArrayList<HttpConnection>();
+
+        private static final int SLEEP_INTERVAL = 5000;
+
+        public CloserThread() {
+            super("HttpConnection closer");
+            // Make this a daemon thread so it can't be responsible for the JVM
+            // not shutting down.
+            setDaemon(true);
+            start();
+        }
+
+        /**
+         * Queues a connection to be closed on the next wake-up.
+         */
+        public void closeConnection(final HttpConnection conn) {
+            synchronized (connections) {
+                connections.add(conn);
+            }
+        }
+
+        public void run() {
+            try {
+                while (!Thread.interrupted()) {
+                    Thread.sleep(SLEEP_INTERVAL);
+
+                    // Swap in an empty list under the lock so the actual
+                    // closing happens outside of it.
+                    List<HttpConnection> s;
+                    synchronized (connections) {
+                        s = connections;
+                        connections = new ArrayList<HttpConnection>();
+                    }
+                    logger.log(Level.INFO, "Closing " + s.size()
+                        + " HttpConnections");
+                    for(final Iterator<HttpConnection> it = s.iterator();
+                        it.hasNext();) {
+                        HttpConnection conn = it.next();
+                        conn.close();
+                        conn.setHttpConnectionManager(null);
+                        it.remove();
+                    }
+                }
+            } catch (InterruptedException e) {
+                return;
+            }
+        }
+    }
+}
diff --git a/src/main/java/org/archive/httpclient/package.html b/src/main/java/org/archive/httpclient/package.html
new file mode 100644
index 00000000..87ae77ed
--- /dev/null
+++ b/src/main/java/org/archive/httpclient/package.html
@@ -0,0 +1,24 @@
+
+
+
+org.archive.httpclient package
+
+Provides specializations on
+ apache jakarta
+ commons httpclient.
+
+
+HttpRecorderGetMethod
+
+Class that marks the passed HttpRecorder w/ the boundary between
+ HTTP header and content. Also forces a close on the response on
+ call to releaseConnection.
+
+
+ConfigurableTrustManagerProtocolSocketFactory
+
+A protocol socket factory that allows setting of trust level on
+ construction.
+
+
+
diff --git a/src/main/java/org/archive/io/ArchiveFileConstants.java b/src/main/java/org/archive/io/ArchiveFileConstants.java
new file mode 100644
index 00000000..b1a39194
--- /dev/null
+++ b/src/main/java/org/archive/io/ArchiveFileConstants.java
@@ -0,0 +1,24 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+/**
+ * Deprecated interface that declares no members of its own; it only
+ * extends {@link org.archive.format.ArchiveFileConstants}.
+ *
+ * @deprecated Presumably superseded by
+ * {@link org.archive.format.ArchiveFileConstants}, which this interface
+ * merely extends -- confirm before migrating callers.
+ */
+@Deprecated
+public interface ArchiveFileConstants extends org.archive.format.ArchiveFileConstants {
+}
diff --git a/src/main/java/org/archive/io/ArchiveReader.java b/src/main/java/org/archive/io/ArchiveReader.java
new file mode 100644
index 00000000..66056d33
--- /dev/null
+++ b/src/main/java/org/archive/io/ArchiveReader.java
@@ -0,0 +1,761 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+
+import java.io.BufferedInputStream;
+import java.io.BufferedWriter;
+import java.io.Closeable;
+import java.io.EOFException;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.archive.util.MimetypeUtils;
+import org.archive.util.zip.GZIPMembersInputStream;
+
+import com.google.common.io.CountingInputStream;
+
+
+/**
+ * Reader for an Archive file of Archive {@link ArchiveRecord}s.
+ * @author stack
+ * @version $Date$ $Version$
+ */
+public abstract class ArchiveReader implements ArchiveFileConstants, Iterable, Closeable {
+ /**
+ * Is this Archive file compressed?
+ */
+ private boolean compressed = false;
+
+ /**
+ * Should we digest as we read?
+ */
+ private boolean digest = true;
+
+ /**
+ * Should the parse be strict?
+ */
+ private boolean strict = false;
+
+ /**
+ * Archive file input stream.
+ *
+ * Keep it around so we can close it when done.
+ *
+ * <p>Set in constructor. Should support at least 1 byte mark/reset.
+ * Make it protected so subclasses have access.
+ */
+ protected InputStream in = null;
+
+ /**
+ * Maximum amount of recoverable exceptions in a row.
+ * If more than this amount in a row, we'll let out the exception rather
+ * than go back in for yet another retry.
+ */
+ public static final int MAX_ALLOWED_RECOVERABLES = 10;
+
+
+ /**
+ * The Record currently being read.
+ *
+ * Keep this ongoing reference so we'll close the record even if the caller
+ * doesn't.
+ */
+ private ArchiveRecord currentRecord = null;
+
+ /**
+ * Descriptive string for the Archive file we're going against:
+ * full path, url, etc. -- depends on context in which file was made.
+ */
+ private String identifier = null;
+
+ /**
+ * Archive file version.
+ */
+ private String version = null;
+
+
+ protected ArchiveReader() {
+ super();
+ }
+
+ /**
+ * Convenience method used by subclass constructors.
+ * @param i Identifier for Archive file this reader goes against.
+ */
+ protected void initialize(final String i) {
+ setReaderIdentifier(i);
+ }
+
+ /**
+ * Convenience method for constructors.
+ *
+ * @param f File to read.
+ * @param offset Offset at which to start reading.
+ * @return InputStream to read from.
+ * @throws IOException If failed open or fail to get a memory
+ * mapped byte buffer on file.
+ */
+ protected InputStream getInputStream(final File f, final long offset)
+ throws IOException {
+ FileInputStream fin = new FileInputStream(f);
+ return new BufferedInputStream(fin);
+ }
+
+ public boolean isCompressed() {
+ return this.compressed;
+ }
+
+ /**
+ * Get record at passed offset, skipping forward from the current position.
+ * @param offset Byte index into file at which a record starts.
+ * @return An Archive Record reference.
+ * @throws IOException If record cleanup, the skip or record creation fails.
+ * @throws UnsupportedOperationException If offset is behind current position.
+ */
+ public ArchiveRecord get(long offset) throws IOException {
+     cleanupCurrentRecord();
+     long posn = positionForRecord(in);
+     if (offset < posn) {
+         throw new UnsupportedOperationException("no reverse seeking: at "+posn+" requested "+offset);
+     }
+     // One skip() call may advance less than asked; repeat until caught up or stalled.
+     for (long left = offset - posn, n; left > 0 && (n = in.skip(left)) > 0; left -= n) { }
+     return createArchiveRecord(this.in, offset);
+ }
+
+ /**
+ * @return Return Archive Record created against current offset.
+ * @throws IOException
+ */
+ public ArchiveRecord get() throws IOException {
+ return createArchiveRecord(this.in, positionForRecord(in));
+ }
+
+ public void close() throws IOException {
+ if (this.in != null) {
+ this.in.close();
+ this.in = null;
+ }
+ }
+
+ /**
+ * Cleanout the current record if there is one.
+ * @throws IOException
+ */
+ protected void cleanupCurrentRecord() throws IOException {
+ if (this.currentRecord != null) {
+ this.currentRecord.close();
+ gotoEOR(this.currentRecord);
+ this.currentRecord = null;
+ }
+ }
+
+ /**
+ * Return an Archive Record homed on offset into
+ * is.
+ * @param is Stream to read Record from.
+ * @param offset Offset to find Record at.
+ * @return ArchiveRecord instance.
+ * @throws IOException
+ */
+ protected abstract ArchiveRecord createArchiveRecord(InputStream is,
+ long offset)
+ throws IOException;
+
+ /**
+ * Skip over any trailing new lines at end of the record so we're lined up
+ * ready to read the next.
+ * @param record
+ * @throws IOException
+ */
+ protected abstract void gotoEOR(ArchiveRecord record) throws IOException;
+
+ public abstract String getFileExtension();
+ public abstract String getDotFileExtension();
+
+ /**
+ * @return Version of this Archive file.
+ */
+ public String getVersion() {
+ return this.version;
+ }
+
+ /**
+ * Validate the Archive file.
+ *
+ * This method iterates over the file throwing exception if it fails
+ * to successfully parse any record.
+ *
+ * <p>Assumes the stream is at the start of the file.
+ * @return List of all read Archive Headers.
+ *
+ * @throws IOException
+ */
+ public List validate() throws IOException {
+ return validate(-1);
+ }
+
+ /**
+ * Validate the Archive file.
+ *
+ * This method iterates over the file throwing exception if it fails
+ * to successfully parse.
+ *
+ * <p>We start validation from wherever we are in the stream.
+ *
+ * @param numRecords Number of records expected. Pass -1 if number is
+ * unknown.
+ *
+ * @return List of all read metadatas. As we validate records, we add
+ * a reference to the read metadata.
+ *
+ * @throws IOException
+ */
+ public List<ArchiveRecordHeader> validate(int numRecords)
+ throws IOException {
+     List<ArchiveRecordHeader> hdrList = new ArrayList<ArchiveRecordHeader>();
+     int recordCount = 0;
+     // Validation is always a strict parse; note the flag stays set afterwards.
+     setStrict(true);
+     for (Iterator<ArchiveRecord> i = iterator(); i.hasNext();) {
+         recordCount++;
+         ArchiveRecord r = i.next();
+         // Constant-first equals so a header lacking a mimetype cannot NPE.
+         if (r.getHeader().getLength() <= 0
+                 && MimetypeUtils.NO_TYPE_MIMETYPE.equals(
+                     r.getHeader().getMimetype())) {
+             throw new IOException("record content is empty.");
+         }
+         r.close();
+         hdrList.add(r.getHeader());
+     }
+
+     if (numRecords != -1 && recordCount != numRecords) {
+         throw new IOException("Count of records, "
+             + Integer.toString(recordCount)
+             + " is not equal to expected "
+             + Integer.toString(numRecords));
+     }
+
+     return hdrList;
+ }
+
+ /**
+ * Test Archive file is valid.
+ * Assumes the stream is at the start of the file. Be aware that this
+ * method makes a pass over the whole file.
+ * @return True if file can be successfully parsed.
+ */
+ public boolean isValid() {
+     try {
+         validate();
+     } catch(Exception e) {
+         // Any exception while parsing, checked or unchecked, means the
+         // file is not valid.
+         return false;
+     }
+
+     // validate() walked the whole file without throwing.
+     return true;
+ }
+
+ /**
+ * @return Returns the strict.
+ */
+ public boolean isStrict() {
+ return this.strict;
+ }
+
+ /**
+ * @param s The strict to set.
+ */
+ public void setStrict(boolean s) {
+ this.strict = s;
+ }
+
+ /**
+ * @param d True if we're to digest.
+ */
+ public void setDigest(boolean d) {
+ this.digest = d;
+ }
+
+ /**
+ * @return True if we're digesting as we read.
+ */
+ public boolean isDigest() {
+ return this.digest;
+ }
+
+ protected Logger getLogger() {
+ return Logger.getLogger(this.getClass().getName());
+ }
+
+ /**
+ * Returns an ArchiveRecord iterator.
+ * Of note, on IOException, especially if ZipException reading compressed
+ * ARCs, rather than fail the iteration, try moving to the next record.
+ * If {@link ArchiveReader#strict} is not set, this will usually succeed.
+ * @return An iterator over ARC records.
+ */
+ public Iterator iterator() {
+ // Eat up any record outstanding.
+ try {
+ cleanupCurrentRecord();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ return new ArchiveRecordIterator();
+ }
+
+ protected void setCompressed(boolean compressed) {
+ this.compressed = compressed;
+ }
+
+ /**
+ * @return The current ARC record or null if none.
+ * After construction has the arcfile header record.
+ * @see #get()
+ */
+ protected ArchiveRecord getCurrentRecord() {
+ return this.currentRecord;
+ }
+
+ protected ArchiveRecord currentRecord(final ArchiveRecord r) {
+ this.currentRecord = r;
+ return r;
+ }
+
+ protected InputStream getIn() {
+ return in;
+ }
+
+ protected void setIn(InputStream in) {
+ this.in = in;
+ }
+
+ protected void setVersion(String version) {
+ this.version = version;
+ }
+
+ public String getReaderIdentifier() {
+ return this.identifier;
+ }
+
+ protected void setReaderIdentifier(final String i) {
+ this.identifier = i;
+ }
+
+ /**
+ * Log on stderr: prints the level, a space and the message as one line.
+ * Logging should go via the logging system. This method
+ * bypasses the logging system going direct to stderr.
+ * Should not generally be used. It's used for rare messages
+ * that come of cmdline usage of ARCReader ERRORs and WARNINGs.
+ * Override if using ARCReader in a context where no stderr or
+ * where you'd like to redirect stderr to other than System.err.
+ * @param level Level to log message at.
+ * @param message Message to log.
+ */
+ public void logStdErr(Level level, String message) {
+ System.err.println(level.toString() + " " + message);
+ }
+
+// /**
+// * Add buffering to RandomAccessInputStream.
+// */
+// protected class RandomAccessBufferedInputStream
+// extends BufferedInputStream implements RepositionableStream {
+//
+// public RandomAccessBufferedInputStream(RandomAccessInputStream is)
+// throws IOException {
+// super(is);
+// }
+//
+// public RandomAccessBufferedInputStream(RandomAccessInputStream is, int size)
+// throws IOException {
+// super(is, size);
+// }
+//
+// public long position() throws IOException {
+// // Current position is the underlying files position
+// // minus the amount thats in the buffer yet to be read.
+// return ((RandomAccessInputStream)this.in).position() -
+// (this.count - this.pos);
+// }
+//
+// public void position(long position) throws IOException {
+// // Force refill of buffer whenever there's been a seek.
+// this.pos = 0;
+// this.count = 0;
+// ((RandomAccessInputStream)this.in).position(position);
+// }
+//
+// public int available() throws IOException {
+// // Avoid overflow on large datastreams
+// long amount = (long)in.available() + (long)(count - pos);
+// return (amount >= Integer.MAX_VALUE)? Integer.MAX_VALUE: (int)amount;
+// }
+// }
+
+ /**
+ * Inner ArchiveRecord Iterator class.
+ * Throws RuntimeExceptions in {@link #hasNext()} and {@link #next()} if
+ * trouble pulling record from underlying stream.
+ * @author stack
+ */
+ protected class ArchiveRecordIterator implements Iterator {
+ private final Logger logger =
+ Logger.getLogger(this.getClass().getName());
+ /**
+ * @return True if we have more records to read.
+ * @exception RuntimeException Can throw an IOException wrapped in a
+ * RuntimeException if a problem reading underlying stream (Corrupted
+ * gzip, etc.).
+ */
+ public boolean hasNext() {
+ // Call close on any extant record. This will scoot us past
+ // any content not yet read.
+ try {
+ cleanupCurrentRecord();
+ } catch (IOException e) {
+ if (isStrict()) {
+ throw new RuntimeException(e);
+ }
+ if (e instanceof EOFException) {
+ logger.warning("Premature EOF cleaning up " +
+ currentRecord.getHeader().toString() + ": " +
+ e.getMessage());
+ return false;
+ }
+ // If not strict, try going again. We might be able to skip
+ // over the bad record.
+ logger.log(Level.WARNING,"Trying skip of failed record cleanup of " +
+ currentRecord.getHeader().toString() + ": " +
+ e.getMessage(), e);
+ }
+ return innerHasNext();
+ }
+
+ protected boolean innerHasNext(){
+ try {
+ getIn().mark(1);
+ int c = getIn().read();
+ getIn().reset();
+ return c > -1;
+ } catch (IOException e) {
+ logger.log(Level.WARNING,"problem probing for more content",e);
+ return false;
+ }
+ }
+
+ /**
+ * Tries to move to next record if we get
+ * {@link RecoverableIOException}. If not strict
+ * tries to move to next record if we get an
+ * {@link IOException}.
+ * @return Next object.
+ * @exception RuntimeException Throws a runtime exception,
+ * usually a wrapping of an IOException, if trouble getting
+ * a record (Throws exception rather than return null).
+ */
+ public ArchiveRecord next() {
+ long offset = -1;
+ try {
+ offset = positionForRecord(getIn());
+ return exceptionNext();
+ } catch (IOException e) {
+ if (!isStrict()) {
+ // Retry though an IOE. Maybe we will succeed reading
+ // subsequent record.
+ try {
+ if (hasNext()) {
+ getLogger().warning("Bad Record. Trying skip " +
+ "(Record start " + offset + "): " +
+ e.getMessage());
+ return exceptionNext();
+ }
+ // Else we are at last record. Iterator#next is
+ // expecting value. We do not have one. Throw exception.
+ throw new RuntimeException("Retried but no next " +
+ "record (Record start " + offset + ")", e);
+ } catch (IOException e1) {
+ throw new RuntimeException("After retry (Offset " +
+ offset + ")", e1);
+ }
+ }
+ throw new RuntimeException("(Record start " + offset + ")", e);
+ }
+ }
+
+ /**
+ * A next that throws exceptions and has handling of
+ * recoverable exceptions moving us to next record. Can call
+ * hasNext which itself may throw exceptions.
+ * @return Next record.
+ * @throws IOException
+ * @throws RuntimeException Thrown when we've reached maximum
+ * retries.
+ */
+ protected ArchiveRecord exceptionNext()
+ throws IOException, RuntimeException {
+ ArchiveRecord result = null;
+ IOException ioe = null;
+ for (int i = MAX_ALLOWED_RECOVERABLES; i > 0 &&
+ result == null; i--) {
+ ioe = null;
+ try {
+ result = innerNext();
+ } catch (RecoverableIOException e) {
+ ioe = e;
+ getLogger().warning(e.getMessage());
+ if (hasNext()) {
+ continue;
+ }
+ // No records left. Throw exception rather than
+ // return null. The caller is expecting to get
+ // back a record since they've just called
+ // hasNext.
+ break;
+ }
+ }
+ if (ioe != null) {
+ // Then we did MAX_ALLOWED_RECOVERABLES retries. Throw
+ // the recoverable ioe wrapped in a RuntimeException so
+ // it goes out pass checks for IOE.
+ throw new RuntimeException("Retried " +
+ MAX_ALLOWED_RECOVERABLES + " times in a row", ioe);
+ }
+ return result;
+ }
+
+ protected ArchiveRecord innerNext() throws IOException {
+ return get(positionForRecord(getIn()));
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
+
+ protected static long positionForRecord(InputStream in) {
+ return (in instanceof GZIPMembersInputStream)
+ ? ((GZIPMembersInputStream)in).getCurrentMemberStart()
+ : ((CountingInputStream)in).getCount();
+ }
+
+ protected static String stripExtension(final String name,
+ final String ext) {
+ return (!name.endsWith(ext))? name:
+ name.substring(0, name.length() - ext.length());
+ }
+
+ /**
+ * @return short name of Archive file.
+ */
+ public String getFileName() {
+ return (new File(getReaderIdentifier())).getName();
+ }
+
+ /**
+ * @return short name of Archive file.
+ */
+ public String getStrippedFileName() {
+ return getStrippedFileName(getFileName(),
+ getDotFileExtension());
+ }
+
+ /**
+ * @param name Name of ARCFile.
+ * @param dotFileExtension '.arc' or '.warc', etc.
+ * @return short name of Archive file.
+ */
+ public static String getStrippedFileName(String name,
+ final String dotFileExtension) {
+ name = stripExtension(name,
+ ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION);
+ return stripExtension(name, dotFileExtension);
+ }
+
+ /**
+ * @param value Value to test.
+ * @return True if value is 'true', else false.
+ */
+ protected static boolean getTrueOrFalse(final String value) {
+ if (value == null || value.length() <= 0) {
+ return false;
+ }
+ return Boolean.TRUE.toString().equals(value.toLowerCase());
+ }
+
+ /**
+ * @param format Format to use outputting.
+ * @throws IOException
+ * @throws java.text.ParseException
+ * @return True if handled.
+ */
+ protected boolean output(final String format)
+ throws IOException, java.text.ParseException {
+ boolean result = true;
+ // long start = System.currentTimeMillis();
+
+ // Write output as pseudo-CDX file. See
+ // http://www.archive.org/web/researcher/cdx_legend.php
+ // and http://www.archive.org/web/researcher/example_cdx.php.
+ // Hash is hard-coded straight SHA-1 hash of content.
+ if (format.equals(DUMP)) {
+ // No point digesting dumping.
+ setDigest(false);
+ dump(false);
+ } else if (format.equals(GZIP_DUMP)) {
+ // No point digesting dumping.
+ setDigest(false);
+ dump(true);
+ } else if (format.equals(CDX)) {
+ cdxOutput(false);
+ } else if (format.equals(CDX_FILE)) {
+ cdxOutput(true);
+ } else {
+ result = false;
+ }
+ return result;
+ }
+
+ protected void cdxOutput(boolean toFile)
+ throws IOException {
+     BufferedWriter cdxWriter = null;
+     if (toFile) {
+         String cdxFilename = stripExtension(getReaderIdentifier(),
+             DOT_COMPRESSED_FILE_EXTENSION);
+         cdxFilename = stripExtension(cdxFilename, getDotFileExtension());
+         cdxFilename += ('.' + CDX);
+         cdxWriter = new BufferedWriter(new FileWriter(cdxFilename));
+     }
+
+     // The header write sits inside the try too, so a failure there still closes the file.
+     try {
+         String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v")
+             + " n g";
+         if (toFile) {
+             cdxWriter.write(header);
+             cdxWriter.newLine();
+         } else {
+             System.out.println(header);
+         }
+         String strippedFileName = getStrippedFileName();
+         for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
+             ArchiveRecord r = ii.next();
+             if (toFile) {
+                 cdxWriter.write(r.outputCdx(strippedFileName));
+                 cdxWriter.newLine();
+             } else {
+                 System.out.println(r.outputCdx(strippedFileName));
+             }
+         }
+     } finally {
+         if (cdxWriter != null) {
+             cdxWriter.close();
+         }
+     }
+ }
+
+ /**
+ * Output passed record using passed format specifier.
+ * @param format What format to use outputting.
+ * @throws IOException
+ * @return True if handled.
+ */
+ public boolean outputRecord(final String format)
+ throws IOException {
+ boolean result = true;
+ if (format.equals(CDX)) {
+ System.out.println(get().outputCdx(getStrippedFileName()));
+ } else if(format.equals(ArchiveFileConstants.DUMP)) {
+ // No point digesting if dumping content.
+ setDigest(false);
+ get().dump();
+ } else {
+ result = false;
+ }
+ return result;
+ }
+
+ /**
+ * Dump this file on STDOUT
+ * @param compress True if dumped output is compressed.
+ * @throws IOException
+ * @throws java.text.ParseException
+ */
+ public abstract void dump(final boolean compress)
+ throws IOException, java.text.ParseException;
+
+ /**
+ * @return an ArchiveReader that will delete a local file on close. Used
+ * when we bring Archive files local and need to clean up afterward.
+ */
+ public abstract ArchiveReader getDeleteFileOnCloseReader(final File f);
+
+ /**
+ * Output passed record using passed format specifier.
+ * @param r ARCReader instance to output.
+ * @param format What format to use outputting.
+ * @throws IOException
+ */
+ protected static void outputRecord(final ArchiveReader r,
+ final String format)
+ throws IOException {
+ if (!r.outputRecord(format)) {
+ throw new IOException("Unsupported format" +
+ " (or unsupported on a single record): " + format);
+ }
+ }
+
+ /**
+ * @return Base Options object filled out with the help, offset, digest,
+ * strict and format options.
+ */
+ protected static Options getOptions() {
+     Options options = new Options();
+     options.addOption(new Option("h","help", false,
+         "Prints this message and exits."));
+     options.addOption(new Option("o","offset", true,
+         "Outputs record at this offset into file."));
+     options.addOption(new Option("d","digest", true,
+         "Pass true|false. Expensive. Default: true (SHA-1)."));
+     options.addOption(new Option("s","strict", false,
+         "Strict mode. Fails parse if incorrectly formatted file."));
+     options.addOption(new Option("f","format", true,
+         "Output options: 'cdx', 'cdxfile', 'dump', 'gzipdump', " +
+         "or 'nohead'. Default: 'cdx'."));
+     return options;
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/ArchiveReaderFactory.java b/src/main/java/org/archive/io/ArchiveReaderFactory.java
new file mode 100644
index 00000000..17f14d3a
--- /dev/null
+++ b/src/main/java/org/archive/io/ArchiveReaderFactory.java
@@ -0,0 +1,301 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLConnection;
+
+import org.archive.io.arc.ARCReaderFactory;
+import org.archive.io.warc.WARCReaderFactory;
+import org.archive.net.md5.Md5URLConnection;
+import org.archive.net.rsync.RsyncURLConnection;
+import org.archive.url.UsableURI;
+import org.archive.util.FileUtils;
+
+
+/**
+ * Factory that returns an Archive file Reader.
+ * Returns Readers for ARCs or WARCs.
+ * @author stack
+ * @version $Date$ $Revision$
+ */
+public class ArchiveReaderFactory implements ArchiveFileConstants {
+ // Static block to enable S3 URLs
+ static {
+ if (System.getProperty("java.protocol.handler.pkgs") != null) {
+ System.setProperty("java.protocol.handler.pkgs",
+ System.getProperty("java.protocol.handler.pkgs")
+ + "|" + "org.archive.net");
+ } else {
+ System.setProperty("java.protocol.handler.pkgs", "org.archive.net");
+ }
+ }
+
+ private static final ArchiveReaderFactory factory =
+ new ArchiveReaderFactory();
+
+ /**
+ * Shutdown any public access to default constructor.
+ */
+ protected ArchiveReaderFactory() {
+ super();
+ }
+
+ /**
+ * Get an Archive file Reader on passed path or url.
+ * Does primitive heuristic figuring if path or URL.
+ * @param arcFileOrUrl File path or URL pointing at an Archive file.
+ * @return An Archive file Reader.
+ * @throws MalformedURLException Presumably when the argument is taken
+ * for a URL but cannot be parsed as one.
+ * @throws IOException
+ */
+ public static ArchiveReader get(final String arcFileOrUrl)
+ throws MalformedURLException, IOException {
+ return ArchiveReaderFactory.factory.getArchiveReader(arcFileOrUrl);
+ }
+
+ protected ArchiveReader getArchiveReader(final String arcFileOrUrl)
+ throws MalformedURLException, IOException {
+ return getArchiveReader(arcFileOrUrl, 0);
+ }
+
+ protected ArchiveReader getArchiveReader(final String arcFileOrUrl,
+ final long offset)
+ throws MalformedURLException, IOException {
+ return UsableURI.hasScheme(arcFileOrUrl) && arcFileOrUrl.indexOf(":")>1?
+ get(new URL(arcFileOrUrl), offset):
+ get(new File(arcFileOrUrl), offset);
+ }
+
+ /**
+ * @param f An Archive file to read.
+ * @return An ArchiveReader
+ * @throws IOException
+ */
+ public static ArchiveReader get(final File f) throws IOException {
+ return ArchiveReaderFactory.factory.getArchiveReader(f);
+ }
+
+ protected ArchiveReader getArchiveReader(final File f)
+ throws IOException {
+ return getArchiveReader(f, 0);
+ }
+
+ /**
+ * @param f An Archive file to read.
+ * @param offset Have returned Reader set to start reading at this offset.
+ * @return An ArchiveReader
+ * @throws IOException
+ */
+ public static ArchiveReader get(final File f, final long offset)
+ throws IOException {
+ return ArchiveReaderFactory.factory.getArchiveReader(f, offset);
+ }
+
+ protected ArchiveReader getArchiveReader(final File f,
+ final long offset)
+ throws IOException {
+ if (ARCReaderFactory.isARCSuffix(f.getName())) {
+ return ARCReaderFactory.get(f, true, offset);
+ } else if (WARCReaderFactory.isWARCSuffix(f.getName())) {
+ return WARCReaderFactory.get(f, offset);
+ }
+ throw new IOException("Unknown file extension (Not ARC nor WARC): "
+ + f.getName());
+ }
+
+ /**
+ * Wrap a Reader around passed Stream.
+ * @param s Identifying String for this Stream used in error messages.
+ * Must be a string that ends with the name of the file we're to put
+ * an ArchiveReader on. This code looks at file endings to figure
+ * whether to return an ARC or WARC reader.
+ * @param is Stream. Stream will be wrapped with implementation of
+ * RepositionableStream unless already supported.
+ * @param atFirstRecord Are we at first Record?
+ * @return ArchiveReader.
+ * @throws IOException
+ */
+ public static ArchiveReader get(final String s, final InputStream is,
+ final boolean atFirstRecord)
+ throws IOException {
+ return ArchiveReaderFactory.factory.getArchiveReader(s, is,
+ atFirstRecord);
+ }
+
+ protected ArchiveReader getArchiveReader(final String id,
+ final InputStream is, final boolean atFirstRecord)
+ throws IOException {
+ final InputStream stream = is;
+ if (ARCReaderFactory.isARCSuffix(id)) {
+ return ARCReaderFactory.get(id, stream, atFirstRecord);
+ } else if (WARCReaderFactory.isWARCSuffix(id)) {
+ return WARCReaderFactory.get(id, stream, atFirstRecord);
+ }
+ throw new IOException("Unknown extension (Not ARC nor WARC): " + id);
+ }
+
+ /**
+ * Get an Archive Reader aligned at offset.
+ * This version of get will not bring the file local but will try to
+ * stream across the net making an HTTP 1.1 Range request on remote
+ * http server (RFC2616 Section 14.35).
+ * @param u HTTP URL for an Archive file.
+ * @param offset Offset into file at which to start fetching.
+ * @return An ArchiveReader aligned at offset.
+ * @throws IOException
+ */
+ public static ArchiveReader get(final URL u, final long offset)
+ throws IOException {
+ return ArchiveReaderFactory.factory.getArchiveReader(u, offset);
+ }
+
+ protected ArchiveReader getArchiveReader(final URL f, final long offset)
+ throws IOException {
+ // Get URL connection.
+ URLConnection connection = f.openConnection();
+ if (connection instanceof HttpURLConnection) {
+ addUserAgent((HttpURLConnection)connection);
+ }
+ if (offset != 0) {
+ // Use a Range request (Assumes HTTP 1.1 on other end). The range is
+ // open-ended ("bytes=offset-"): everything from offset to the end
+ // of the resource is requested.
+ connection.addRequestProperty("Range", "bytes=" + offset + "-");
+ // TODO: should actually verify that server respected 'Range' request
+ // (spec allows them to ignore; 206 response or Content-Range header
+ // should be present if Range satisfied; multipart/byteranges could be
+ // a problem).
+ }
+
+ return getArchiveReader(f.toString(), connection.getInputStream(), (offset == 0));
+ }
+
+ /**
+ * Get an ARCReader.
+ * Pulls the ARC local into wherever the System Property
+ * java.io.tmpdir points. It then hands back an ARCReader that
+ * points at this local copy. A close on this ARCReader instance will
+ * remove the local copy.
+ * @param u An URL that points at an ARC.
+ * @return An ARCReader.
+ * @throws IOException
+ */
+ public static ArchiveReader get(final URL u)
+ throws IOException {
+ return ArchiveReaderFactory.factory.getArchiveReader(u);
+ }
+
+ protected ArchiveReader getArchiveReader(final URL u)
+ throws IOException {
+ // If url represents a local file then return file it points to.
+ if (u.getPath() != null) {
+ // TODO: Add scheme check and host check.
+ File f = new File(u.getPath());
+ if (f.exists()) {
+ return get(f, 0);
+ }
+ }
+
+ String scheme = u.getProtocol();
+ if (scheme.startsWith("http") || scheme.equals("s3")) {
+ // Try streaming if http or s3 URLs rather than copying local
+ // and then reading (Passing an offset will get us an Reader
+ // that wraps a Stream).
+ return get(u, 0);
+ }
+
+ return makeARCLocal(u.openConnection());
+ }
+
+ /**
+ * Bring the resource behind connection local and open a reader on the copy.
+ * @return Reader that removes the local copy when it is closed.
+ */
+ protected ArchiveReader makeARCLocal(final URLConnection connection)
+ throws IOException {
+     File localFile = null;
+     if (connection instanceof HttpURLConnection) {
+         // If http url connection, bring down the resource local.
+         String p = connection.getURL().getPath();
+         int index = p.lastIndexOf('/');
+         if (index >= 0) {
+             // Name the temp file after the remote file: the remote name is
+             // the suffix so the ARC/WARC extension get(File, long) keys off
+             // is kept. File.createTempFile throws IllegalArgumentException
+             // for a prefix shorter than three characters, so "" cannot be
+             // used. The returned file is new and uniquely named; there is
+             // no stale same-name file to clean up.
+             localFile = File.createTempFile(
+                 ArchiveReader.class.getName(), p.substring(index + 1));
+         } else {
+             localFile = File.createTempFile(ArchiveReader.class.getName(),
+                 ".tmp");
+         }
+         addUserAgent((HttpURLConnection)connection);
+         try {
+             // Connect inside the try so a failed connect also removes the temp file.
+             connection.connect();
+             FileUtils.readFullyToFile(connection.getInputStream(), localFile);
+         } catch (IOException ioe) {
+             localFile.delete();
+             throw ioe;
+         }
+     } else if (connection instanceof RsyncURLConnection) {
+         // Connecting creates the local file; see the rsync handler.
+         connection.connect();
+         localFile = ((RsyncURLConnection)connection).getFile();
+     } else if (connection instanceof Md5URLConnection) {
+         // Connecting creates the local file; see the md5 handler.
+         connection.connect();
+         localFile = ((Md5URLConnection)connection).getFile();
+     } else {
+         throw new UnsupportedOperationException("No support for " + connection);
+     }
+
+     ArchiveReader reader = null;
+     try {
+         reader = get(localFile, 0);
+     } catch (IOException e) {
+         localFile.delete();
+         throw e;
+     }
+     // Return a delegate that does cleanup of downloaded file on close.
+     return reader.getDeleteFileOnCloseReader(localFile);
+ }
+
+ protected void addUserAgent(final HttpURLConnection connection) {
+ connection.addRequestProperty("User-Agent", this.getClass().getName());
+ }
+
+ /**
+ * @param f File to test.
+ * @return True if f is compressed.
+ * @throws IOException
+ */
+ protected boolean isCompressed(final File f) throws IOException {
+ return f.getName().toLowerCase().
+ endsWith(DOT_COMPRESSED_FILE_EXTENSION);
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/ArchiveRecord.java b/src/main/java/org/archive/io/ArchiveRecord.java
new file mode 100644
index 00000000..63bfe628
--- /dev/null
+++ b/src/main/java/org/archive/io/ArchiveRecord.java
@@ -0,0 +1,409 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.logging.Level;
+
+import org.archive.util.Base32;
+
+/**
+ * Archive file Record.
+ * @author stack
+ * @version $Date$ $Version$
+ */
+public abstract class ArchiveRecord extends InputStream {
+
+ /**
+ * Minimal http response or request header length.
+ *
+ * I've seen in arcs content length of 1 with no header.
+ */
+ protected static final long MIN_HTTP_HEADER_LENGTH =
+ Math.min("HTTP/1.1 200 OK\r\n".length(), "GET / HTTP/1.0\n\r".length());
+
+ protected ArchiveRecordHeader header = null;
+
+ /**
+ * Stream to read this record from.
+ *
+ * Stream can only be read sequentially. Will only return this records'
+ * content returning a -1 if you try to read beyond the end of the current
+ * record.
+ *
+ *
Streams can be markable or not. If they are, we'll be able to roll
+ * back when we've read too far. If not markable, assumption is that
+ * the underlying stream is managing our not reading too much (This pertains
+ * to the skipping over the end of the ARCRecord. See {@link #skip()}.
+ */
+ protected InputStream in = null;
+
+ /**
+ * Position w/i the Record content, within in.
+ * This position is relative within this Record. Its not same as the
+ * Archive file position.
+ */
+ protected long position = 0;
+
+ /**
+ * Set flag when we've reached the end-of-record.
+ */
+ protected boolean eor = false;
+
+ /**
+ * Compute digest on what we read and add to metadata when done.
+ *
+ * Currently hardcoded as sha-1. TODO: Remove when archive records
+ * digest or else, add a facility that allows the arc reader to
+ * compare the calculated digest to that which is recorded in
+ * the arc.
+ *
+ *
Protected instead of private so subclasses can update and complete
+ * the digest.
+ */
+ protected MessageDigest digest = null;
+ private String digestStr = null;
+
+ protected boolean strict = false;
+
+
+ /**
+ * Constructor.
+ *
+ * @param in Stream cue'd up to be at the start of the record this instance
+ * is to represent.
+ * @throws IOException
+ */
+ public ArchiveRecord(InputStream in)
+ throws IOException {
+ this(in, null, 0, true, false);
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param in Stream cue'd up to be at the start of the record this instance
+ * is to represent.
+ * @param header Header data.
+ * @throws IOException
+ */
+ public ArchiveRecord(InputStream in, ArchiveRecordHeader header)
+ throws IOException {
+ this(in, header, 0, true, false);
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param in Stream cue'd up to be at the start of the record this instance
+ * is to represent.
+ * @param header Header data.
+ * @param bodyOffset Offset into the body. Usually 0.
+ * @param digest True if we're to calculate digest for this record. Not
+ * digesting saves about ~15% of cpu during an ARC parse.
+ * @param strict Be strict parsing (Parsing stops if ARC inproperly
+ * formatted).
+ * @throws IOException
+ */
+ public ArchiveRecord(InputStream in, ArchiveRecordHeader header,
+ int bodyOffset, boolean digest, boolean strict)
+ throws IOException {
+ this.in = in;
+ this.header = header;
+ this.position = bodyOffset;
+ if (digest) {
+ try {
+ this.digest = MessageDigest.getInstance("SHA1");
+ } catch (NoSuchAlgorithmException e) {
+ // Convert to IOE because thats more amenable to callers
+ // -- they are dealing with it anyways.
+ throw new IOException(e.getMessage());
+ }
+ }
+ this.strict = strict;
+ }
+
+ public boolean markSupported() {
+ return false;
+ }
+
+ /**
+ * @return Header data for this record.
+ */
+ public ArchiveRecordHeader getHeader() {
+ return this.header;
+ }
+
+ protected void setHeader(ArchiveRecordHeader header) {
+ this.header = header;
+ }
+
+ /**
+ * Calling close on a record skips us past this record to the next record
+ * in the stream.
+ *
+ * It does not actually close the stream. The underlying steam is probably
+ * being used by the next arc record.
+ *
+ * @throws IOException
+ */
+ public void close() throws IOException {
+ if (this.in != null) {
+ skip();
+ this.in = null;
+ if (this.digest != null) {
+ this.digestStr = Base32.encode(this.digest.digest());
+ }
+ }
+ }
+
+ /**
+ * @return Next character in this Record content else -1 if at EOR.
+ * @throws IOException
+ */
+ public int read() throws IOException {
+ int c = -1;
+ if (available() > 0) {
+ c = this.in.read();
+ if (c == -1) {
+ throw new IOException("Premature EOF before end-of-record.");
+ }
+ if (this.digest != null) {
+ this.digest.update((byte) c);
+ }
+ incrementPosition();
+ }
+ return c;
+ }
+
+ public int read(byte[] b, int offset, int length) throws IOException {
+ int read = Math.min(length, available());
+ if (read == -1 || read == 0) {
+ read = -1;
+ } else {
+ read = this.in.read(b, offset, read);
+ if (read == -1) {
+ String msg = "Premature EOF before end-of-record: "
+ + getHeader().getHeaderFields();
+ if (isStrict()) {
+ throw new IOException(msg);
+ }
+ setEor(true);
+ System.err.println(Level.WARNING.toString() + " " + msg);
+ }
+ if (this.digest != null && read >= 0) {
+ this.digest.update(b, offset, read);
+ }
+ incrementPosition(read);
+ }
+ return read;
+ }
+
+ /**
+ * This available is not the stream's available. Its an available based on
+ * what the stated Archive record length is minus what we've read to date.
+ *
+ * @return True if bytes remaining in record content.
+ */
+ public int available() {
+ long amount = getHeader().getLength() - getPosition();
+ return (amount > Integer.MAX_VALUE? Integer.MAX_VALUE: (int)amount);
+ }
+
+ /**
+ * Skip over this records content.
+ *
+ * @throws IOException
+ */
+ protected void skip() throws IOException {
+ if (this.eor) {
+ return;
+ }
+
+ // Read to the end of the body of the record. Exhaust the stream.
+ // Can't skip direct to end because underlying stream may be compressed
+ // and we're calculating the digest for the record.
+ int r = available();
+ while (r > 0 && !this.eor) {
+ skip(r);
+ r = available();
+ }
+ }
+
+ public long skip(long n) throws IOException {
+ final int SKIP_BUFFERSIZE = 1024 * 4;
+ byte[] b = new byte[SKIP_BUFFERSIZE];
+ long total = 0;
+ for (int read = 0; (total < n) && (read != -1);) {
+ read = Math.min(SKIP_BUFFERSIZE, (int) (n - total));
+ // TODO: Interesting is that reading from compressed stream, we only
+ // read about 500 characters at a time though we ask for 4k.
+ // Look at this sometime.
+ read = read(b, 0, read);
+ if (read <= 0) {
+ read = -1;
+ } else {
+ total += read;
+ }
+ }
+ return total;
+ }
+
+ /**
+ * @return Returns the strict.
+ */
+ public boolean isStrict() {
+ return this.strict;
+ }
+
+ /**
+ * @param strict The strict to set.
+ */
+ public void setStrict(boolean strict) {
+ this.strict = strict;
+ }
+
+ protected InputStream getIn() {
+ return this.in;
+ }
+
+ public String getDigestStr() {
+ return this.digestStr;
+ }
+
+ protected void incrementPosition() {
+ this.position++;
+ }
+
+ protected void incrementPosition(final long incr) {
+ this.position += incr;
+ }
+
+ public long getPosition() {
+ return this.position;
+ }
+
+ protected boolean isEor() {
+ return eor;
+ }
+
+ protected void setEor(boolean eor) {
+ this.eor = eor;
+ }
+
+ protected String getStatusCode4Cdx(final ArchiveRecordHeader h) {
+ return "-";
+ }
+
+ protected String getIp4Cdx(final ArchiveRecordHeader h) {
+ return "-";
+ }
+
+ protected String getDigest4Cdx(final ArchiveRecordHeader h) {
+ return getDigestStr() == null? "-": getDigestStr();
+ }
+
+ protected String getMimetype4Cdx(final ArchiveRecordHeader h) {
+ return h.getMimetype();
+ }
+
+ protected String outputCdx(final String strippedFileName)
+ throws IOException {
+ // Read the whole record so we get out a hash. Should be safe calling
+ // close on already closed Record.
+ close();
+ ArchiveRecordHeader h = getHeader();
+ StringBuilder buffer =
+ new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE);
+ buffer.append(h.getDate());
+ buffer.append(ArchiveFileConstants.SINGLE_SPACE);
+ buffer.append(getIp4Cdx(h));
+ buffer.append(ArchiveFileConstants.SINGLE_SPACE);
+ buffer.append(h.getUrl());
+ buffer.append(ArchiveFileConstants.SINGLE_SPACE);
+ buffer.append(getMimetype4Cdx(h));
+ buffer.append(ArchiveFileConstants.SINGLE_SPACE);
+ buffer.append(getStatusCode4Cdx(h));
+ buffer.append(ArchiveFileConstants.SINGLE_SPACE);
+ buffer.append(getDigest4Cdx(h));
+ buffer.append(ArchiveFileConstants.SINGLE_SPACE);
+ buffer.append(h.getOffset());
+ buffer.append(ArchiveFileConstants.SINGLE_SPACE);
+ buffer.append(h.getLength());
+ buffer.append(ArchiveFileConstants.SINGLE_SPACE);
+ buffer.append(strippedFileName != null? strippedFileName: '-');
+ return buffer.toString();
+ }
+
+ /**
+ * Writes output on STDOUT.
+ * @throws IOException
+ */
+ public void dump()
+ throws IOException {
+ dump(System.out);
+ }
+
+ /**
+ * Writes output on passed os.
+ * @throws IOException
+ */
+ public void dump(final OutputStream os)
+ throws IOException {
+ final byte [] outputBuffer = new byte [16*1024];
+ int read = outputBuffer.length;
+ while ((read = read(outputBuffer, 0, outputBuffer.length)) != -1) {
+ os.write(outputBuffer, 0, read);
+ }
+ os.flush();
+ }
+
+ /**
+ * Is it likely that this record contains headers?
+ * This method will return true if the body is a http response that includes
+ * http response headers or the body is a http request that includes request
+ * headers, etc. Be aware that headers in content are distinct from
+ * {@link ArchiveRecordHeader} 'headers'.
+ * @return True if this Record's content has headers:
+ */
+ public boolean hasContentHeaders() {
+ final String url = getHeader().getUrl();
+ if (url == null) {
+ return false;
+ }
+
+ if (!url.toLowerCase().startsWith("http")) {
+ return false;
+ }
+
+ if (getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
+ return false;
+ }
+
+ return true;
+ }
+
+ protected void setBodyOffset(int bodyOffset) {
+ this.position = bodyOffset;
+ }
+}
diff --git a/src/main/java/org/archive/io/ArchiveRecordHeader.java b/src/main/java/org/archive/io/ArchiveRecordHeader.java
new file mode 100644
index 00000000..953537b1
--- /dev/null
+++ b/src/main/java/org/archive/io/ArchiveRecordHeader.java
@@ -0,0 +1,111 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Archive Record Header.
+ * @author stack
+ * @version $Date$ $Version$
+ */
+public interface ArchiveRecordHeader {
+ /**
+ * Get the time when the record was created.
+ * @return Date in 14 digit time format (UTC).
+ * @see org.archive.util.ArchiveUtils#parse14DigitDate(String)
+ */
+ public abstract String getDate();
+
+ /**
+ * @return Return length of record.
+ */
+ public abstract long getLength();
+
+ /**
+ * @return Return Content-Length of the contents of the record.
+ */
+ public abstract long getContentLength();
+
+
+ /**
+ * @return Record subject-url.
+ */
+ public abstract String getUrl();
+
+ /**
+ * @return Record mimetype.
+ */
+ public abstract String getMimetype();
+
+ /**
+ * @return Record version.
+ */
+ public abstract String getVersion();
+
+ /**
+ * @return Offset into Archive file at which this record begins.
+ */
+ public abstract long getOffset();
+
+ /**
+ * @param key Key to use looking up field value.
+ * @return Value for passed key or null if no such entry.
+ */
+ public abstract Object getHeaderValue(final String key);
+
+ /**
+ * @return Header field name keys.
+ */
+ public abstract Set getHeaderFieldKeys();
+
+ /**
+ * @return Map of header fields.
+ */
+ public abstract Map getHeaderFields();
+
+ /**
+ * @return Returns identifier for current Archive file. Be aware this
+ * may not be a file name or file path. It may just be an URL. Depends
+ * on how Archive file was made.
+ */
+ public abstract String getReaderIdentifier();
+
+ /**
+ * @return Identifier for the record. If ARC, the URL + date. If WARC,
+ * the GUID assigned.
+ */
+ public abstract String getRecordIdentifier();
+
+ /**
+ * @return Returns digest as String for this record. Only available after
+ * the record has been read in totality.
+ */
+ public abstract String getDigest();
+
+ /**
+ * Offset at which the content begins.
+ * For ARCs, it's used to delimit where http headers end and content begins.
+ * For WARCs, it's the end of Named Fields before payload starts.
+ * @return Offset at which the content begins.
+ */
+ public int getContentBegin();
+
+ /**
+ * @return String form of this header.
+ */
+ public abstract String toString();
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/ArraySeekInputStream.java b/src/main/java/org/archive/io/ArraySeekInputStream.java
new file mode 100644
index 00000000..5b30747e
--- /dev/null
+++ b/src/main/java/org/archive/io/ArraySeekInputStream.java
@@ -0,0 +1,106 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.IOException;
+
+
+/**
+ * A repositionable stream backed by an array.
+ *
+ * @author pjack
+ */
+public class ArraySeekInputStream extends SeekInputStream {
+
+
+    /**
+     * The array of bytes to read from.
+     */
+    private byte[] array;
+
+
+    /**
+     * The offset in the array of the next byte to read.
+     */
+    private int offset;
+
+
+    /**
+     * Constructor. Note that changes to the given array will be reflected
+     * in the stream.
+     *
+     * @param array The array to read bytes from.
+     */
+    public ArraySeekInputStream(byte[] array) {
+        this.array = array;
+        this.offset = 0;
+    }
+
+
+    /**
+     * Reads the next byte.
+     *
+     * @return the next byte as a value 0..255, or -1 at end of the array
+     */
+    @Override
+    public int read() {
+        if (offset >= array.length) {
+            return -1;
+        }
+        int r = array[offset] & 0xFF;
+        offset++;
+        return r;
+    }
+
+
+    /**
+     * Copies up to <code>len</code> bytes from the array into
+     * <code>buf</code>.
+     *
+     * @param buf the buffer to copy into
+     * @param ofs the offset in <code>buf</code> to start writing at
+     * @param len the maximum number of bytes to copy
+     * @return the number of bytes copied; 0 if <code>len</code> is 0;
+     *         -1 if the end of the array has been reached
+     */
+    @Override
+    public int read(byte[] buf, int ofs, int len) {
+        if (len == 0) {
+            return 0;
+        }
+        if (offset >= array.length) {
+            // InputStream contract: end of data is signalled by -1.
+            // Returning 0 here makes read-until-minus-one loops spin.
+            return -1;
+        }
+        len = Math.min(len, array.length - offset);
+        System.arraycopy(array, offset, buf, ofs, len);
+        offset += len;
+        return len;
+    }
+
+
+    /**
+     * Fills as much of <code>buf</code> as the remaining data allows.
+     *
+     * @param buf the buffer to copy into
+     * @return as for {@link #read(byte[], int, int)}
+     */
+    @Override
+    public int read(byte[] buf) {
+        return read(buf, 0, buf.length);
+    }
+
+
+    /**
+     * Returns the position of the stream.
+     */
+    public long position() {
+        return offset;
+    }
+
+
+    /**
+     * Repositions the stream.
+     *
+     * @param p the new position for the stream
+     * @throws IOException if the given position is out of bounds
+     */
+    public void position(long p) throws IOException {
+        if ((p < 0) || (p > array.length)) {
+            throw new IOException("Invalid position: " + p);
+        }
+        offset = (int)p;
+    }
+
+}
diff --git a/src/main/java/org/archive/io/BufferedSeekInputStream.java b/src/main/java/org/archive/io/BufferedSeekInputStream.java
new file mode 100644
index 00000000..2fdc72b7
--- /dev/null
+++ b/src/main/java/org/archive/io/BufferedSeekInputStream.java
@@ -0,0 +1,217 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+import java.io.IOException;
+
+
+/**
+ * Buffers data from some other SeekInputStream.
+ *
+ * @author pjack
+ */
+public class BufferedSeekInputStream extends SeekInputStream {
+
+
+    /**
+     * The underlying input stream.
+     */
+    final private SeekInputStream input;
+
+
+    /**
+     * The buffered data.
+     */
+    final private byte[] buffer;
+
+
+    /**
+     * The maximum offset of valid data in the buffer. Usually the same
+     * as buffer.length, but may be shorter if we're in the last region
+     * of the stream.
+     */
+    private int maxOffset;
+
+
+    /**
+     * The offset within the buffer of the next byte to read.
+     */
+    private int offset;
+
+
+    /**
+     * Constructor.
+     *
+     * @param input the underlying input stream
+     * @param capacity the size of the buffer
+     * @throws IOException if an IO error occurs filling the first buffer
+     */
+    public BufferedSeekInputStream(SeekInputStream input, int capacity)
+            throws IOException {
+        this.input = input;
+        this.buffer = new byte[capacity];
+        buffer();
+    }
+
+    /**
+     * Fills the buffer from the underlying stream's current position and
+     * resets the read offset to the start of the buffer.
+     *
+     * @throws IOException if an IO error occurs
+     */
+    private void buffer() throws IOException {
+        int remaining = buffer.length;
+        while (remaining > 0) {
+            int r = input.read(buffer, buffer.length - remaining, remaining);
+            if (r <= 0) {
+                // Not enough information to fill the buffer
+                offset = 0;
+                maxOffset = buffer.length - remaining;
+                return;
+            }
+            remaining -= r;
+        }
+        maxOffset = buffer.length;
+        offset = 0;
+    }
+
+
+    /**
+     * Ensures that the buffer is valid, refilling it once every buffered
+     * byte has been consumed.
+     *
+     * @throws IOException if an IO error occurs
+     */
+    private void ensureBuffer() throws IOException {
+        if (offset >= maxOffset) {
+            buffer();
+        }
+    }
+
+
+    /**
+     * Returns the number of unread bytes in the current buffer.
+     *
+     * @return the remaining bytes
+     */
+    private int remaining() {
+        return maxOffset - offset;
+    }
+
+
+    /**
+     * Reads the next byte.
+     *
+     * @return the next byte as a value 0..255, or -1 at end of stream
+     * @throws IOException if an IO error occurs
+     */
+    @Override
+    public int read() throws IOException {
+        ensureBuffer();
+        if (maxOffset == 0) {
+            return -1;
+        }
+        int ch = buffer[offset] & 0xFF;
+        offset++;
+        return ch;
+    }
+
+
+    /**
+     * Copies up to <code>len</code> bytes out of the current buffer. At
+     * most the unread remainder of the buffer is returned per call.
+     *
+     * @param buf the buffer to copy into
+     * @param ofs the offset in <code>buf</code> to start writing at
+     * @param len the maximum number of bytes to copy
+     * @return the number of bytes copied; 0 if <code>len</code> is 0;
+     *         -1 at end of stream
+     * @throws IOException if an IO error occurs
+     */
+    @Override
+    public int read(byte[] buf, int ofs, int len) throws IOException {
+        ensureBuffer();
+        if (maxOffset == 0) {
+            // A refill produced nothing: end of stream. The InputStream
+            // contract signals that with -1 (0 only for a 0-length request).
+            return (len == 0) ? 0 : -1;
+        }
+        len = Math.min(len, remaining());
+        System.arraycopy(buffer, offset, buf, ofs, len);
+        offset += len;
+        return len;
+    }
+
+
+    /**
+     * Fills as much of <code>buf</code> as the current buffer allows.
+     *
+     * @param buf the buffer to copy into
+     * @return as for {@link #read(byte[], int, int)}
+     * @throws IOException if an IO error occurs
+     */
+    @Override
+    public int read(byte[] buf) throws IOException {
+        return read(buf, 0, buf.length);
+    }
+
+
+    /**
+     * Skips forward within the current buffer. Never skips past the end
+     * of the buffered data in a single call.
+     *
+     * @param c the number of bytes the caller would like to skip
+     * @return the number of bytes actually skipped
+     * @throws IOException if an IO error occurs
+     */
+    @Override
+    public long skip(long c) throws IOException {
+        ensureBuffer();
+        if (maxOffset == 0) {
+            return 0;
+        }
+        int count = (c > Integer.MAX_VALUE) ? Integer.MAX_VALUE : (int)c;
+        int skip = Math.min(count, remaining());
+        offset += skip;
+        return skip;
+    }
+
+
+    /**
+     * Returns the stream's current position.
+     *
+     * @return the current position
+     */
+    public long position() throws IOException {
+        // The underlying stream sits just past the buffered data, and only
+        // maxOffset bytes were actually buffered. Subtracting buffer.length
+        // instead gives a wrong answer for a partially filled buffer
+        // (last region of the stream, or end of stream).
+        return input.position() - maxOffset + offset;
+    }
+
+
+    /**
+     * Seeks to the given position. This method avoids re-filling the buffer
+     * if at all possible.
+     *
+     * @param p the position to set
+     * @throws IOException if an IO error occurs
+     */
+    public void position(long p) throws IOException {
+        long blockStart = (input.position() - maxOffset)
+            / buffer.length * buffer.length;
+        long blockEnd = blockStart + maxOffset;
+        if ((p >= blockStart) && (p < blockEnd)) {
+            // Desired position is somewhere inside current buffer
+            long adj = p - blockStart;
+            offset = (int)adj;
+            return;
+        }
+        positionDirect(p);
+    }
+
+
+    /**
+     * Positions the underlying stream at the start of the buffer-sized
+     * block containing the given position, then refills the buffer.
+     *
+     * @param p the position to set
+     * @throws IOException if an IO error occurs
+     */
+    private void positionDirect(long p) throws IOException {
+        long newBlockStart = p / buffer.length * buffer.length;
+        input.position(newBlockStart);
+        buffer();
+        offset = (int)(p % buffer.length);
+    }
+
+    /**
+     * Close the stream, including the wrapped input stream.
+     */
+    public void close() throws IOException {
+        super.close();
+        if(this.input!=null) {
+            this.input.close();
+        }
+    }
+
+
+}
diff --git a/src/main/java/org/archive/io/CharSubSequence.java b/src/main/java/org/archive/io/CharSubSequence.java
new file mode 100644
index 00000000..1e89da56
--- /dev/null
+++ b/src/main/java/org/archive/io/CharSubSequence.java
@@ -0,0 +1,90 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+/**
+ * Provides a subsequence view onto a CharSequence.
+ *
+ * @author gojomo
+ * @version $Revision$, $Date$
+ */
+public class CharSubSequence implements CharSequence {
+
+ protected CharSequence inner;
+ protected int start;
+ protected int end;
+
+ /**
+ * Creates a view onto the characters of <code>inner</code> from index
+ * <code>start</code> up to, but not including, <code>end</code>.
+ * The indices are not checked against the length of <code>inner</code>
+ * here.
+ *
+ * @param inner Sequence to view.
+ * @param start Index in <code>inner</code> of the first character of the view.
+ * @param end Index in <code>inner</code> just past the last character of the view.
+ * @throws IllegalArgumentException If <code>end</code> is before
+ * <code>start</code> or either index is negative.
+ * @throws NullPointerException If <code>inner</code> is null.
+ */
+ public CharSubSequence(CharSequence inner, int start, int end) {
+ if (end < start) {
+ throw new IllegalArgumentException("Start " + start + " is > " +
+ " than end " + end);
+ }
+
+ if (end < 0 || start < 0) {
+ throw new IllegalArgumentException("Start " + start + " or end " +
+ end + " is < 0.");
+ }
+
+ // Arguments are validated in this order, so a null inner combined
+ // with bad indices reports the index problem first.
+ if (inner == null) {
+ throw new NullPointerException("Passed charsequence is null.");
+ }
+
+ this.inner = inner;
+ this.start = start;
+ this.end = end;
+ }
+
+ /*
+ * (non-Javadoc)
+ * @see java.lang.CharSequence#length()
+ */
+ public int length() {
+ // Size of the view: end is exclusive.
+ return this.end - this.start;
+ }
+
+ /*
+ * (non-Javadoc)
+ * @see java.lang.CharSequence#charAt(int)
+ */
+ public char charAt(int index) {
+     // Keep reads inside the view. Without this check a negative or
+     // too-large index silently returns characters of the inner
+     // sequence that lie outside [start, end).
+     if (index < 0 || index >= length()) {
+         throw new IndexOutOfBoundsException("Index " + index +
+             " is outside subsequence of length " + length());
+     }
+     return this.inner.charAt(this.start + index);
+ }
+
+ /*
+ * (non-Javadoc)
+ * @see java.lang.CharSequence#subSequence(int, int)
+ */
+ public CharSequence subSequence(int begin, int finish) {
+ // Wraps this view rather than the inner sequence, so begin and finish
+ // are relative to this subsequence.
+ return new CharSubSequence(this, begin, finish);
+ }
+
+ /*
+ * (non-Javadoc)
+ * @see java.lang.CharSequence#toString()
+ */
+ public String toString() {
+ StringBuffer sb = new StringBuffer(length());
+ // could use StringBuffer.append(CharSequence) if willing to do 1.5 & up
+ for (int i = 0;i filenames;
+
+ /* (non-Javadoc)
+ * @see java.io.InputStream#read()
+ */
+ public int read() throws IOException {
+ int c = super.read();
+ // -1 from the superclass read with more entries left in filenames:
+ // move on to the next file and retry the read there.
+ if( c == -1 && filenames.hasNext() ) {
+ cueStream();
+ return read();
+ }
+ return c;
+ }
+ /* (non-Javadoc)
+ * @see java.io.InputStream#read(byte[], int, int)
+ */
+ public int read(byte[] b, int off, int len) throws IOException {
+ int c = super.read(b, off, len);
+ // -1 from the superclass read with more entries left in filenames:
+ // move on to the next file and retry the same request there.
+ if( c == -1 && filenames.hasNext() ) {
+ cueStream();
+ return read(b,off,len);
+ }
+ return c;
+ }
+ /* (non-Javadoc)
+ * @see java.io.InputStream#read(byte[])
+ */
+ public int read(byte[] b) throws IOException {
+ int c = super.read(b);
+ // -1 from the superclass read with more entries left in filenames:
+ // move on to the next file and retry the same request there.
+ if( c == -1 && filenames.hasNext() ) {
+ cueStream();
+ return read(b);
+ }
+ return c;
+ }
+
+ /* (non-Javadoc)
+ * @see java.io.InputStream#skip(long)
+ */
+ public long skip(long n) throws IOException {
+ long s = super.skip(n);
+ if( s files) throws IOException {
+ super(null);
+ filenames = files.iterator();
+ cueStream();
+ }
+
+ /**
+  * Makes the next entry of <code>filenames</code> the current input,
+  * if there is one. The stream being replaced is closed first so its
+  * file descriptor is released rather than left to finalization.
+  *
+  * @throws IOException If closing the previous stream or opening the
+  * next file fails.
+  */
+ private void cueStream() throws IOException {
+     if(filenames.hasNext()) {
+         if (this.in != null) {
+             this.in.close();
+         }
+         this.in = new FileInputStream(filenames.next());
+     }
+ }
+
+}
diff --git a/src/main/java/org/archive/io/CompositeFileReader.java b/src/main/java/org/archive/io/CompositeFileReader.java
new file mode 100644
index 00000000..14b56219
--- /dev/null
+++ b/src/main/java/org/archive/io/CompositeFileReader.java
@@ -0,0 +1,40 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.List;
+
+
+/**
+ * @author gojomo
+ */
+public class CompositeFileReader extends InputStreamReader {
+
+ /**
+ * Creates a reader over a {@link CompositeFileInputStream} built from
+ * the passed list. No charset is given to the
+ * <code>InputStreamReader</code> constructor, so bytes are decoded with
+ * the platform default charset.
+ *
+ * @param filenames List handed as-is to the
+ * <code>CompositeFileInputStream</code> constructor.
+ * @throws IOException If constructing the underlying stream fails.
+ */
+ public CompositeFileReader(List filenames) throws IOException {
+ super(new CompositeFileInputStream(filenames));
+ }
+
+}
diff --git a/src/main/java/org/archive/io/Endian.java b/src/main/java/org/archive/io/Endian.java
new file mode 100644
index 00000000..f6d89aaa
--- /dev/null
+++ b/src/main/java/org/archive/io/Endian.java
@@ -0,0 +1,125 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+
+
+/**
+ * Reads integers stored in big or little endian streams.
+ *
+ * @author pjack
+ */
+public class Endian {
+
+
+ /**
+ * Static utility class.
+ */
+ private Endian() {
+ }
+
+
+ /**
+ * Reads the next little-endian unsigned 16 bit integer from the
+ * given stream. The first byte read is the low byte, the second the
+ * high byte.
+ *
+ * @param input the input stream to read from
+ * @return the next 16-bit little-endian integer
+ * @throws EOFException if the stream ends before two bytes are read
+ * @throws IOException if an IO error occurs
+ */
+ public static char littleChar(InputStream input) throws IOException {
+ int lo = input.read();
+ if (lo < 0) {
+ throw new EOFException();
+ }
+ int hi = input.read();
+ if (hi < 0) {
+ throw new EOFException();
+ }
+ // char is Java's unsigned 16-bit type, hence its use as the result.
+ return (char)((hi << 8) | lo);
+ }
+
+
+ /**
+ * Reads the next little-endian signed 16-bit integer from the
+ * given stream.
+ *
+ * @param input the input stream to read from
+ * @return the next 16-bit little-endian integer
+ * @throws IOException if an IO error occurs
+ */
+ public static short littleShort(InputStream input) throws IOException {
+ // Same 16 bits as littleChar, reinterpreted as signed.
+ return (short)littleChar(input);
+ }
+
+
+ /**
+ * Reads the next little-endian signed 32-bit integer from the
+ * given stream.
+ *
+ * @param input the input stream to read from
+ * @return the next 32-bit little-endian integer
+ * @throws IOException if an IO error occurs
+ */
+ public static int littleInt(InputStream input) throws IOException {
+ // Little-endian: the low 16-bit half comes first in the stream.
+ char lo = littleChar(input);
+ char hi = littleChar(input);
+ // Both chars are promoted to int (zero-extended) before the shift.
+ return (hi << 16) | lo;
+ }
+
+
+ /**
+ * Reads the next big-endian unsigned 16 bit integer from the
+ * given stream. The first byte read is the high byte, the second the
+ * low byte.
+ *
+ * @param input the input stream to read from
+ * @return the next 16-bit big-endian integer
+ * @throws EOFException if the stream ends before two bytes are read
+ * @throws IOException if an IO error occurs
+ */
+ public static char bigChar(InputStream input) throws IOException {
+ int hi = input.read();
+ if (hi < 0) {
+ throw new EOFException();
+ }
+ int lo = input.read();
+ if (lo < 0) {
+ throw new EOFException();
+ }
+ return (char)((hi << 8) | lo);
+ }
+
+
+ /**
+ * Reads the next big-endian signed 32-bit integer from the
+ * given stream.
+ *
+ * @param input the input stream to read from
+ * @return the next 32-bit big-endian integer
+ * @throws IOException if an IO error occurs
+ */
+ public static int bigInt(InputStream input) throws IOException {
+ // Big-endian: the high 16-bit half comes first in the stream.
+ char hi = bigChar(input);
+ char lo = bigChar(input);
+ // Both chars are promoted to int (zero-extended) before the shift.
+ return (hi << 16) | lo;
+ }
+}
diff --git a/src/main/java/org/archive/io/GZIPMembersInputStream.java b/src/main/java/org/archive/io/GZIPMembersInputStream.java
new file mode 100644
index 00000000..35fb9e90
--- /dev/null
+++ b/src/main/java/org/archive/io/GZIPMembersInputStream.java
@@ -0,0 +1,38 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * @deprecated use {@link org.archive.util.zip.GZIPMembersInputStream}
+ */
+@Deprecated
+public class GZIPMembersInputStream extends org.archive.util.zip.GZIPMembersInputStream {
+
+ /**
+ * Passes straight through to the superclass constructor.
+ *
+ * @param in Stream handed to the superclass.
+ * @throws IOException If the superclass constructor throws.
+ */
+ public GZIPMembersInputStream(InputStream in) throws IOException {
+ super(in);
+ }
+
+ /**
+ * Passes straight through to the superclass constructor.
+ *
+ * @param in Stream handed to the superclass.
+ * @param size Handed to the superclass unchanged (presumably a buffer
+ * size; confirm against the superclass).
+ * @throws IOException If the superclass constructor throws.
+ */
+ public GZIPMembersInputStream(InputStream in, int size) throws IOException {
+ super(in, size);
+ }
+
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/GenerationFileHandler.java b/src/main/java/org/archive/io/GenerationFileHandler.java
new file mode 100644
index 00000000..c1ce8d79
--- /dev/null
+++ b/src/main/java/org/archive/io/GenerationFileHandler.java
@@ -0,0 +1,200 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.logging.FileHandler;
+import java.util.logging.Formatter;
+import java.util.logging.LogRecord;
+
+import org.archive.util.FileUtils;
+
+
+/**
+ * FileHandler with support for rotating the current file to
+ * an archival name with a specified integer suffix, and
+ * provision of a new replacement FileHandler with the current
+ * filename.
+ *
+ * @author gojomo
+ */
+public class GenerationFileHandler extends FileHandler {
+ private LinkedList filenameSeries = new LinkedList();
+ private boolean shouldManifest = false;
+
+ /**
+ * @return Returns the filenameSeries.
+ */
+ public List getFilenameSeries() {
+ return filenameSeries;
+ }
+
+ /**
+ * Constructor.
+ * @param pattern
+ * @param append
+ * @param shouldManifest
+ * @throws IOException
+ * @throws SecurityException
+ */
+ public GenerationFileHandler(String pattern, boolean append,
+ boolean shouldManifest)
+ throws IOException, SecurityException {
+ super(pattern, append);
+ filenameSeries.addFirst(pattern);
+ this.shouldManifest = shouldManifest;
+ }
+
+ /**
+ * @param filenameSeries
+ * @param shouldManifest
+ * @throws IOException
+ */
+ public GenerationFileHandler(LinkedList filenameSeries,
+ boolean shouldManifest)
+ throws IOException {
+ super((String)filenameSeries.getFirst(), false); // Never append in this case
+ this.filenameSeries = filenameSeries;
+ this.shouldManifest = shouldManifest;
+ }
+
+ /**
+ * Move the current file to a new filename with the storeSuffix in place
+ * of the activeSuffix; continuing logging to a new file under the
+ * original filename.
+ *
+ * @param storeSuffix Suffix to put in place of activeSuffix
+ * @param activeSuffix Suffix to replace with storeSuffix.
+ * @return GenerationFileHandler instance.
+ * @throws IOException
+ */
+ public GenerationFileHandler rotate(String storeSuffix,
+ String activeSuffix)
+ throws IOException {
+ return rotate(storeSuffix, activeSuffix, false);
+ }
+
+ public GenerationFileHandler rotate(String storeSuffix,
+ String activeSuffix, boolean mergeOld) throws IOException {
+ close();
+ String filename = (String) filenameSeries.getFirst();
+ if (!filename.endsWith(activeSuffix)) {
+ throw new FileNotFoundException("Active file does not have"
+ + " expected suffix");
+ }
+ String storeFilename = filename.substring(0, filename.length()
+ - activeSuffix.length())
+ + storeSuffix;
+ File activeFile = new File(filename);
+ File storeFile = new File(storeFilename);
+ FileUtils.moveAsideIfExists(storeFile);
+
+ if (mergeOld) {
+ File fileToAppendTo = new File(filenameSeries.getLast());
+ for (int i = filenameSeries.size() - 2; i >= 0; i--) {
+ File f = new File(filenameSeries.get(i));
+ FileUtils.appendTo(fileToAppendTo, f);
+ f.delete();
+ }
+ filenameSeries.clear();
+ filenameSeries.add(filename);
+ if (!fileToAppendTo.renameTo(storeFile)) {
+ throw new IOException("Unable to move " + fileToAppendTo + " to "
+ + storeFilename);
+ }
+ } else {
+ if (!activeFile.renameTo(storeFile)) {
+ throw new IOException("Unable to move " + filename + " to "
+ + storeFilename);
+ }
+ }
+ filenameSeries.add(1, storeFilename);
+ GenerationFileHandler newGfh = new GenerationFileHandler(
+ filenameSeries, shouldManifest);
+ newGfh.setFormatter(this.getFormatter());
+ return newGfh;
+ }
+
+ /**
+ * @return True if should manifest.
+ */
+ public boolean shouldManifest() {
+ return this.shouldManifest;
+ }
+
+ /**
+ * Constructor-helper that rather than clobbering any existing
+ * file, moves it aside with a timestamp suffix.
+ *
+ * @param filename
+ * @param append
+ * @param shouldManifest
+ * @return
+ * @throws SecurityException
+ * @throws IOException
+ */
+ public static GenerationFileHandler makeNew(String filename, boolean append, boolean shouldManifest) throws SecurityException, IOException {
+ FileUtils.moveAsideIfExists(new File(filename));
+ return new GenerationFileHandler(filename, append, shouldManifest);
+ }
+
+ @Override
+ public void publish(LogRecord record) {
+ // when possible preformat outside synchronized superclass method
+ // (our most involved UriProcessingFormatter can cache result)
+ Formatter f = getFormatter();
+ if(!(f instanceof Preformatter)) {
+ super.publish(record);
+ } else {
+ try {
+ ((Preformatter)f).preformat(record);
+ super.publish(record);
+ } finally {
+ ((Preformatter)f).clear();
+ }
+ }
+ }
+//
+// TODO: determine if there's another way to have this optimization without
+// negative impact on log-following (esp. in web UI)
+// /**
+// * Flush only 1/100th of the usual once-per-record, to reduce the time
+// * spent holding the synchronization lock. (Flush is primarily called in
+// * a superclass's synchronized publish()).
+// *
+// * The eventual close calls a direct flush on the target writer, so all
+// * rotates/ends will ultimately be fully flushed.
+// *
+// * @see java.util.logging.StreamHandler#flush()
+// */
+// @Override
+// public synchronized void flush() {
+// flushCount++;
+// if(flushCount==100) {
+// super.flush();
+// flushCount=0;
+// }
+// }
+// int flushCount;
+
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/GenericReplayCharSequence.java b/src/main/java/org/archive/io/GenericReplayCharSequence.java
new file mode 100644
index 00000000..1af3922b
--- /dev/null
+++ b/src/main/java/org/archive/io/GenericReplayCharSequence.java
@@ -0,0 +1,412 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.CharBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.text.NumberFormat;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.commons.io.IOUtils;
+import org.archive.util.DevUtils;
+
+import com.google.common.base.Charsets;
+import com.google.common.primitives.Ints;
+
+/**
+ * (Replay)CharSequence view on recorded streams.
+ *
+ * For small streams, use {@link InMemoryReplayCharSequence}.
+ *
+ * <p>Call {@link #close()} on this class when done to clean up resources.
+ *
+ * @contributor stack
+ * @contributor nlevitt
+ * @version $Revision$, $Date$
+ */
+public class GenericReplayCharSequence implements ReplayCharSequence {
+
+    protected static Logger logger = Logger
+            .getLogger(GenericReplayCharSequence.class.getName());
+
+    /**
+     * Name of the encoding we use writing out concatenated decoded prefix
+     * buffer and decoded backing file.
+     *
+     * <p>This define is also used as suffix for the file that holds the
+     * decodings. The name of the file that holds the decoding is the name
+     * of the backing file w/ this encoding for a suffix.
+     *
+     * <p>See the JDK documentation on supported character encodings.
+     */
+    public static final Charset WRITE_ENCODING = Charsets.UTF_16BE;
+
+    private static final long MAP_MAX_BYTES = 64 * 1024 * 1024; // 64M
+
+    /**
+     * When the memory map moves away from the beginning of the file
+     * (to the "right") in order to reach a certain index, it will
+     * map up to this many bytes preceding (to the left of) the target character.
+     * Consequently it will map up to
+     * <code>MAP_MAX_BYTES - MAP_TARGET_LEFT_PADDING_BYTES</code>
+     * bytes to the right of the target.
+     */
+    private static final long MAP_TARGET_LEFT_PADDING_BYTES = (long) (MAP_MAX_BYTES * 0.01);
+
+    /**
+     * Total length of character stream to replay minus the HTTP headers
+     * if present.
+     *
+     * If the backing file is larger than <code>Integer.MAX_VALUE</code> (i.e. 2gb),
+     * only the first <code>Integer.MAX_VALUE</code> characters are available through this API.
+     * We're overriding <code>java.lang.CharSequence</code> so that we can use
+     * <code>java.util.regex</code> directly on the data, and the <code>CharSequence</code>
+     * API uses <code>int</code> for the length and index.
+     */
+    protected int length;
+
+    /** counter of decoding exceptions for report at end */
+    protected long decodingExceptions = 0;
+    protected CharacterCodingException codingException = null;
+
+    /**
+     * Byte offset into the file where the memory mapped portion begins.
+     */
+    private long mapByteOffset;
+
+    // XXX do we need to keep the input stream around?
+    private FileInputStream backingFileIn = null;
+
+    private FileChannel backingFileChannel = null;
+
+    /** Bytes per character in the decoded file; set to 2 by the constructor. */
+    private long bytesPerChar;
+
+    private CharBuffer mappedBuffer = null;
+
+    /**
+     * File that has decoded content.
+     *
+     * Keep it around so we can remove on close.
+     */
+    private File decodedFile = null;
+
+    /*
+     * This portion of the CharSequence precedes what's in the backing file. In
+     * cases where we decodeToFile(), this is always empty, because we decode
+     * the entire input stream.
+     */
+    private CharBuffer prefixBuffer = null;
+
+    private boolean isOpen = true;
+
+    protected Charset charset = null;
+
+    /**
+     * Constructor.
+     *
+     * @param contentReplayInputStream inputStream of content
+     * @param prefixMax maximum number of decoded characters held in memory;
+     *        anything beyond that is written to the decoded overflow file
+     * @param backingFilename Path used to derive the name of the decoded
+     *        overflow file (this name plus "." plus the write encoding).
+     * @param charset Encoding to use reading the passed stream. If null,
+     *        <code>ReplayCharSequence.FALLBACK_CHARSET</code> is used.
+     *
+     * @throws IOException
+     */
+    public GenericReplayCharSequence(InputStream contentReplayInputStream,
+            int prefixMax,
+            String backingFilename,
+            Charset charset) throws IOException {
+        super();
+        logger.fine("characterEncoding=" + charset + " backingFilename="
+                + backingFilename);
+
+        if (charset == null) {
+            charset = ReplayCharSequence.FALLBACK_CHARSET;
+        }
+        // decodes only up to Integer.MAX_VALUE characters
+        decode(contentReplayInputStream, prefixMax, backingFilename, charset);
+
+        this.bytesPerChar = 2;
+
+        // More characters than the prefix holds: the rest lives in decodedFile.
+        if (length > prefixBuffer.position()) {
+            this.backingFileIn = new FileInputStream(decodedFile);
+            this.backingFileChannel = backingFileIn.getChannel();
+            this.mapByteOffset = 0;
+            updateMemoryMappedBuffer();
+        }
+    }
+
+    /**
+     * (Re)maps the window of the decoded file starting at
+     * <code>mapByteOffset</code>, at most <code>MAP_MAX_BYTES</code> long.
+     *
+     * @throws RuntimeException wrapping the IOException if mapping fails
+     */
+    private void updateMemoryMappedBuffer() {
+        long charLength = (long) this.length() - (long) prefixBuffer.limit(); // in characters
+        long mapSize = Math.min((charLength * bytesPerChar) - mapByteOffset, MAP_MAX_BYTES);
+        logger.fine("updateMemoryMappedBuffer: mapOffset="
+                + NumberFormat.getInstance().format(mapByteOffset)
+                + " mapSize=" + NumberFormat.getInstance().format(mapSize));
+        try {
+            // TODO: stress-test without these possibly-costly requests!
+//            System.gc();
+//            System.runFinalization();
+            // TODO: Confirm the READ_ONLY works. I recall it not working.
+            // The buffers seem to always say that the buffer is writable.
+            mappedBuffer = backingFileChannel.map(
+                    FileChannel.MapMode.READ_ONLY, mapByteOffset, mapSize)
+                    .asReadOnlyBuffer().asCharBuffer();
+        } catch (IOException e) {
+            // TODO convert this to a runtime error?
+            DevUtils.logger.log(Level.SEVERE,
+                    " backingFileChannel.map() mapByteOffset=" + mapByteOffset
+                            + " mapSize=" + mapSize + "\n" + "decodedFile="
+                            + decodedFile + " length=" + length + "\n"
+                            + DevUtils.extraInfo(), e);
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Decodes <code>inStream</code> using <code>charset</code>. The first
+     * <code>prefixMax</code> characters go into the in-memory prefix buffer;
+     * any further characters are written, encoded as
+     * <code>WRITE_ENCODING</code>, to <code>this.decodedFile</code>, which is
+     * named <code>backingFilename + "." + WRITE_ENCODING</code>. The recorded
+     * length is capped at <code>Integer.MAX_VALUE</code> characters.
+     *
+     * @throws IOException
+     */
+    protected void decode(InputStream inStream, int prefixMax,
+            String backingFilename, Charset charset) throws IOException {
+
+        this.charset = charset;
+
+        // TODO: consider if BufferedReader is helping any
+        // TODO: consider adding TBW 'LimitReader' to stop reading at
+        // Integer.MAX_VALUE characters because of charAt(int) limit
+        BufferedReader reader = new BufferedReader(new InputStreamReader(
+                inStream, charset));
+
+        logger.fine("backingFilename=" + backingFilename + " encoding="
+                + charset + " decodedFile=" + decodedFile);
+
+        this.prefixBuffer = CharBuffer.allocate(prefixMax);
+
+        long count = 0;
+        while (count < prefixMax) {
+            int read = reader.read(prefixBuffer);
+            if (read < 0) {
+                break;
+            }
+            count += read;
+        }
+
+        // One more char tells us whether anything overflows the prefix.
+        int ch = reader.read();
+        if (ch >= 0) {
+            count++;
+
+            // more to decode to file overflow
+            this.decodedFile = new File(backingFilename + "." + WRITE_ENCODING);
+
+            FileOutputStream fos;
+            try {
+                fos = new FileOutputStream(this.decodedFile);
+            } catch (FileNotFoundException e) {
+                // Windows workaround attempt
+                System.gc();
+                System.runFinalization();
+                this.decodedFile = new File(decodedFile.getAbsolutePath()+".win");
+                logger.info("Windows 'file with a user-mapped section open' "
+                        + "workaround gc/finalization/name-extension performed.");
+                // try again
+                fos = new FileOutputStream(this.decodedFile);
+            }
+
+            Writer writer = new OutputStreamWriter(fos, WRITE_ENCODING);
+            try {
+                writer.write(ch);
+                count += IOUtils.copyLarge(reader, writer);
+                // Explicit close so that a failing flush is reported.
+                writer.close();
+                reader.close();
+            } finally {
+                // Releases the file handle if the copy above threw;
+                // harmless no-op after a successful close.
+                IOUtils.closeQuietly(writer);
+            }
+        }
+
+        this.length = Ints.saturatedCast(count);
+        if (count > Integer.MAX_VALUE) {
+            logger.warning("input stream is longer than Integer.MAX_VALUE="
+                    + NumberFormat.getInstance().format(Integer.MAX_VALUE)
+                    + " characters -- only first "
+                    + NumberFormat.getInstance().format(Integer.MAX_VALUE)
+                    + " are accessible through this GenericReplayCharSequence");
+        }
+
+        // position() is the number of chars stored in the prefix; length()
+        // would be the remaining capacity, which is not what we want to report.
+        logger.fine("decode: decoded " + count + " characters" +
+                ((decodedFile==null) ? ""
+                        : " ("+(count-prefixBuffer.position())+" to "+decodedFile+")"));
+    }
+
+    /**
+     * Get character at passed absolute position.
+     * @param index Index into content
+     * @return Character at offset <code>index</code>.
+     * @throws IndexOutOfBoundsException if index is negative or not less
+     *         than {@link #length()}
+     */
+    public char charAt(int index) {
+        if (index < 0 || index >= this.length()) {
+            throw new IndexOutOfBoundsException("index=" + index
+                    + " - should be between 0 and length()=" + this.length());
+        }
+
+        // is it in the buffer
+        if (index < prefixBuffer.limit()) {
+            return prefixBuffer.get(index);
+        }
+
+        // otherwise we gotta get it from disk via memory map
+        long charFileIndex = (long) index - (long) prefixBuffer.limit();
+        long charFileLength = (long) this.length() - (long) prefixBuffer.limit(); // in characters
+        if (charFileIndex * bytesPerChar < mapByteOffset) {
+            logger.log(Level.WARNING,"left-fault; probably don't want to use CharSequence that far backward");
+        }
+        if (charFileIndex * bytesPerChar < mapByteOffset
+                || charFileIndex - (mapByteOffset / bytesPerChar) >= mappedBuffer.limit()) {
+            // fault
+            /*
+             * mapByteOffset is bounded by 0 and file size +/- size of the map,
+             * and starts as close to <code>fileIndex -
+             * MAP_TARGET_LEFT_PADDING_BYTES</code> as it can while also not
+             * being smaller than it needs to be.
+             */
+            mapByteOffset = Math.min(charFileIndex * bytesPerChar - MAP_TARGET_LEFT_PADDING_BYTES,
+                    charFileLength * bytesPerChar - MAP_MAX_BYTES);
+            mapByteOffset = Math.max(0, mapByteOffset);
+            updateMemoryMappedBuffer();
+        }
+
+        return mappedBuffer.get((int)(charFileIndex-(mapByteOffset/bytesPerChar)));
+    }
+
+    public CharSequence subSequence(int start, int end) {
+        return new CharSubSequence(this, start, end);
+    }
+
+    private void deleteFile(File fileToDelete) {
+        deleteFile(fileToDelete, null);
+    }
+
+    /**
+     * Deletes the file if it is non-null and exists.
+     *
+     * @param fileToDelete file to remove; may be null
+     * @param e if non-null, logged as the reason for the delete
+     */
+    private void deleteFile(File fileToDelete, final Exception e) {
+        if (e != null) {
+            // Log why the delete to help with debug of
+            // java.io.FileNotFoundException:
+            // ....tt53http.ris.UTF-16BE.
+            logger.severe("Deleting " + fileToDelete + " because of "
+                    + e.toString());
+        }
+        if (fileToDelete != null && fileToDelete.exists()) {
+            logger.fine("deleting file: " + fileToDelete);
+            fileToDelete.delete();
+        }
+    }
+
+
+    @Override
+    public boolean isOpen() {
+        return this.isOpen;
+    }
+
+    /**
+     * Closes the backing channel and stream and deletes the decoded file.
+     * The file is deleted even when closing the channel or stream fails.
+     * Safe to call more than once.
+     *
+     * @throws IOException if closing the channel or stream fails
+     */
+    public void close() throws IOException {
+        this.isOpen = false;
+
+        logger.fine("closing");
+
+        try {
+            if (this.backingFileChannel != null && this.backingFileChannel.isOpen()) {
+                this.backingFileChannel.close();
+            }
+        } finally {
+            try {
+                if (backingFileIn != null) {
+                    backingFileIn.close();
+                }
+            } finally {
+                deleteFile(this.decodedFile);
+
+                // clear decodedFile -- so that double-close (as in finalize()) won't
+                // delete a later instance with same name see bug [ 1218961 ]
+                // "failed get of replay" in ExtractorHTML... usu: UTF-16BE
+                this.decodedFile = null;
+            }
+        }
+    }
+
+    /*
+     * (non-Javadoc)
+     *
+     * @see java.lang.Object#finalize()
+     */
+    protected void finalize() throws Throwable {
+        try {
+            logger.fine("finalizing");
+            close();
+        } finally {
+            // run the superclass finalizer even if close() failed
+            super.finalize();
+        }
+    }
+
+    /**
+     * Convenience method for getting a substring.
+     *
+     * @deprecated please use subSequence() and then toString() directly
+     */
+    public String substring(int offset, int len) {
+        return subSequence(offset, offset + len).toString();
+    }
+
+    public String toString() {
+        StringBuilder sb = new StringBuilder(this.length());
+        sb.append(this);
+        return sb.toString();
+    }
+
+    public int length() {
+        return length;
+    }
+
+    /* (non-Javadoc)
+     * @see org.archive.io.ReplayCharSequence#getDecodeExceptionCount()
+     */
+    @Override
+    public long getDecodeExceptionCount() {
+        return decodingExceptions;
+    }
+
+
+    /* (non-Javadoc)
+     * @see org.archive.io.ReplayCharSequence#getCodingException()
+     */
+    @Override
+    public CharacterCodingException getCodingException() {
+        return codingException;
+    }
+
+    /* (non-Javadoc)
+     * @see org.archive.io.ReplayCharSequence#getCharset()
+     */
+    public Charset getCharset() {
+        return charset;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/GzipHeader.java b/src/main/java/org/archive/io/GzipHeader.java
new file mode 100644
index 00000000..6b8263bc
--- /dev/null
+++ b/src/main/java/org/archive/io/GzipHeader.java
@@ -0,0 +1,26 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+/**
+ * Empty subclass that keeps the old {@code org.archive.io.GzipHeader} name
+ * available; it declares no members of its own.
+ *
+ * @deprecated use {@link org.archive.util.zip.GzipHeader}
+ */
+@Deprecated
+public class GzipHeader extends org.archive.util.zip.GzipHeader {
+}
diff --git a/src/main/java/org/archive/io/HeaderedArchiveRecord.java b/src/main/java/org/archive/io/HeaderedArchiveRecord.java
new file mode 100644
index 00000000..3cce595b
--- /dev/null
+++ b/src/main/java/org/archive/io/HeaderedArchiveRecord.java
@@ -0,0 +1,423 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.PrintStream;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpParser;
+import org.apache.commons.httpclient.StatusLine;
+import org.apache.commons.httpclient.util.EncodingUtil;
+import org.archive.io.arc.ARCConstants;
+import org.archive.util.LaxHttpParser;
+
+/**
+ * An ArchiveRecord whose content has a preamble of RFC822-like headers: e.g.
+ * The ArchiveRecord is a http response that leads off with http response
+ * headers. Use this ArchiveRecord Decorator to get at the content headers and
+ * the header/content demarcation.
+ *
+ * @author stack
+ * @author Olaf Freyer
+ */
+public class HeaderedArchiveRecord extends ArchiveRecord {
+ private int contentHeadersLength = -1;
+ private int statusCode = -1;
+
+ /**
+ * Http header bytes.
+ *
+ * If non-null and bytes available, give out its contents before we
+ * go back to the underlying stream.
+ */
+ private InputStream contentHeaderStream = null;
+
+ /**
+ * Content headers.
+ *
+ * Only available after the reading of headers.
+ */
+ private Header [] contentHeaders = null;
+
+
+ public HeaderedArchiveRecord(final ArchiveRecord ar) throws IOException {
+ super(ar);
+ }
+
+ public HeaderedArchiveRecord(final ArchiveRecord ar,
+ final boolean readContentHeader) throws IOException {
+ super(ar);
+ if (readContentHeader) {
+ this.contentHeaderStream = readContentHeaders();
+ }
+ }
+
+ /**
+ * Skip over the the content headers if present.
+ *
+ * Subsequent reads will get the body.
+ *
+ *
Calling this method in the midst of reading the header
+ * will make for strange results. Otherwise, safe to call
+ * at any time though before reading any of the record
+ * content is only time that it makes sense.
+ *
+ *
After calling this method, you can call
+ * {@link #getContentHeaders()} to get the read http header.
+ *
+ * @throws IOException
+ */
+ public void skipHttpHeader() throws IOException {
+ if (this.contentHeaderStream == null) {
+ return;
+ }
+ // Empty the contentHeaderStream
+ for (int available = this.contentHeaderStream.available();
+ this.contentHeaderStream != null
+ && (available = this.contentHeaderStream.available()) > 0;) {
+ // We should be in this loop once only we should only do this
+ // buffer allocation once.
+ byte[] buffer = new byte[available];
+ // The read nulls out httpHeaderStream when done with it so
+ // need check for null in the loop control line.
+ read(buffer, 0, available);
+ }
+ }
+
+ public void dumpHttpHeader() throws IOException {
+ dumpHttpHeader(System.out);
+ }
+
+ public void dumpHttpHeader(final PrintStream stream) throws IOException {
+ if (this.contentHeaderStream == null) {
+ return;
+ }
+ // Dump the httpHeaderStream to STDOUT
+ for (int available = this.contentHeaderStream.available();
+ this.contentHeaderStream != null
+ && (available = this.contentHeaderStream.available()) > 0;) {
+ // We should be in this loop only once and should do this
+ // buffer allocation once.
+ byte[] buffer = new byte[available];
+ // The read nulls out httpHeaderStream when done with it so
+ // need check for null in the loop control line.
+ int read = read(buffer, 0, available);
+ stream.write(buffer, 0, read);
+ }
+ }
+
+ /**
+ * Read header if present. Technique borrowed from HttpClient HttpParse
+ * class. Using http parser code for now. Later move to more generic header
+ * parsing code if there proves a need.
+ *
+ * @return ByteArrayInputStream with the http header in it or null if no
+ * http header.
+ * @throws IOException
+ */
+ private InputStream readContentHeaders() throws IOException {
+ // If judged a record that doesn't have an http header, return
+ // immediately.
+ if (!hasContentHeaders()) {
+ return null;
+ }
+ byte [] statusBytes = LaxHttpParser.readRawLine(getIn());
+ int eolCharCount = getEolCharsCount(statusBytes);
+ if (eolCharCount <= 0) {
+ throw new IOException("Failed to read raw lie where one " +
+ " was expected: " + new String(statusBytes));
+ }
+ String statusLine = EncodingUtil.getString(statusBytes, 0,
+ statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
+ if (statusLine == null) {
+ throw new NullPointerException("Expected status line is null");
+ }
+ // TODO: Tighten up this test.
+ boolean isHttpResponse = StatusLine.startsWithHTTP(statusLine);
+ boolean isHttpRequest = false;
+ if (!isHttpResponse) {
+ isHttpRequest = statusLine.toUpperCase().startsWith("GET") ||
+ !statusLine.toUpperCase().startsWith("POST");
+ }
+ if (!isHttpResponse && !isHttpRequest) {
+ throw new UnexpectedStartLineIOException("Failed parse of " +
+ "status line: " + statusLine);
+ }
+ this.statusCode = isHttpResponse?
+ (new StatusLine(statusLine)).getStatusCode(): -1;
+
+ // Save off all bytes read. Keep them as bytes rather than
+ // convert to strings so we don't have to worry about encodings
+ // though this should never be a problem doing http headers since
+ // its all supposed to be ascii.
+ ByteArrayOutputStream baos =
+ new ByteArrayOutputStream(statusBytes.length + 4 * 1024);
+ baos.write(statusBytes);
+
+ // Now read rest of the header lines looking for the separation
+ // between header and body.
+ for (byte [] lineBytes = null; true;) {
+ lineBytes = LaxHttpParser.readRawLine(getIn());
+ eolCharCount = getEolCharsCount(lineBytes);
+ if (eolCharCount <= 0) {
+ throw new IOException("Failed reading headers: " +
+ ((lineBytes != null)? new String(lineBytes): null));
+ }
+ // Save the bytes read.
+ baos.write(lineBytes);
+ if ((lineBytes.length - eolCharCount) <= 0) {
+ // We've finished reading the http header.
+ break;
+ }
+ }
+
+ byte [] headerBytes = baos.toByteArray();
+ // Save off where content body, post content headers, starts.
+ this.contentHeadersLength = headerBytes.length;
+ ByteArrayInputStream bais =
+ new ByteArrayInputStream(headerBytes);
+ if (!bais.markSupported()) {
+ throw new IOException("ByteArrayInputStream does not support mark");
+ }
+ bais.mark(headerBytes.length);
+ // Read the status line. Don't let it into the parseHeaders function.
+ // It doesn't know what to do with it.
+ bais.read(statusBytes, 0, statusBytes.length);
+ this.contentHeaders = LaxHttpParser.parseHeaders(bais,
+ ARCConstants.DEFAULT_ENCODING);
+ bais.reset();
+ return bais;
+ }
+
+ public static class UnexpectedStartLineIOException
+ extends RecoverableIOException {
+ private static final long serialVersionUID = 1L;
+
+ public UnexpectedStartLineIOException(final String reason) {
+ super(reason);
+ }
+ }
+
+ /**
+ * @param bytes Array of bytes to examine for an EOL.
+ * @return Count of end-of-line characters or zero if none.
+ */
+ private int getEolCharsCount(byte [] bytes) {
+ int count = 0;
+ if (bytes != null && bytes.length >=1 &&
+ bytes[bytes.length - 1] == '\n') {
+ count++;
+ if (bytes.length >=2 && bytes[bytes.length -2] == '\r') {
+ count++;
+ }
+ }
+ return count;
+ }
+
+ /**
+ * @return If headers are for a http response AND the headers have been
+ * read, return status code. Else return -1.
+ */
+ public int getStatusCode() {
+ return this.statusCode;
+ }
+
+ /**
+ * @return Returns length of content headers or -1 if headers have
+ * not yet been read.
+ */
+ public int getContentHeadersLength() {
+ return this.contentHeadersLength;
+ }
+
+ public Header[] getContentHeaders() {
+ return contentHeaders;
+ }
+
+ /**
+ * @return Next character in this ARCRecord's content else -1 if at end of
+ * this record.
+ * @throws IOException
+ */
+ public int read() throws IOException {
+ int c = -1;
+ if (this.contentHeaderStream != null &&
+ (this.contentHeaderStream.available() > 0)) {
+ // If http header, return bytes from it before we go to underlying
+ // stream.
+ c = this.contentHeaderStream.read();
+ // If done with the header stream, null it out.
+ if (this.contentHeaderStream.available() <= 0) {
+ this.contentHeaderStream = null;
+ }
+ // do not increment position -
+ // the underlying ArchiveRecord stream allready did this
+ // incrementPosition();
+ } else {
+ c = super.read();
+ }
+ return c;
+ }
+
+ public int read(byte [] b, int offset, int length) throws IOException {
+ int read = -1;
+ if (this.contentHeaderStream != null &&
+ (this.contentHeaderStream.available() > 0)) {
+ // If http header, return bytes from it before we go to underlying
+ // stream.
+ read = Math.min(length, this.contentHeaderStream.available());
+ if (read == 0) {
+ read = -1;
+ } else {
+ read = this.contentHeaderStream.read(b, offset, read);
+ }
+ // If done with the header stream, null it out.
+ if (this.contentHeaderStream.available() <= 0) {
+ this.contentHeaderStream = null;
+ }
+ // do not increment position -
+ // the underlying ArchiveRecord stream allready did this
+ //incrementPosition();
+ } else {
+ read = super.read(b, offset, length);
+ }
+ return read;
+ }
+
+ @Override
+ public int available() {
+ return ((ArchiveRecord)this.in).available();
+ }
+
+ @Override
+ public void close() throws IOException {
+ ((ArchiveRecord)this.in).close();
+ }
+
+ @Override
+ public void dump() throws IOException {
+ ((ArchiveRecord)this.in).dump();
+ }
+
+ @Override
+ public void dump(OutputStream os) throws IOException {
+ ((ArchiveRecord)this.in).dump(os);
+ }
+
+ @Override
+ protected String getDigest4Cdx(ArchiveRecordHeader h) {
+ return ((ArchiveRecord)this.in).getDigest4Cdx(h);
+ }
+
+ @Override
+ public String getDigestStr() {
+ return ((ArchiveRecord)this.in).getDigestStr();
+ }
+
+ @Override
+ public ArchiveRecordHeader getHeader() {
+ return ((ArchiveRecord)this.in).getHeader();
+ }
+
+ @Override
+ protected String getIp4Cdx(ArchiveRecordHeader h) {
+ return ((ArchiveRecord)this.in).getIp4Cdx(h);
+ }
+
+ @Override
+ protected String getMimetype4Cdx(ArchiveRecordHeader h) {
+ return ((ArchiveRecord)this.in).getMimetype4Cdx(h);
+ }
+
+ @Override
+ public long getPosition() {
+ return ((ArchiveRecord)this.in).getPosition();
+ }
+
+ @Override
+ protected String getStatusCode4Cdx(ArchiveRecordHeader h) {
+ return ((ArchiveRecord)this.in).getStatusCode4Cdx(h);
+ }
+
+ @Override
+ public boolean hasContentHeaders() {
+ return ((ArchiveRecord)this.in).hasContentHeaders();
+ }
+
+ @Override
+ protected void incrementPosition() {
+ ((ArchiveRecord)this.in).incrementPosition();
+ }
+
+ @Override
+ protected void incrementPosition(long incr) {
+ ((ArchiveRecord)this.in).incrementPosition(incr);
+ }
+
+ @Override
+ protected boolean isEor() {
+ return ((ArchiveRecord)this.in).isEor();
+ }
+
+ @Override
+ public boolean isStrict() {
+ return ((ArchiveRecord)this.in).isStrict();
+ }
+
+ @Override
+ public boolean markSupported() {
+ return ((ArchiveRecord)this.in).markSupported();
+ }
+
+ @Override
+ protected String outputCdx(String strippedFileName) throws IOException {
+ return ((ArchiveRecord)this.in).outputCdx(strippedFileName);
+ }
+
+ @Override
+ protected void setEor(boolean eor) {
+ ((ArchiveRecord)this.in).setEor(eor);
+ }
+
+ @Override
+ protected void setHeader(ArchiveRecordHeader header) {
+ ((ArchiveRecord)this.in).setHeader(header);
+ }
+
+ @Override
+ public void setStrict(boolean strict) {
+ ((ArchiveRecord)this.in).setStrict(strict);
+ }
+
+ @Override
+ protected void skip() throws IOException {
+ ((ArchiveRecord)this.in).skip();
+ }
+
+ @Override
+ public long skip(long n) throws IOException {
+ return ((ArchiveRecord)this.in).skip(n);
+ }
+}
diff --git a/src/main/java/org/archive/io/LoudObjectOutputStream.java b/src/main/java/org/archive/io/LoudObjectOutputStream.java
new file mode 100644
index 00000000..959c2620
--- /dev/null
+++ b/src/main/java/org/archive/io/LoudObjectOutputStream.java
@@ -0,0 +1,63 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.IOException;
+import java.io.ObjectOutputStream;
+import java.io.OutputStream;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.logging.Logger;
+
+/**
+ * ObjectOutputStream that logs class name of each object that is written
+ * to the stream. Useful for tracking down sources of NotSerializableException.
+ *
+ * @author pjack
+ *
+ */
+public class LoudObjectOutputStream extends ObjectOutputStream {
+    private static final Logger LOGGER = Logger.getLogger(
+            LoudObjectOutputStream.class.getName());
+
+    /** Class names already reported, so that each one is logged only once. */
+    private final Set<String> alreadyLogged = new HashSet<String>();
+
+    /**
+     * Wraps {@code out} and enables object replacement so that
+     * {@link #replaceObject(Object)} is invoked during serialization.
+     */
+    public LoudObjectOutputStream(OutputStream out) throws IOException {
+        super(out);
+        this.enableReplaceObject(true);
+    }
+
+    /** Logs the class name of {@code obj} once per distinct name; returns {@code obj} unchanged. */
+    @Override
+    protected Object replaceObject(Object obj) throws IOException {
+        if (obj != null) {
+            String name = obj.getClass().getName();
+            if (alreadyLogged.add(name)) {
+                LOGGER.info("WROTE: " + name);
+            }
+        }
+        return obj;
+    }
+}
diff --git a/src/main/java/org/archive/io/MiserOutputStream.java b/src/main/java/org/archive/io/MiserOutputStream.java
new file mode 100644
index 00000000..f10ac9ca
--- /dev/null
+++ b/src/main/java/org/archive/io/MiserOutputStream.java
@@ -0,0 +1,82 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.FilterOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+
+/**
+ * A filter stream that both counts bytes written, and optionally swallows
+ * flush() requests.
+ *
+ * @contributor gojomo
+ */
+public class MiserOutputStream extends FilterOutputStream {
+    /** Running total of bytes handed to the wrapped stream. */
+    protected long count;
+    /** When false, {@link #flush()} does nothing until {@link #close()} runs. */
+    protected boolean passFlushes;
+
+    /** Creates a counting wrapper around {@code out} that forwards every flush. */
+    public MiserOutputStream(OutputStream out) {
+        this(out, true);
+    }
+
+    /**
+     * Creates a counting wrapper with configurable flush handling.
+     *
+     * @param out the output stream to be wrapped
+     * @param passFlushes whether flush() calls reach the wrapped stream
+     */
+    public MiserOutputStream(OutputStream out, boolean passFlushes) {
+        super(out);
+        this.passFlushes = passFlushes;
+    }
+
+    /** @return the number of bytes written through this stream so far */
+    public long getCount() {
+        return this.count;
+    }
+
+    @Override public void write(byte[] b, int off, int len) throws IOException {
+        this.out.write(b, off, len);
+        this.count = this.count + len;
+    }
+
+    @Override public void write(int b) throws IOException {
+        this.out.write(b);
+        this.count = this.count + 1;
+    }
+
+    /** Re-enables flushing first, so that closing flushes the wrapped stream. */
+    @Override
+    public void close() throws IOException {
+        this.passFlushes = true;
+        super.close();
+    }
+
+    @Override
+    public void flush() throws IOException {
+        if (this.passFlushes) {
+            super.flush();
+        }
+    }
+}
diff --git a/src/main/java/org/archive/io/NoGzipMagicException.java b/src/main/java/org/archive/io/NoGzipMagicException.java
new file mode 100644
index 00000000..27d1058a
--- /dev/null
+++ b/src/main/java/org/archive/io/NoGzipMagicException.java
@@ -0,0 +1,26 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+/**
+ * @deprecated use {@link org.archive.util.zip.NoGzipMagicException}
+ */
+@Deprecated
+public class NoGzipMagicException extends org.archive.util.zip.NoGzipMagicException {
+} // empty body: declares no members beyond those inherited from the superclass
diff --git a/src/main/java/org/archive/io/ObjectPlusFilesInputStream.java b/src/main/java/org/archive/io/ObjectPlusFilesInputStream.java
new file mode 100644
index 00000000..892860ed
--- /dev/null
+++ b/src/main/java/org/archive/io/ObjectPlusFilesInputStream.java
@@ -0,0 +1,143 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.util.Iterator;
+import java.util.LinkedList;
+
+import org.archive.util.FileUtils;
+
+
+/**
+ * Enhanced ObjectInputStream with support for restoring
+ * files that had been saved, in parallel with object
+ * serialization.
+ *
+ * @author gojomo
+ *
+ */
+public class ObjectPlusFilesInputStream extends ObjectInputStream {
+    protected LinkedList<File> auxiliaryDirectoryStack = new LinkedList<File>();
+    protected LinkedList<Runnable> postRestoreTasks = new LinkedList<Runnable>();
+
+    /**
+     * Instantiate over the given stream and using the supplied
+     * auxiliary storage directory.
+     *
+     * @param in stream carrying the serialized objects
+     * @param storeDir initial (bottom-of-stack) auxiliary directory
+     * @throws IOException
+     */
+    public ObjectPlusFilesInputStream(InputStream in, File storeDir)
+    throws IOException {
+        super(in);
+        auxiliaryDirectoryStack.addFirst(storeDir);
+    }
+
+    /**
+     * Push another default storage directory for use
+     * until popped.
+     *
+     * @param dir name resolved against the current auxiliary directory
+     */
+    public void pushAuxiliaryDirectory(String dir) {
+        auxiliaryDirectoryStack.
+            addFirst(new File(getAuxiliaryDirectory(), dir));
+    }
+
+    /**
+     * Discard the top auxiliary directory.
+     */
+    public void popAuxiliaryDirectory() {
+        auxiliaryDirectoryStack.removeFirst();
+    }
+
+    /**
+     * Return the top auxiliary directory, from
+     * which saved files are restored.
+     *
+     * @return Auxiliary directory.
+     */
+    public File getAuxiliaryDirectory() {
+        return auxiliaryDirectoryStack.getFirst();
+    }
+
+    /**
+     * Restore a file from storage, using the name and length
+     * info on the serialization stream and the file from the
+     * current auxiliary directory, to the given File.
+     *
+     * @param destination file that receives the restored content
+     * @throws IOException
+     */
+    public void restoreFile(File destination) throws IOException {
+        String nameAsStored = readUTF();
+        long lengthAtStoreTime = readLong();
+        File storedFile = new File(getAuxiliaryDirectory(),nameAsStored);
+        FileUtils.copyFile(storedFile, destination, lengthAtStoreTime);
+    }
+
+    /**
+     * Restore a file from storage, using the name and length
+     * info on the serialization stream and the file from the
+     * current auxiliary directory, into the given directory
+     * under the name read from the stream.
+     * @param directory directory that receives the restored file
+     * @throws IOException
+     */
+    public void restoreFileTo(File directory) throws IOException {
+        String nameAsStored = readUTF();
+        long lengthAtStoreTime = readLong();
+        File storedFile = new File(getAuxiliaryDirectory(),nameAsStored);
+        File destination = new File(directory,nameAsStored);
+        FileUtils.copyFile(storedFile, destination, lengthAtStoreTime);
+    }
+
+    /**
+     * Register a task to be done when the ObjectPlusFilesInputStream
+     * is closed. Tasks run most-recently-registered first.
+     *
+     * @param task
+     */
+    public void registerFinishTask(Runnable task) {
+        postRestoreTasks.addFirst(task);
+    }
+
+    private void doFinishTasks() {
+        // postRestoreTasks is filled with addFirst, so this runs newest first
+        for (Runnable task : postRestoreTasks) {
+            task.run();
+        }
+    }
+
+    /**
+     * In addition to default, do any registered cleanup tasks.
+     * @see java.io.InputStream#close()
+     */
+    @Override
+    public void close() throws IOException {
+        super.close();
+        doFinishTasks();
+    }
+}
diff --git a/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java b/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java
new file mode 100644
index 00000000..224f24e7
--- /dev/null
+++ b/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java
@@ -0,0 +1,134 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.ObjectOutputStream;
+import java.io.OutputStream;
+import java.util.LinkedList;
+
+import org.archive.util.FileUtils;
+
+
+/**
+ * Enhanced ObjectOutputStream which maintains (a stack of) auxiliary
+ * directories and offers convenience methods for serialized objects
+ * to save their related disk files alongside their serialized version.
+ *
+ * @author gojomo
+ */
+public class ObjectPlusFilesOutputStream extends ObjectOutputStream {
+    protected LinkedList<File> auxiliaryDirectoryStack = new LinkedList<File>();
+
+    /**
+     * Constructor
+     *
+     * @param out stream receiving the serialized objects
+     * @param topDirectory initial (bottom-of-stack) auxiliary directory
+     * @throws java.io.IOException
+     */
+    public ObjectPlusFilesOutputStream(OutputStream out, File topDirectory) throws IOException {
+        super(out);
+        auxiliaryDirectoryStack.addFirst(topDirectory);
+    }
+
+    /**
+     * Add another subdirectory for any file-capture needs during the
+     * current serialization.
+     *
+     * @param dir name resolved against the current auxiliary directory
+     */
+    public void pushAuxiliaryDirectory(String dir) {
+        auxiliaryDirectoryStack.addFirst(new File(getAuxiliaryDirectory(),dir));
+    }
+
+    /** Remove the top subdirectory. */
+    public void popAuxiliaryDirectory() {
+        auxiliaryDirectoryStack.removeFirst();
+    }
+
+    /**
+     * Return the current auxiliary directory for storing
+     * files associated with serialized objects.
+     *
+     * @return Auxiliary directory.
+     */
+    public File getAuxiliaryDirectory() {
+        return auxiliaryDirectoryStack.getFirst();
+    }
+
+    /**
+     * Store a snapshot of an object's supporting file to the
+     * current auxiliary directory. Should only be used for
+     * files which are strictly appended-to, because it tries
+     * to use a "hard link" where possible (meaning that
+     * future edits to the original file's contents will
+     * also affect the snapshot).
+     *
+     * Remembers current file extent to allow a future restore
+     * to ignore subsequent appended data.
+     *
+     * @param file
+     * @throws IOException
+     */
+    public void snapshotAppendOnlyFile(File file) throws IOException {
+        // write filename, then current file length
+        String name = file.getName();
+        writeUTF(name);
+        writeLong(file.length());
+        File auxDir = getAuxiliaryDirectory();
+        if(!auxDir.exists()) {
+            FileUtils.ensureWriteableDirectory(auxDir);
+        }
+        File destination = new File(auxDir,name);
+        hardlinkOrCopy(file, destination);
+    }
+
+    /**
+     * Create a backup of this given file, first by trying a "hard
+     * link", then by using a copy if hard linking is unavailable
+     * (either because it is unsupported or the origin and checkpoint
+     * directories are on different volumes).
+     *
+     * @throws IOException if the copy fails or the wait for 'ln' is interrupted
+     */
+    private void hardlinkOrCopy(File file, File destination) throws IOException {
+        Process link;
+        try {
+            // Argument array: exec(String) splits on whitespace, breaking paths with spaces.
+            link = Runtime.getRuntime().exec(new String[] {
+                "ln", file.getAbsolutePath(), destination.getAbsolutePath()});
+        } catch (IOException e) {
+            // 'ln' could not be started (e.g. no such command): copy instead
+            FileUtils.copyFile(file, destination);
+            return;
+        }
+        // TODO NTFS also supports hard links; add appropriate try
+        try {
+            if (link.waitFor() != 0) {
+                FileUtils.copyFile(file, destination); // hard link failed
+            }
+        } catch (InterruptedException e) {
+            link.destroy();
+            Thread.currentThread().interrupt(); // keep interrupt status for callers
+            throw new java.io.InterruptedIOException("Interrupted linking " + file);
+        }
+    }
+}
diff --git a/src/main/java/org/archive/io/OriginSeekInputStream.java b/src/main/java/org/archive/io/OriginSeekInputStream.java
new file mode 100644
index 00000000..00605d82
--- /dev/null
+++ b/src/main/java/org/archive/io/OriginSeekInputStream.java
@@ -0,0 +1,121 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+import java.io.IOException;
+
+
+/**
+ * Alters the origin of some other SeekInputStream. This class allows you
+ * to completely ignore everything in the underlying stream before a specified
+ * position, the origin position.
+ *
+ * <p>With the exception of {@link #position()} and {@link #position(long)},
+ * all of the methods in this class simply delegate to the underlying input
+ * stream. The position methods adjust the position of the
+ * underlying stream relative to the origin specified at construction time.
+ *
+ * @author pjack
+ */
+public class OriginSeekInputStream extends SeekInputStream {
+
+    /**
+     * Stream that actually supplies the bytes.
+     */
+    private final SeekInputStream source;
+
+    /**
+     * Position in {@link #source} that this stream reports as 0; that is,
+     * this.position(0) resolves to source.position(base).
+     */
+    private final long base;
+
+
+    /**
+     * Wraps the given stream and moves it to the origin straight away.
+     *
+     * @param input the underlying stream
+     * @param origin the origin position
+     * @throws IOException if an IO error occurs
+     */
+    public OriginSeekInputStream(SeekInputStream input, long origin)
+            throws IOException {
+        this.source = input;
+        this.base = origin;
+        this.source.position(this.base);
+    }
+
+
+    /** Passes through the underlying stream's answer unchanged. */
+    @Override
+    public int available() throws IOException {
+        return this.source.available();
+    }
+
+    /** Reads through the underlying stream's read(). */
+    @Override
+    public int read() throws IOException {
+        return this.source.read();
+    }
+
+    /** Reads through the underlying stream's read(byte[], int, int). */
+    @Override
+    public int read(byte[] buf, int ofs, int len) throws IOException {
+        return this.source.read(buf, ofs, len);
+    }
+
+    /** Reads through the underlying stream's read(byte[]). */
+    @Override
+    public int read(byte[] buf) throws IOException {
+        return this.source.read(buf);
+    }
+
+    /** Skips via the underlying stream and returns whatever it reports. */
+    @Override
+    public long skip(long count) throws IOException {
+        return this.source.skip(count);
+    }
+
+
+    /**
+     * Reports how far the underlying stream is past the origin.
+     *
+     * @return the relative position
+     * @throws IOException if an IO error occurs
+     */
+    public long position() throws IOException {
+        final long absolute = this.source.position();
+        return absolute - this.base;
+    }
+
+    /**
+     * Moves the underlying stream to the requested position, counted
+     * from the origin rather than from the start of that stream.
+     * Put differently, position(0) here is position(origin) on the
+     * wrapped stream, origin being the value given to the constructor.
+     *
+     * @param p the new position for this stream
+     * @throws IOException if an IO error occurs
+     */
+    public void position(long p) throws IOException {
+        final long absolute = p + this.base;
+        this.source.position(absolute);
+    }
+}
diff --git a/src/main/java/org/archive/io/Preformatter.java b/src/main/java/org/archive/io/Preformatter.java
new file mode 100644
index 00000000..dcd31bb6
--- /dev/null
+++ b/src/main/java/org/archive/io/Preformatter.java
@@ -0,0 +1,32 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.util.logging.LogRecord;
+
+/**
+ * Interface indicating a logging Formatter can preformat a record (outside
+ * the standard-implementation synchronized block) and cache it, returning it
+ * for the next request for formatting from the same thread.
+ * @contributor gojomo
+ */
+public interface Preformatter {
+    void preformat(LogRecord record);
+    void clear();
+}
diff --git a/src/main/java/org/archive/io/RandomAccessInputStream.java b/src/main/java/org/archive/io/RandomAccessInputStream.java
new file mode 100644
index 00000000..d8dd260b
--- /dev/null
+++ b/src/main/java/org/archive/io/RandomAccessInputStream.java
@@ -0,0 +1,180 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+
+
+/**
+ * Wraps a RandomAccessFile with an InputStream interface.
+ *
+ * @author gojomo
+ */
+public class RandomAccessInputStream extends SeekInputStream {
+
+    /**
+     * Reference to the random access file this stream is reading from.
+     */
+    private RandomAccessFile raf = null;
+
+    /**
+     * When mark is called, save here the current position so we can go back
+     * on reset. -1 means no usable mark is set.
+     */
+    private long markpos = -1;
+
+    /**
+     * True if we are to close the underlying random access file when this
+     * stream is closed.
+     */
+    private boolean sympathyClose;
+
+    /**
+     * Constructor for a caller-owned RAF: the caller created it and is
+     * assumed to want control over closing it, so RAF.close is NOT
+     * called when this stream is closed.
+     *
+     * @param raf RandomAccessFile to wrap.
+     * @throws IOException
+     */
+    public RandomAccessInputStream(RandomAccessFile raf)
+    throws IOException {
+        this(raf, false, 0);
+    }
+
+    /**
+     * Constructor.
+     *
+     * @param file File to get RAFIS on. Creates an RAF from passed file.
+     * Closes the created RAF when this stream is closed.
+     * @throws IOException
+     */
+    public RandomAccessInputStream(final File file)
+    throws IOException {
+        this(new RandomAccessFile(file, "r"), true, 0);
+    }
+
+    /**
+     * Constructor.
+     *
+     * @param file File to get RAFIS on. Creates an RAF from passed file.
+     * Closes the created RAF when this stream is closed.
+     * @param offset Initial file position; only applied when positive.
+     * @throws IOException
+     */
+    public RandomAccessInputStream(final File file, final long offset)
+    throws IOException {
+        this(new RandomAccessFile(file, "r"), true, offset);
+    }
+
+    /**
+     * @param raf RandomAccessFile to wrap.
+     * @param sympathyClose Set to true if we are to close the RAF
+     * file when this stream is closed.
+     * @param offset Initial file position; only applied when positive.
+     * @throws IOException
+     */
+    public RandomAccessInputStream(final RandomAccessFile raf,
+            final boolean sympathyClose, final long offset)
+    throws IOException {
+        super();
+        this.sympathyClose = sympathyClose;
+        this.raf = raf;
+        if (offset > 0) {
+            this.raf.seek(offset);
+        }
+    }
+
+    /* (non-Javadoc)
+     * @see java.io.InputStream#read()
+     */
+    public int read() throws IOException {
+        return this.raf.read();
+    }
+
+    /* (non-Javadoc)
+     * @see java.io.InputStream#read(byte[], int, int)
+     */
+    public int read(byte[] b, int off, int len) throws IOException {
+        return this.raf.read(b, off, len);
+    }
+
+    /* (non-Javadoc)
+     * @see java.io.InputStream#read(byte[])
+     */
+    public int read(byte[] b) throws IOException {
+        return this.raf.read(b);
+    }
+
+    /**
+     * Moves the file pointer by n (no clamping to the file length) and returns n.
+     * NOTE(review): unlike InputStream#skip, the result is not clamped at EOF.
+     */
+    public long skip(long n) throws IOException {
+        this.raf.seek(this.raf.getFilePointer() + n);
+        return n;
+    }
+
+    public long position() throws IOException {
+        return this.raf.getFilePointer();
+    }
+
+    public void position(long position) throws IOException {
+        this.raf.seek(position);
+    }
+
+    public int available() throws IOException {
+        // Clamp to [0, Integer.MAX_VALUE]: the pointer may sit past end-of-file.
+        long amount = Math.max(0L, this.raf.length() - this.position());
+        return (int) Math.min(amount, Integer.MAX_VALUE);
+    }
+
+    public boolean markSupported() {
+        return true;
+    }
+
+    public synchronized void mark(int readlimit) {
+        try {
+            this.markpos = position();
+        } catch (IOException e) {
+            // Set markpos to -1. Will cause reset() to throw.
+            this.markpos = -1;
+        }
+    }
+
+    public synchronized void reset() throws IOException {
+        if (this.markpos == -1) {
+            throw new IOException("Mark has not been set.");
+        }
+        position(this.markpos);
+    }
+
+    public void close() throws IOException {
+        try {
+            super.close();
+        } finally {
+            if (this.sympathyClose) {
+                this.raf.close();
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/RandomAccessOutputStream.java b/src/main/java/org/archive/io/RandomAccessOutputStream.java
new file mode 100644
index 00000000..225f995f
--- /dev/null
+++ b/src/main/java/org/archive/io/RandomAccessOutputStream.java
@@ -0,0 +1,69 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.RandomAccessFile;
+
+
+/**
+ * Wraps a RandomAccessFile with OutputStream interface.
+ *
+ * @author gojomo
+ */
+public class RandomAccessOutputStream extends OutputStream {
+
+    protected RandomAccessFile raf;
+
+    /**
+     * Creates a stream whose writes go to the given RandomAccessFile.
+     *
+     * @param raf file that receives every write
+     */
+    public RandomAccessOutputStream(RandomAccessFile raf) {
+        this.raf = raf;
+    }
+
+    /** Writes one byte through the wrapped RandomAccessFile. */
+    @Override
+    public void write(int b) throws IOException {
+        this.raf.write(b);
+    }
+
+    /** Writes the whole array through the wrapped RandomAccessFile. */
+    @Override
+    public void write(byte[] b) throws IOException {
+        this.raf.write(b);
+    }
+
+    /** Writes len bytes of b, starting at off, through the wrapped file. */
+    @Override
+    public void write(byte[] b, int off, int len) throws IOException {
+        this.raf.write(b, off, len);
+    }
+
+    /**
+     * Closes the wrapped RandomAccessFile.
+     */
+    @Override
+    public void close() throws IOException {
+        this.raf.close();
+    }
+}
diff --git a/src/main/java/org/archive/io/ReadSource.java b/src/main/java/org/archive/io/ReadSource.java
new file mode 100644
index 00000000..a3c29967
--- /dev/null
+++ b/src/main/java/org/archive/io/ReadSource.java
@@ -0,0 +1,37 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.Reader;
+
+/**
+ * Interface for objects that can provide a Reader view of their
+ * contents.
+ *
+ */
+public interface ReadSource {
+ /**
+ * Obtain a Reader over this object's contents. Deliberately not named
+ * 'getReader', so that bean-convention introspection tools do not
+ * treat it as a simple costless read-only property.
+ * @return a Reader on this object
+ */
+ Reader obtainReader();
+}
diff --git a/src/main/java/org/archive/io/RecorderIOException.java b/src/main/java/org/archive/io/RecorderIOException.java
new file mode 100644
index 00000000..07b30061
--- /dev/null
+++ b/src/main/java/org/archive/io/RecorderIOException.java
@@ -0,0 +1,38 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.IOException;
+
+/**
+ *
+ * @author Gordon Mohr
+ */
+public class RecorderIOException extends IOException {
+    private static final long serialVersionUID = 5907470275350314277L;
+
+    /** Creates an exception with no detail message. */
+    public RecorderIOException() {
+    }
+
+    /** Creates an exception carrying {@code msg} as its detail message. */
+    public RecorderIOException(String msg) {
+        super(msg);
+    }
+}
diff --git a/src/main/java/org/archive/io/RecorderLengthExceededException.java b/src/main/java/org/archive/io/RecorderLengthExceededException.java
new file mode 100644
index 00000000..8c3e067d
--- /dev/null
+++ b/src/main/java/org/archive/io/RecorderLengthExceededException.java
@@ -0,0 +1,39 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+/**
+ * Indicates a length exception thrown by the Recorder.
+ *
+ * @author Gordon Mohr
+ */
+public class RecorderLengthExceededException extends RecorderIOException {
+
+    private static final long serialVersionUID = 6655419033414648444L;
+
+    /** Creates an exception with no detail message. */
+    public RecorderLengthExceededException() {
+    }
+
+    /** Creates an exception carrying {@code msg} as its detail message. */
+    public RecorderLengthExceededException(String msg) {
+        super(msg);
+    }
+}
diff --git a/src/main/java/org/archive/io/RecorderTimeoutException.java b/src/main/java/org/archive/io/RecorderTimeoutException.java
new file mode 100644
index 00000000..32be5b5d
--- /dev/null
+++ b/src/main/java/org/archive/io/RecorderTimeoutException.java
@@ -0,0 +1,37 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+/**
+ * Indicates a timeout thrown by the RecordingInputStream.
+ *
+ * @author Gordon Mohr
+ */
+public class RecorderTimeoutException extends RecorderIOException {
+    private static final long serialVersionUID = 7433214063765078269L;
+
+    /** Creates an exception with no detail message. */
+    public RecorderTimeoutException() {
+    }
+
+    /** Creates an exception carrying {@code msg} as its detail message. */
+    public RecorderTimeoutException(String msg) {
+        super(msg);
+    }
+}
diff --git a/src/main/java/org/archive/io/RecorderTooMuchHeaderException.java b/src/main/java/org/archive/io/RecorderTooMuchHeaderException.java
new file mode 100644
index 00000000..23f5d264
--- /dev/null
+++ b/src/main/java/org/archive/io/RecorderTooMuchHeaderException.java
@@ -0,0 +1,40 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+/**
+ * Indicates that too much header material was seen by the Recorder
+ * (specifically the RecordingOutputStream).
+ *
+ * @author Gordon Mohr
+ */
+public class RecorderTooMuchHeaderException extends RecorderIOException {
+    private static final long serialVersionUID = 3528516034898129150L;
+
+    /** Creates the exception without a detail message. */
+    public RecorderTooMuchHeaderException() {
+        super();
+    }
+
+    /** @param msg detail message handed to the parent exception */
+    public RecorderTooMuchHeaderException(String msg) {
+        super(msg);
+    }
+}
diff --git a/src/main/java/org/archive/io/RecordingInputStream.java b/src/main/java/org/archive/io/RecordingInputStream.java
new file mode 100644
index 00000000..b46905ed
--- /dev/null
+++ b/src/main/java/org/archive/io/RecordingInputStream.java
@@ -0,0 +1,355 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.SocketException;
+import java.net.SocketTimeoutException;
+import java.security.MessageDigest;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.commons.io.IOUtils;
+
+
+/**
+ * Stream which records all data read from it, which it acquires from a wrapped
+ * input stream.
+ *
+ * Makes use of a RecordingOutputStream for recording because of its being
+ * file backed so we can write massive amounts of data w/o worrying about
+ * overflowing memory.
+ *
+ * @author gojomo
+ *
+ */
+public class RecordingInputStream extends InputStream {
+
+    protected static Logger logger =
+        Logger.getLogger("org.archive.io.RecordingInputStream");
+
+    /**
+     * Where we are recording to.
+     */
+    private RecordingOutputStream recordingOutputStream;
+
+    /**
+     * Stream to record.
+     */
+    private InputStream in = null;
+
+    /**
+     * Reusable buffer to avoid reallocation on each readFully/readFullyOrUntil
+     */
+    protected byte[] drainBuffer = new byte[16*1024];
+
+    /**
+     * Create a new RecordingInputStream.
+     *
+     * @param bufferSize Size of buffer to use.
+     * @param backingFilename Name of backing file.
+     */
+    public RecordingInputStream(int bufferSize, String backingFilename) {
+        this.recordingOutputStream = new RecordingOutputStream(bufferSize,
+                backingFilename);
+    }
+
+    /** Starts recording reads from the given stream; fails if already open. */
+    public void open(InputStream wrappedStream) throws IOException {
+        if (logger.isLoggable(Level.FINE)) {
+            logger.fine(Thread.currentThread().getName() + " opening " +
+                    wrappedStream + ", " + Thread.currentThread().getName());
+        }
+        if(isOpen()) {
+            // error: must not open/wrap again while a prior stream remains open
+            throw new IOException("RIS already open for "
+                    +Thread.currentThread().getName());
+        }
+        try {
+            this.in = wrappedStream;
+            this.recordingOutputStream.open();
+        } catch (IOException ioe) {
+            close(); // ...and rethrow...
+            throw ioe;
+        }
+    }
+
+    public int read() throws IOException {
+        if (!isOpen()) {
+            throw new IOException("Stream closed " +
+                    Thread.currentThread().getName());
+        }
+        int b = this.in.read();
+        if (b != -1) {
+            assert this.recordingOutputStream != null: "ROS is null " +
+                    Thread.currentThread().getName();
+            this.recordingOutputStream.write(b);
+        }
+        return b;
+    }
+
+    public int read(byte[] b, int off, int len) throws IOException {
+        if (!isOpen()) {
+            throw new IOException("Stream closed " +
+                    Thread.currentThread().getName());
+        }
+        int count = this.in.read(b,off,len);
+        if (count > 0) {
+            assert this.recordingOutputStream != null: "ROS is null " +
+                    Thread.currentThread().getName();
+            this.recordingOutputStream.write(b,off,count);
+        }
+        return count;
+    }
+
+    public int read(byte[] b) throws IOException {
+        if (!isOpen()) {
+            throw new IOException("Stream closed " +
+                    Thread.currentThread().getName());
+        }
+        int count = this.in.read(b);
+        if (count > 0) {
+            assert this.recordingOutputStream != null: "ROS is null " +
+                    Thread.currentThread().getName();
+            this.recordingOutputStream.write(b,0,count);
+        }
+        return count;
+    }
+
+    /** Closes the wrapped stream and the recorder via IOUtils.closeQuietly. */
+    public void close() throws IOException {
+        if (logger.isLoggable(Level.FINE)) {
+            logger.fine(Thread.currentThread().getName() + " closing " +
+                    this.in + ", " + Thread.currentThread().getName());
+        }
+        IOUtils.closeQuietly(this.in);
+        this.in = null;
+        IOUtils.closeQuietly(this.recordingOutputStream);
+    }
+
+    public ReplayInputStream getReplayInputStream() throws IOException {
+        return this.recordingOutputStream.getReplayInputStream();
+    }
+
+    public ReplayInputStream getMessageBodyReplayInputStream() throws IOException {
+        return this.recordingOutputStream.getMessageBodyReplayInputStream();
+    }
+
+    /** Reads to end of stream, recording as it goes; returns the recorder's size. */
+    public long readFully() throws IOException {
+        while(read(drainBuffer) != -1) {
+            // Empty out stream.
+            continue;
+        }
+        return this.recordingOutputStream.getSize();
+    }
+
+    /**
+     * Read all of a stream (or read until a limit is reached).
+     *
+     * Hard length, overall timeout and rate limits are not arguments here;
+     * they are the ones configured through {@link #setLimits(long, long, long)}
+     * and enforced by the recording stream.
+     *
+     * @param softMaxLength Maximum length to read; if zero or < 0, then no
+     * limit. If met, return normally.
+     * @throws IOException failed read.
+     * @throws RecorderLengthExceededException
+     * @throws RecorderTimeoutException
+     * @throws InterruptedException if the thread is interrupted while reading
+     */
+    public void readFullyOrUntil(long softMaxLength)
+        throws IOException, RecorderLengthExceededException,
+            RecorderTimeoutException, InterruptedException {
+        // Check we're open before proceeding.
+        if (!isOpen()) {
+            // TODO: should this be a noisier exception-raising error?
+            return;
+        }
+
+        long totalBytes = 0L;
+        long bytesRead = -1L;
+        long maxToRead = -1;
+        while (true) {
+            try {
+                // read no more than soft max
+                maxToRead = (softMaxLength <= 0)
+                    ? drainBuffer.length
+                    : Math.min(drainBuffer.length, softMaxLength - totalBytes);
+                // nor more than hard max
+                maxToRead = Math.min(maxToRead, recordingOutputStream.getRemainingLength());
+                // but always at least 1 (to trigger hard max exception)
+                maxToRead = Math.max(maxToRead, 1);
+
+                bytesRead = read(drainBuffer,0,(int)maxToRead);
+                if (bytesRead == -1) {
+                    break;
+                }
+                totalBytes += bytesRead;
+
+                if (Thread.interrupted()) {
+                    throw new InterruptedException("Interrupted during IO");
+                }
+            } catch (SocketTimeoutException e) {
+                // A socket timeout is just a transient problem: nothing was
+                // available within the configured timeout period, but more may
+                // still arrive later (e.g. servers that keep the connection
+                // alive even though we asked them not to). Take this
+                // opportunity to check the overall timeout (below).
+                if (logger.isLoggable(Level.FINE)) {
+                    logger.log(Level.FINE, "socket timeout", e);
+                }
+                // check for interrupt
+                if (Thread.interrupted()) {
+                    throw new InterruptedException("Interrupted during IO");
+                }
+                // check for overall timeout
+                recordingOutputStream.checkLimits();
+            } catch (SocketException se) {
+                throw se;
+            } catch (NullPointerException e) {
+                // [ 896757 ] NPEs in Andy's Th-Fri Crawl: could not be reproduced,
+                // so rethrow with stream/thread diagnostics; keep the original
+                // exception attached as the cause so its stack trace survives.
+                NullPointerException diagnostic = new NullPointerException("Stream "
+                    + this.in + ", " + e.getMessage() + " " + Thread.currentThread().getName());
+                diagnostic.initCause(e);
+                throw diagnostic;
+            }
+
+            // if have read 'enough', just finish
+            if (softMaxLength > 0 && totalBytes >= softMaxLength) {
+                break; // return
+            }
+        }
+    }
+
+    /** @return the size reported by the recording stream */
+    public long getSize() {
+        return this.recordingOutputStream.getSize();
+    }
+
+    /** Marks the current recording position as the start of the message body. */
+    public void markContentBegin() {
+        this.recordingOutputStream.markMessageBodyBegin();
+    }
+
+    public long getContentBegin() {
+        return this.recordingOutputStream.getMessageBodyBegin();
+    }
+
+    public void startDigest() {
+        this.recordingOutputStream.startDigest();
+    }
+
+    /**
+     * Convenience method for setting SHA1 digest.
+     */
+    public void setSha1Digest() {
+        this.recordingOutputStream.setSha1Digest();
+    }
+
+    /**
+     * Sets a digest algorithm which may be applied to recorded data.
+     * As usually only a subset of the recorded data should
+     * be fed to the digest, you must also call startDigest()
+     * to begin digesting.
+     *
+     * @param algorithm name of the message digest algorithm
+     */
+    public void setDigest(String algorithm) {
+        this.recordingOutputStream.setDigest(algorithm);
+    }
+
+    /**
+     * Sets a digest function which may be applied to recorded data.
+     * As usually only a subset of the recorded data should
+     * be fed to the digest, you must also call startDigest()
+     * to begin digesting.
+     *
+     * @param md message digest instance to use
+     */
+    public void setDigest(MessageDigest md) {
+        this.recordingOutputStream.setDigest(md);
+    }
+
+    /**
+     * Return the digest value for any recorded, digested data. Call
+     * only after all data has been recorded; otherwise, the running
+     * digest state is ruined.
+     *
+     * @return the digest final value
+     */
+    public byte[] getDigestValue() {
+        return this.recordingOutputStream.getDigestValue();
+    }
+
+    public long getResponseContentLength() {
+        return this.recordingOutputStream.getResponseContentLength();
+    }
+
+    public void closeRecorder() throws IOException {
+        this.recordingOutputStream.closeRecorder();
+    }
+
+    /**
+     * @return True if we've been opened.
+     */
+    public boolean isOpen() {
+        return this.in != null;
+    }
+
+    @Override
+    public synchronized void mark(int readlimit) {
+        this.in.mark(readlimit);
+        this.recordingOutputStream.mark();
+    }
+
+    @Override
+    public boolean markSupported() {
+        return this.in.markSupported();
+    }
+
+    @Override
+    public synchronized void reset() throws IOException {
+        this.in.reset();
+        this.recordingOutputStream.reset();
+    }
+
+    /**
+     * Set limits to be enforced by internal recording-out
+     */
+    public void setLimits(long hardMax, long timeoutMs, long maxRateKBps) {
+        recordingOutputStream.setLimits(hardMax, timeoutMs, maxRateKBps);
+    }
+
+    /**
+     * Expose the amount of in-memory buffering used by the internal
+     * recording stream.
+     * @return int buffer size
+     */
+    public int getRecordedBufferLength() {
+        return recordingOutputStream.getBufferLength();
+    }
+
+    public void clearForReuse() throws IOException {
+        recordingOutputStream.clearForReuse();
+    }
+}
diff --git a/src/main/java/org/archive/io/RecordingOutputStream.java b/src/main/java/org/archive/io/RecordingOutputStream.java
new file mode 100644
index 00000000..4d0713da
--- /dev/null
+++ b/src/main/java/org/archive/io/RecordingOutputStream.java
@@ -0,0 +1,576 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+
+/**
+ * An output stream that records all writes to wrapped output
+ * stream.
+ *
+ * A RecordingOutputStream can be wrapped around any other
+ * OutputStream to record all bytes written to it. You can
+ * then request a ReplayInputStream to read those bytes.
+ *
+ * <p>The RecordingOutputStream uses an in-memory buffer and
+ * backing disk file to allow it to record streams of
+ * arbitrary length limited only by available disk space.
+ *
+ * <p>As long as the stream recorded is smaller than the
+ * in-memory buffer, no disk access will occur.
+ *
+ * <p>Recorded content can be recovered as a ReplayInputStream
+ * (via getReplayInputStream() or, for only the content after
+ * the content-begin-mark is set, getMessageBodyReplayInputStream() )
+ * or as a ReplayCharSequence (via getReplayCharSequence()).
+ *
+ * <p>This class is also used as a straight output stream
+ * by {@link RecordingInputStream} to which it records all reads.
+ * {@link RecordingInputStream} is exploiting the file backed buffer
+ * facility of this class passing <code>null</code> for the stream
+ * to wrap. TODO: Make a FileBackedOutputStream class that is
+ * subclassed by RecordingInputStream.
+ *
+ * @author gojomo
+ *
+ */
+public class RecordingOutputStream extends OutputStream {
+    protected static Logger logger =
+        Logger.getLogger(RecordingOutputStream.class.getName());
+
+    /**
+     * Size of recording.
+     *
+     * Later passed to ReplayInputStream on creation. It uses it to know when
+     * EOS.
+     */
+    protected long size = 0;
+
+    protected String backingFilename;
+    protected OutputStream diskStream = null;
+
+    /**
+     * Buffer we write recordings to.
+     *
+     * We write all recordings here first till it's full. Thereafter we
+     * write the backing file.
+     */
+    private byte[] buffer;
+
+    /** current virtual position in the recording */
+    private long position;
+
+    /** flag to disable recording */
+    private boolean recording;
+
+    /**
+     * Reusable buffer for FastBufferedOutputStream
+     */
+    protected byte[] bufStreamBuf =
+        new byte [ FastBufferedOutputStream.DEFAULT_BUFFER_SIZE ];
+
+    /**
+     * True if we're to digest content.
+     */
+    private boolean shouldDigest = false;
+
+    /**
+     * Digest instance.
+     */
+    private MessageDigest digest = null;
+
+    /**
+     * Define for SHA1 algorithm.
+     */
+    private static final String SHA1 = "SHA1";
+
+    /**
+     * Maximum amount of header material to accept without the content
+     * body beginning -- if more, throw a RecorderTooMuchHeaderException.
+     * TODO: make configurable? make smaller?
+     */
+    protected static final long MAX_HEADER_MATERIAL = 1024*1024; // 1MB
+
+    // configurable max length, max time limits
+    /** maximum length of material to record before throwing exception */
+    protected long maxLength = Long.MAX_VALUE;
+    /** maximum time to record before throwing exception */
+    protected long timeoutMs = Long.MAX_VALUE;
+    /** maximum rate to record (adds delays to hit target rate) */
+    protected long maxRateBytesPerMs = Long.MAX_VALUE;
+    /** time recording begins for timeout, rate calculations */
+    protected long startTime = Long.MAX_VALUE;
+
+    /**
+     * When recording HTTP, where the content-body starts.
+     */
+    protected long messageBodyBeginMark;
+
+    /**
+     * Stream to record.
+     */
+    private OutputStream out = null;
+
+    // mark/reset support
+    /** furthest position reached before any reset()s */
+    private long maxPosition = 0;
+    /** remembered position to reset() to */
+    private long markPosition = 0;
+
+    /**
+     * Create a new RecordingOutputStream.
+     *
+     * @param bufferSize Buffer size to use.
+     * @param backingFilename Name of backing file to use.
+     */
+    public RecordingOutputStream(int bufferSize, String backingFilename) {
+        this.buffer = new byte[bufferSize];
+        this.backingFilename = backingFilename;
+        recording = true;
+    }
+
+    /**
+     * Open for recording only, with no wrapped stream to pass data along
+     * to; equivalent to {@code open(null)}.
+     *
+     * @throws IOException If failed creation of backing file.
+     */
+    public void open() throws IOException {
+        this.open(null);
+    }
+
+    /**
+     * Wrap the given stream, both recording and passing along any data written
+     * to this RecordingOutputStream.
+     *
+     * @param wrappedStream Stream to wrap. May be null for case where we
+     * want to write to a file backed stream only.
+     *
+     * @throws IOException If failed creation of backing file.
+     */
+    public void open(OutputStream wrappedStream) throws IOException {
+        if(isOpen()) {
+            // error: must not open/wrap again while a prior stream remains
+            // open
+            throw new IOException("ROS already open for "
+                    +Thread.currentThread().getName());
+        }
+        clearForReuse();
+        this.out = wrappedStream;
+        if (this.diskStream == null) {
+            // TODO: Fix so we only make file when its actually needed.
+            FileOutputStream fos = new FileOutputStream(this.backingFilename);
+
+            this.diskStream = new RecyclingFastBufferedOutputStream(fos, bufStreamBuf);
+        }
+        startTime = System.currentTimeMillis();
+    }
+
+    /** Records (and passes along) one byte, unless replaying after a reset(). */
+    public void write(int b) throws IOException {
+        if(position<maxPosition) {
+            // revisiting previous content; do nothing but advance position
+            position++;
+            return;
+        }
+        if(recording) {
+            record(b);
+        }
+        if (this.out != null) {
+            this.out.write(b);
+        }
+        checkLimits();
+    }
+
+    /** Array form of write(int); skips whatever part was seen before a reset(). */
+    public void write(byte[] b, int off, int len) throws IOException {
+        if(position < maxPosition) {
+            if(position+len<=maxPosition) {
+                // revisiting; do nothing but advance position
+                position += len;
+                return;
+            }
+            // consume part of the array doing nothing but advancing position
+            long consumeRange = maxPosition - position;
+            position += consumeRange;
+            off += consumeRange;
+            len -= consumeRange;
+        }
+        if(recording) {
+            record(b, off, len);
+        }
+        if (this.out != null) {
+            this.out.write(b, off, len);
+        }
+        checkLimits();
+    }
+
+    /**
+     * Check any enforced limits.
+     */
+    protected void checkLimits() throws RecorderIOException {
+        // too much material before finding end of headers?
+        if (messageBodyBeginMark<0) {
+            // no mark yet
+            if(position>MAX_HEADER_MATERIAL) {
+                throw new RecorderTooMuchHeaderException();
+            }
+        }
+        // overlong?
+        if(position>maxLength) {
+            throw new RecorderLengthExceededException();
+        }
+        // taking too long?
+        long duration = System.currentTimeMillis() - startTime;
+        duration = Math.max(duration,1); // !divzero
+        if(duration>timeoutMs) {
+            throw new RecorderTimeoutException();
+        }
+        // need to throttle reading to hit max configured rate?
+        if(position/duration > maxRateBytesPerMs) {
+            long desiredDuration = position / maxRateBytesPerMs;
+            try {
+                Thread.sleep(desiredDuration-duration);
+            } catch (InterruptedException e) {
+                logger.log(Level.WARNING, "bandwidth throttling sleep interrupted", e);
+                Thread.currentThread().interrupt(); // keep the interrupt visible to callers
+            }
+        }
+    }
+
+    /**
+     * Record the given byte for later recovery
+     *
+     * @param b Int to record.
+     *
+     * @exception IOException Failed write to backing file.
+     */
+    private void record(int b) throws IOException {
+        if (this.shouldDigest) {
+            this.digest.update((byte)b);
+        }
+        if (this.position >= this.buffer.length) {
+            if (this.diskStream == null) { // write before open(); same guard as tailRecord
+                throw new IOException("diskstream is null");
+            }
+            this.diskStream.write(b);
+        } else {
+            this.buffer[(int) this.position] = (byte) b;
+        }
+        this.position++;
+    }
+
+    /**
+     * Record the given byte-array range for recovery later
+     *
+     * @param b Buffer to record.
+     * @param off Offset into buffer at which to start recording.
+     * @param len Length of buffer to record.
+     *
+     * @exception IOException Failed write to backing file.
+     */
+    private void record(byte[] b, int off, int len) throws IOException {
+        if(this.shouldDigest) {
+            assert this.digest != null: "Digest is null.";
+            this.digest.update(b, off, len);
+        }
+        tailRecord(b, off, len);
+    }
+
+    /**
+     * Record without digesting.
+     *
+     * @param b Buffer to record.
+     * @param off Offset into buffer at which to start recording.
+     * @param len Length of buffer to record.
+     *
+     * @exception IOException Failed write to backing file.
+     */
+    private void tailRecord(byte[] b, int off, int len) throws IOException {
+        if(this.position >= this.buffer.length){
+            // TODO: Its possible to call write w/o having first opened a
+            // stream. Lets protect ourselves against this.
+            if (this.diskStream == null) {
+                throw new IOException("diskstream is null");
+            }
+            this.diskStream.write(b, off, len);
+            this.position += len;
+        } else {
+            assert this.buffer != null: "Buffer is null";
+            int toCopy = (int)Math.min(this.buffer.length - this.position, len);
+            assert b != null: "Passed buffer is null";
+            System.arraycopy(b, off, this.buffer, (int)this.position, toCopy);
+            this.position += toCopy;
+            // TODO verify these are +1 -1 right
+            if (toCopy < len) {
+                tailRecord(b, off + toCopy, len - toCopy);
+            }
+        }
+    }
+
+    /** Closes the wrapped stream (if any), then the recorder. */
+    public void close() throws IOException {
+        if(messageBodyBeginMark<0) {
+            // if unset, consider 0 posn as content-start
+            // (so that a -1 never survives to replay step)
+            messageBodyBeginMark = 0;
+        }
+        if (this.out != null) {
+            this.out.close();
+            this.out = null;
+        }
+        closeRecorder();
+    }
+
+    protected synchronized void closeDiskStream() throws IOException {
+        if (this.diskStream != null) {
+            this.diskStream.close();
+            this.diskStream = null;
+        }
+    }
+
+    public void closeRecorder() throws IOException {
+        recording = false;
+        closeDiskStream(); // if any
+        // This setting of size is important. Its passed to ReplayInputStream
+        // on creation. It uses it to know EOS.
+        if (this.size == 0) {
+            this.size = this.position;
+        }
+    }
+
+    /** Flushes the wrapped stream (if any) and the disk stream (if any). */
+    public void flush() throws IOException {
+        if (this.out != null) {
+            this.out.flush();
+        }
+        if (this.diskStream != null) {
+            this.diskStream.flush();
+        }
+    }
+
+    public ReplayInputStream getReplayInputStream() throws IOException {
+        return getReplayInputStream(0);
+    }
+
+    public ReplayInputStream getReplayInputStream(long skip) throws IOException {
+        // If this method is being called, then assumption must be that the
+        // stream is closed. If it ain't, then the stream gotten won't work
+        // -- the size will be zero so any attempt at a read will get back EOF.
+        assert this.out == null: "Stream is still open.";
+        ReplayInputStream replay = new ReplayInputStream(this.buffer,
+                this.size, this.messageBodyBeginMark, this.backingFilename);
+        replay.skip(skip);
+        return replay;
+    }
+
+    /**
+     * Return a replay stream, cued up to beginning of content
+     *
+     * @throws IOException
+     * @return A ReplayInputStream positioned at the message-body mark.
+     */
+    public ReplayInputStream getMessageBodyReplayInputStream() throws IOException {
+        return getReplayInputStream(this.messageBodyBeginMark);
+    }
+
+    public long getSize() {
+        return this.size;
+    }
+
+    /**
+     * Remember the current position as the start of the "message
+     * body". Useful when recording HTTP traffic as a way to start
+     * replays after the headers.
+     */
+    public void markMessageBodyBegin() {
+        this.messageBodyBeginMark = this.position;
+        startDigest();
+    }
+
+    /**
+     * Return stored message-body-begin-mark (which is also end-of-headers)
+     */
+    public long getMessageBodyBegin() {
+        return this.messageBodyBeginMark;
+    }
+
+    /**
+     * Starts digesting recorded data, if a MessageDigest has been
+     * set.
+     */
+    public void startDigest() {
+        if (this.digest != null) {
+            this.digest.reset();
+            this.shouldDigest = true;
+        }
+    }
+
+    /**
+     * Convenience method for setting SHA1 digest.
+     * @see #setDigest(String)
+     */
+    public void setSha1Digest() {
+        setDigest(SHA1);
+    }
+
+
+    /**
+     * Sets a digest function which may be applied to recorded data.
+     * The difference between calling this method and {@link #setDigest(MessageDigest)}
+     * is that this method tries to reuse MessageDigest instance if already allocated
+     * and of appropriate algorithm.
+     * @param algorithm Message digest algorithm to use.
+     * @see #setDigest(MessageDigest)
+     */
+    public void setDigest(String algorithm) {
+        try {
+            // Reuse extant digest if it already uses the requested algorithm.
+            if (this.digest == null ||
+                    !this.digest.getAlgorithm().equals(algorithm)) {
+                setDigest(MessageDigest.getInstance(algorithm));
+            }
+        } catch (NoSuchAlgorithmException e) {
+            logger.log(Level.SEVERE, "no such digest algorithm: " + algorithm, e);
+        }
+    }
+
+    /**
+     * Sets a digest function which may be applied to recorded data.
+     *
+     * As usually only a subset of the recorded data should
+     * be fed to the digest, you must also call startDigest()
+     * to begin digesting.
+     *
+     * @param md Message digest function to use.
+     */
+    public void setDigest(MessageDigest md) {
+        this.digest = md;
+    }
+
+    /**
+     * Return the digest value for any recorded, digested data. Call
+     * only after all data has been recorded; otherwise, the running
+     * digest state is ruined.
+     *
+     * @return the digest final value
+     */
+    public byte[] getDigestValue() {
+        if(this.digest == null) {
+            return null;
+        }
+        return this.digest.digest();
+    }
+
+    public long getResponseContentLength() {
+        return this.size - this.messageBodyBeginMark;
+    }
+
+    /**
+     * @return True if this ROS is open.
+     */
+    public boolean isOpen() {
+        return this.out != null;
+    }
+
+    public int getBufferLength() {
+        return this.buffer.length;
+    }
+
+    /**
+     * When used alongside a mark-supporting RecordingInputStream, remember
+     * a position reachable by a future reset().
+     */
+    public void mark() {
+        // remember this position for subsequent reset()
+        this.markPosition = position;
+    }
+
+    /**
+     * When used alongside a mark-supporting RecordingInputStream, reset
+     * the position to that saved by previous mark(). Until the position
+     * again reaches "new" material, none of the bytes pushed to this
+     * stream will be digested or recorded.
+     */
+    public void reset() {
+        // take note of furthest-position-reached to avoid double-recording
+        maxPosition = Math.max(maxPosition, position);
+        // reset to previous position
+        position = markPosition;
+    }
+
+    /**
+     * Set limits on length, time, and rate to enforce.
+     *
+     * @param length max bytes to record; zero or negative means unlimited
+     * @param milliseconds max recording duration; zero or negative means unlimited
+     * @param rateKBps max rate in KB/s; zero or negative means unlimited
+     */
+    public void setLimits(long length, long milliseconds, long rateKBps) {
+        maxLength = (length>0) ? length : Long.MAX_VALUE;
+        timeoutMs = (milliseconds>0) ? milliseconds : Long.MAX_VALUE;
+        maxRateBytesPerMs = (rateKBps>0) ? rateKBps*1024/1000 : Long.MAX_VALUE;
+    }
+
+    /**
+     * Reset limits to effectively-unlimited defaults
+     */
+    public void resetLimits() {
+        maxLength = Long.MAX_VALUE;
+        timeoutMs = Long.MAX_VALUE;
+        maxRateBytesPerMs = Long.MAX_VALUE;
+    }
+
+    /**
+     * Return number of bytes that could be recorded without hitting
+     * length limit
+     *
+     * @return long byte count
+     */
+    public long getRemainingLength() {
+        return maxLength - position;
+    }
+
+    public void clearForReuse() throws IOException {
+        this.out = null;
+        this.position = 0;
+        this.markPosition = 0;
+        this.maxPosition = 0;
+        this.size = 0;
+        this.messageBodyBeginMark = -1;
+        // ensure recording turned on
+        this.recording = true;
+        // Always begins false; must use startDigest() to begin
+        this.shouldDigest = false;
+        if (this.diskStream != null) {
+            closeDiskStream();
+        }
+    }
+}
+
diff --git a/src/main/java/org/archive/io/RecoverableIOException.java b/src/main/java/org/archive/io/RecoverableIOException.java
new file mode 100644
index 00000000..5ce2251a
--- /dev/null
+++ b/src/main/java/org/archive/io/RecoverableIOException.java
@@ -0,0 +1,83 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.IOException;
+import java.io.PrintStream;
+import java.io.PrintWriter;
+
+/**
+ * A decorator on IOException for IOEs that are likely not fatal or at least
+ * merit retry.
+ * @author stack
+ * @version $Date$, $Revision$
+ */
+public class RecoverableIOException extends IOException {
+    private static final long serialVersionUID = 6194776587381865451L;
+    /** Wrapped exception; every overridden accessor forwards to it. */
+    private final IOException decoratedIOException;
+
+    public RecoverableIOException(final String message) {
+        this(new IOException(message));
+    }
+
+    public RecoverableIOException(final IOException ioe) {
+        decoratedIOException = ioe;
+    }
+
+    public Throwable getCause() {
+        return decoratedIOException.getCause();
+    }
+
+    public String getLocalizedMessage() {
+        return decoratedIOException.getLocalizedMessage();
+    }
+
+    public String getMessage() {
+        return decoratedIOException.getMessage();
+    }
+
+    public StackTraceElement[] getStackTrace() {
+        return decoratedIOException.getStackTrace();
+    }
+
+    public synchronized Throwable initCause(Throwable cause) {
+        return decoratedIOException.initCause(cause);
+    }
+
+    public void printStackTrace() {
+        decoratedIOException.printStackTrace();
+    }
+
+    public void printStackTrace(PrintStream s) {
+        decoratedIOException.printStackTrace(s);
+    }
+
+    public void printStackTrace(PrintWriter s) {
+        decoratedIOException.printStackTrace(s);
+    }
+
+    public void setStackTrace(StackTraceElement[] stackTrace) {
+        decoratedIOException.setStackTrace(stackTrace);
+    }
+
+    public String toString() {
+        return decoratedIOException.toString();
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/RecyclingFastBufferedOutputStream.java b/src/main/java/org/archive/io/RecyclingFastBufferedOutputStream.java
new file mode 100644
index 00000000..a3b76e46
--- /dev/null
+++ b/src/main/java/org/archive/io/RecyclingFastBufferedOutputStream.java
@@ -0,0 +1,37 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
+
+import java.io.OutputStream;
+
+/**
+ * FastBufferedOutputStream that accepts a passed-in buffer (avoiding
+ * reallocation).
+ */
+public class RecyclingFastBufferedOutputStream extends FastBufferedOutputStream {
+    public RecyclingFastBufferedOutputStream(final OutputStream os, final byte[] buffer) {
+        super(os);
+        this.avail = buffer.length; // inherited field, sized to the recycled array
+        this.buffer = buffer;       // inherited field now refers to the caller's array
+    }
+}
+
+
diff --git a/src/main/java/org/archive/io/ReplayCharSequence.java b/src/main/java/org/archive/io/ReplayCharSequence.java
new file mode 100644
index 00000000..aa9b9587
--- /dev/null
+++ b/src/main/java/org/archive/io/ReplayCharSequence.java
@@ -0,0 +1,77 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+
+import com.google.common.base.Charsets;
+
+
+/**
+ * CharSequence interface with addition of a {@link #close()} method.
+ *
+ * Users of implementations of this interface must call {@link #close()} so
+ * implementations get a chance at cleaning up after themselves.
+ *
+ * @author stack
+ * @version $Revision$, $Date$
+ */
+public interface ReplayCharSequence extends CharSequence, Closeable {
+
+    /** Charset applied during replay whenever the declared charset is
+     * missing, illegal, or not available. */
+    Charset FALLBACK_CHARSET = Charsets.ISO_8859_1; // TODO: should this be UTF-8?
+
+    /**
+     * Gives the implementation its chance to clean up resources; callers
+     * invoke this once they are done with the sequence.
+     *
+     * @throws IOException Problem cleaning up file system resources.
+     */
+    void close() throws IOException;
+
+    /**
+     * Number of decoder errors that were silently eaten while this
+     * ReplayCharSequence was in use. Can be lower than the number of
+     * individual decoding anomalies in the underlying content when the
+     * decoding method cannot count each error separately.
+     */
+    long getDecodeExceptionCount();
+
+    /**
+     * First coding-exception that was encountered, if the count > 0.
+     * @return CharacterCodingException
+     */
+    CharacterCodingException getCodingException();
+
+    /**
+     * @return false if {@link #close()} has been called
+     */
+    boolean isOpen();
+
+    /**
+     * Effective Charset that was used to build this CharSequence out of
+     * the (raw byte) source material.
+     */
+    Charset getCharset();
+}
diff --git a/src/main/java/org/archive/io/ReplayInputStream.java b/src/main/java/org/archive/io/ReplayInputStream.java
new file mode 100644
index 00000000..fccf5fd3
--- /dev/null
+++ b/src/main/java/org/archive/io/ReplayInputStream.java
@@ -0,0 +1,325 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.archive.util.ArchiveUtils;
+import org.archive.util.FileUtils;
+
+
+/**
+ * Replays the bytes recorded from a RecordingInputStream or
+ * RecordingOutputStream.
+ *
+ * This InputStream supports mark and reset.
+ *
+ * @author gojomo
+ */
+public class ReplayInputStream extends SeekInputStream
+{
+ private static final int DEFAULT_BUFFER_SIZE = 256*1024; // 256KiB
+ private BufferedSeekInputStream diskStream;
+ private byte[] buffer;
+ private long position;
+
+ /**
+ * Total size of stream content.
+ *
+ * Size of data to replay.
+ */
+ private long size = -1;
+
+ /**
+ * Where the response body starts, if marked
+ */
+ protected long responseBodyStart = -1;
+
+
+    /**
+     * Creates a replay stream whose response body offset is already known.
+     *
+     * @param buffer Buffer to read from.
+     * @param size Size of data to replay.
+     * @param responseBodyStart Start of the response body.
+     * @param backingFilename Backing file that sits behind the buffer.
+     *  When size exceeds buffer.length, the data beyond buffer.length
+     *  is read from this file.
+     *
+     * @throws IOException If we fail to open an input stream on
+     *  backing file.
+     */
+    public ReplayInputStream(
+            byte[] buffer, long size, long responseBodyStart, String backingFilename)
+            throws IOException {
+        // The three-argument constructor sets up buffer, size and backing file.
+        this(buffer, size, backingFilename);
+        this.responseBodyStart = responseBodyStart;
+    }
+
+    /**
+     * Creates a replay stream over an in-memory buffer plus optional file.
+     *
+     * @param buffer Buffer to read from.
+     * @param size Size of data to replay.
+     * @param backingFilename Backing file that sits behind the buffer.
+     *  Only opened when size exceeds buffer.length; it then supplies
+     *  the data that is beyond buffer.length.
+     * @throws IOException If we fail to open an input stream on
+     *  backing file.
+     */
+    public ReplayInputStream(byte[] buffer, long size, String backingFilename)
+            throws IOException {
+        this.size = size;
+        this.buffer = buffer;
+        final boolean spillsToDisk = size > buffer.length;
+        if (spillsToDisk) {
+            setupDiskStream(new File(backingFilename));
+        }
+    }
+
+    protected void setupDiskStream(File backingFile) throws IOException {
+        // Seekable stream over the backing file (4096 is presumably the buffer capacity).
+        diskStream = new BufferedSeekInputStream(new RandomAccessInputStream(backingFile), 4096);
+    }
+
+ protected File backingFile;
+
+ /**
+ * Create a ReplayInputStream from the given source stream. Fills an
+ * in-memory buffer of DEFAULT_BUFFER_SIZE bytes first; if the source then
+ * still reports available bytes, the rest goes to a temporary file. Gives
+ * callers a repositionable version of the original stream's contents.
+ *
+ * If created via this constructor, use the destroy() method to ensure
+ * prompt deletion of any associated tmp file when done.
+ *
+ * @param fillStream source stream whose contents are captured for replay
+ * @throws IOException on failure reading the source or setting up the tmp file
+ */
+ public ReplayInputStream(InputStream fillStream) throws IOException {
+ this.buffer = new byte[DEFAULT_BUFFER_SIZE];
+ long count = ArchiveUtils.readFully(fillStream, buffer);
+ if(fillStream.available()>0) { // NOTE(review): available() may report 0 while data is still pending -- confirm for all sources
+ this.backingFile = File.createTempFile("tid"+Thread.currentThread().getId(), "ris");
+ count += FileUtils.readFullyToFile(fillStream, backingFile);
+ setupDiskStream(backingFile);
+ }
+ this.size = count;
+ }
+
+    /** Quietly closes this stream and asks FileUtils to delete the backing temp file, if any. */
+    public void destroy() {
+        IOUtils.closeQuietly(this);
+        if (backingFile == null) {
+            // Nothing to delete: no backing temp file is recorded.
+            return;
+        }
+        FileUtils.deleteSoonerOrLater(backingFile);
+    }
+
+ public long setToResponseBodyStart() throws IOException {
+ position(responseBodyStart); // seek to the recorded start of the response body
+ return this.position; // the offset we just moved to
+ }
+
+
+    /*
+     * Reads one byte: from the memory buffer first, then from the disk stream.
+     * @see java.io.InputStream#read()
+     */
+    public int read() throws IOException {
+        if (position == size) {
+            return -1; // EOF
+        }
+        final boolean inMemory = position < buffer.length;
+        if (inMemory) {
+            // Mask so the signed byte comes back as 0..255.
+            return buffer[(int) position++] & 0xFF;
+        }
+        final int fromDisk = diskStream.read();
+        if (fromDisk >= 0) {
+            position++;
+        }
+        return fromDisk;
+    }
+
+    /*
+     * Bulk read. A single call is served either from the memory buffer or
+     * from the disk stream, never both, so it may return fewer than len bytes.
+     *
+     * @see java.io.InputStream#read(byte[], int, int)
+     */
+    public int read(byte[] b, int off, int len) throws IOException {
+        if (position == size) {
+            return -1; // EOF
+        }
+        if (position >= buffer.length) {
+            // Past the memory buffer: into disk zone.
+            final int fromDisk = diskStream.read(b, off, len);
+            position += (fromDisk > 0) ? fromDisk : 0;
+            return fromDisk;
+        }
+        // Bounded by the request, the buffer's tail and the overall size.
+        final long leftInBuffer = buffer.length - position;
+        final int toCopy = (int) Math.min(size - position, Math.min(len, leftInBuffer));
+        System.arraycopy(buffer, (int) position, b, off, toCopy);
+        if (toCopy > 0) {
+            position += toCopy;
+        }
+        return toCopy;
+    }
+
+    public void readFullyTo(OutputStream os) throws IOException {
+        // Copies everything from the current position through EOF, in
+        // chunks of up to 4096 bytes.
+        final byte[] chunk = new byte[4096];
+        for (int n = read(chunk); n != -1; n = read(chunk)) {
+            os.write(chunk, 0, n);
+        }
+    }
+
+    /*
+     * Like 'readFullyTo', but only reads the header-part, always from offset 0.
+     */
+    public void readHeaderTo(OutputStream os) throws IOException {
+        if (position != 0) { position(0); } // a real seek, so diskStream is rewound too
+        byte[] buf = new byte[(int) Math.max(0L, responseBodyStart)]; // -1 (unset) => empty header
+        int filled = 0, c = 0; // one read() never crosses the buffer/disk boundary, so loop
+        while (filled < buf.length && (c = read(buf, filled, buf.length - filled)) != -1) {
+            filled += c;
+        }
+        os.write(buf, 0, filled);
+    }
+
+    /*
+     * Like 'readFullyTo', but seeks to the response body start first, so
+     * only the content-part is copied.
+     */
+    public void readContentTo(OutputStream os) throws IOException {
+        setToResponseBodyStart();
+        final byte[] chunk = new byte[4096];
+        int n;
+        while ((n = read(chunk)) != -1) {
+            os.write(chunk, 0, n);
+        }
+    }
+
+    /**
+     * Convenience method to copy content out to target stream.
+     * @param os stream to write content to
+     * @param maxSize maximum count of bytes to copy
+     * @throws IOException
+     */
+    public void readContentTo(OutputStream os, long maxSize) throws IOException {
+        setToResponseBodyStart();
+        byte[] buf = new byte[4096];
+        // Each request is capped by what is left of maxSize, so the limit is exact.
+        long remaining = maxSize;
+        int c;
+        while (remaining > 0 && (c = read(buf, 0, (int) Math.min(buf.length, remaining))) != -1) {
+            os.write(buf, 0, c);
+            remaining -= c; // count the bytes actually written
+        }
+    }
+
+ /* (non-Javadoc)
+ * @see java.io.InputStream#close()
+ */
+ public void close() throws IOException {
+ super.close();
+ if(diskStream != null) { // only set once setupDiskStream has been called
+ diskStream.close();
+ }
+ }
+
+ /**
+ * Total size of stream content, i.e. the number of bytes this stream replays.
+ * @return Returns the size.
+ */
+ public long getSize()
+ {
+ return size;
+ }
+
+ /**
+ * Total size of header, which is the offset where the response body starts.
+ * @return the size of the header; -1 is the initial value of the body start.
+ */
+ public long getHeaderSize()
+ {
+ return responseBodyStart;
+ }
+
+ /**
+ * Total size of content: the stream size minus the response body start.
+ * @return the size of the content. NOTE(review): with the body start still at its initial -1 this is size + 1.
+ */
+ public long getContentSize()
+ {
+ return size - responseBodyStart;
+ }
+
+ /**
+ * @return Amount THEORETICALLY remaining: size minus current position. (TODO: It's
+ * not theoretical, seemingly. The class implementation depends on it being exact).
+ */
+ public long remaining() {
+ return size - position;
+ }
+
+
+    /**
+     * Reposition the stream.
+     *
+     * @param p the new position for this stream
+     * @throws IOException if p is negative or exceeds size, or an IO error occurs
+     */
+    public void position(long p) throws IOException {
+        if (p < 0) {
+            throw new IOException("Negative seek offset.");
+        }
+        if (p > size) {
+            throw new IOException("Desired position exceeds size.");
+        }
+        if (p < buffer.length) {
+            // Only seek file if necessary
+            if (position > buffer.length) {
+                diskStream.position(0);
+            }
+        } else if (diskStream != null) { // absent when nothing spilled to disk (p == size == buffer.length)
+            diskStream.position(p - buffer.length);
+        }
+        this.position = p;
+    }
+
+
+ public long position() throws IOException {
+ return position; // offset of the next byte read() would return
+ }
+
+ protected byte[] getBuffer() {
+ return buffer; // the in-memory array itself, not a copy
+ }
+}
diff --git a/src/main/java/org/archive/io/RepositionableInputStream.java b/src/main/java/org/archive/io/RepositionableInputStream.java
new file mode 100644
index 00000000..6f885130
--- /dev/null
+++ b/src/main/java/org/archive/io/RepositionableInputStream.java
@@ -0,0 +1,133 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import it.unimi.dsi.fastutil.io.RepositionableStream;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Wrapper around an {@link InputStream} to make a primitive Repositionable
+ * stream. Uses a {@link BufferedInputStream}. Calls mark on every read so
+ * we'll remember at least the last thing read (You can only backup on the
+ * last thing read -- not last 2 or 3 things read). Used by
+ * {@link GzippedInputStream} when reading streams over a network. Wraps a
+ * HTTP, etc., stream so we can back it up if needs be after the
+ * GZIP inflater has done a fill of its full buffer though it only needed
+ * the first few bytes to finish decompressing the current GZIP member.
+ *
+ * <p>TODO: More robust implementation. Tried to use the it.unimi.dsi.io
+ * FastBufferedInputStream but relies on FileChannel ByteBuffers and if not
+ * present -- as would be the case reading from a network stream, the main
+ * application for this instance -- then it expects the underlying stream
+ * implements RepositionableStream interface so chicken or egg problem.
+ * @author stack
+ */
+public class RepositionableInputStream extends BufferedInputStream implements
+        RepositionableStream {
+    private long position = 0; // logical offset of the next byte this stream will return
+    private long markPosition = -1; // 'position' at the caller's last mark(); -1 = no mark
+
+    public RepositionableInputStream(InputStream in) {
+        super(in);
+    }
+
+    public RepositionableInputStream(InputStream in, int size) {
+        super(in, size);
+    }
+
+    public int read(byte[] b) throws IOException {
+        // Delegate to the three-argument overload, which already advances
+        // 'position'. super.read(b) ends up in that same overload through
+        // virtual dispatch, so also adding the count here would advance
+        // 'position' twice for every byte read.
+        return read(b, 0, b.length);
+    }
+
+    public synchronized int read(byte[] b, int offset, int ct)
+            throws IOException {
+        // No caller mark on this RepositionableStream: mark the underlying
+        // stream so what we are about to read is remembered (there are two
+        // levels of mark). A caller's mark must stay untouched so it still
+        // lines up with this stream when reset is called.
+        // NOTE(review): readlimit mixes a count with an array offset -- confirm.
+        if (!isMarked()) {
+            super.mark((ct > offset)? ct - offset: ct);
+        }
+        final int n = super.read(b, offset, ct);
+        if (n != -1) {
+            position += n;
+        }
+        return n;
+    }
+
+    public int read() throws IOException {
+        // Same two-level mark handling as the bulk read: remember the single
+        // byte we are about to read by marking the underlying stream, unless
+        // a caller's mark is in place on this RepositionableStream, in which
+        // case the underlying mark is left alone so the two stay aligned for
+        // reset.
+        if (!isMarked()) {
+            super.mark(1);
+        }
+        final int c = super.read();
+        if (c != -1) {
+            position++;
+        }
+        return c;
+    }
+
+    public void position(final long offset) {
+        if (this.position == offset) {
+            return;
+        }
+        int diff = (int)(offset - this.position);
+        long lowerBound = this.position - this.pos; // stream offset held at buf[0]
+        long upperBound = lowerBound + this.count; // stream offset just past the buffered bytes
+        if (offset < lowerBound || offset >= upperBound) {
+            throw new IllegalAccessError("Offset goes outside " +
+                "current this.buf (TODO: Do buffer fills if positive)");
+        }
+        this.position = offset;
+        this.pos += diff;
+        // Clear any mark.
+        this.markPosition = -1;
+    }
+
+    public void mark(int readlimit) {
+        this.markPosition = this.position; // remember where the caller's mark sits
+        super.mark(readlimit);
+    }
+
+    public void reset() throws IOException {
+        super.reset();
+        this.position = this.markPosition;
+        this.markPosition = -1; // the caller's mark is consumed by reset
+    }
+
+    protected boolean isMarked() {
+        return this.markPosition != -1;
+    }
+
+    public long position() {
+        return this.position;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/SafeSeekInputStream.java b/src/main/java/org/archive/io/SafeSeekInputStream.java
new file mode 100644
index 00000000..0d8f83b1
--- /dev/null
+++ b/src/main/java/org/archive/io/SafeSeekInputStream.java
@@ -0,0 +1,124 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+import java.io.IOException;
+
+
+/**
+ * Enables multiple concurrent streams based on the same underlying stream.
+ *
+ * @author pjack
+ */
+public class SafeSeekInputStream extends SeekInputStream {
+
+
+    /**
+     * The stream that all reads are delegated to.
+     */
+    private SeekInputStream input;
+
+
+    /**
+     * Where this view expects the underlying stream to be positioned.
+     */
+    private long expected;
+
+
+    /**
+     * Constructor. Tracking starts from whatever position the given
+     * stream currently reports.
+     *
+     * @param input the underlying input stream
+     * @throws IOException if an IO error occurs
+     */
+    public SafeSeekInputStream(SeekInputStream input) throws IOException {
+        this.expected = input.position();
+        this.input = input;
+    }
+
+
+    /**
+     * Moves the underlying stream back to the expected position if it is elsewhere.
+     * @throws IOException if an IO error occurs
+     */
+    private void ensure() throws IOException {
+        final boolean drifted = input.position() != expected;
+        if (drifted) {
+            input.position(expected);
+        }
+    }
+
+
+    @Override
+    public int read() throws IOException {
+        // Realign the underlying stream before touching it.
+        ensure();
+        final int b = input.read();
+        // A negative result leaves the expected position untouched.
+        expected += (b >= 0) ? 1 : 0;
+        return b;
+    }
+
+
+    @Override
+    public int read(byte[] buf, int ofs, int len) throws IOException {
+        ensure();
+        final int n = input.read(buf, ofs, len);
+        // Only a positive count advances the expected position; zero and
+        // negative results leave it where it was.
+        expected += Math.max(n, 0);
+        return n;
+    }
+
+
+    @Override
+    public int read(byte[] buf) throws IOException {
+        ensure();
+        final int n = input.read(buf);
+        // Only a positive count advances the expected position; zero and
+        // negative results leave it where it was.
+        expected += Math.max(n, 0);
+        return n;
+    }
+
+
+    @Override
+    public long skip(long c) throws IOException {
+        ensure();
+        final long skipped = input.skip(c);
+        // Only a positive count advances the expected position; zero and
+        // negative results leave it where it was.
+        expected += Math.max(skipped, 0L);
+        return skipped;
+    }
+
+
+    public void position(long p) throws IOException {
+        input.position(p); // seek first; 'expected' changes only if this succeeds
+        expected = p;
+    }
+
+
+    public long position() throws IOException {
+        return expected; // this view's own offset, not the underlying stream's
+    }
+
+}
diff --git a/src/main/java/org/archive/io/SeekInputStream.java b/src/main/java/org/archive/io/SeekInputStream.java
new file mode 100644
index 00000000..177724ec
--- /dev/null
+++ b/src/main/java/org/archive/io/SeekInputStream.java
@@ -0,0 +1,81 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+import it.unimi.dsi.fastutil.io.RepositionableStream;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+
+/**
+ * Base class for repositionable input streams.
+ *
+ * @author pjack
+ */
+public abstract class SeekInputStream extends InputStream
+        implements RepositionableStream {
+
+
+    /**
+     * Position captured by the last call to mark. Any negative value
+     * means that no mark has been set.
+     */
+    private long mark = -1;
+
+
+    /**
+     * Remembers the current position as the mark. The limit parameter is
+     * ignored; nothing in this class clears the mark afterwards.
+     * @param limit ignored
+     */
+    public void mark(int limit) {
+        long here;
+        try {
+            here = position();
+        } catch (IOException e) {
+            here = -1; // position unknown: behave as if no mark were set
+        }
+        this.mark = here;
+    }
+
+
+    /**
+     * Seeks back to the marked position.
+     * @throws IOException if there is no mark, or if an IO error occurs
+     */
+    public void reset() throws IOException {
+        if (mark >= 0) {
+            position(mark);
+            return;
+        }
+        throw new IOException("No mark.");
+    }
+
+
+    /**
+     * Always true: SeekInputStreams support mark/reset by default.
+     *
+     * @return true
+     */
+    public boolean markSupported() {
+        return true;
+    }
+}
diff --git a/src/main/java/org/archive/io/SeekReader.java b/src/main/java/org/archive/io/SeekReader.java
new file mode 100644
index 00000000..4abf7847
--- /dev/null
+++ b/src/main/java/org/archive/io/SeekReader.java
@@ -0,0 +1,84 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+import java.io.IOException;
+import java.io.Reader;
+
+import it.unimi.dsi.fastutil.io.RepositionableStream;
+
+
+/**
+ * Base class for repositionable readers.
+ *
+ * @author pjack
+ */
+public abstract class SeekReader extends Reader
+        implements RepositionableStream {
+
+
+    /**
+     * Position captured by the last call to mark. Any negative value
+     * means that no mark has been set.
+     */
+    private long mark = -1;
+
+
+    /**
+     * Remembers the current position as the mark. The limit parameter is
+     * ignored; nothing in this class clears the mark afterwards.
+     * @param limit ignored
+     */
+    @Override
+    public void mark(int limit) {
+        long here;
+        try {
+            here = position();
+        } catch (IOException e) {
+            here = -1; // position unknown: behave as if no mark were set
+        }
+        this.mark = here;
+    }
+
+
+    /**
+     * Seeks back to the marked position.
+     * @throws IOException if there is no mark, or if an IO error occurs
+     */
+    @Override
+    public void reset() throws IOException {
+        if (mark >= 0) {
+            position(mark);
+            return;
+        }
+        throw new IOException("No mark.");
+    }
+
+
+    /**
+     * Always true: SeekReaders support mark/reset by default.
+     *
+     * @return true
+     */
+    @Override
+    public boolean markSupported() {
+        return true;
+    }
+}
diff --git a/src/main/java/org/archive/io/SeekReaderCharSequence.java b/src/main/java/org/archive/io/SeekReaderCharSequence.java
new file mode 100644
index 00000000..a9b4880f
--- /dev/null
+++ b/src/main/java/org/archive/io/SeekReaderCharSequence.java
@@ -0,0 +1,56 @@
+package org.archive.io;
+
+import java.io.IOException;
+
+public class SeekReaderCharSequence implements CharSequence {
+    final private SeekReader reader; // source of characters; repositioned on every access
+    final private int size; // value reported by length()
+
+    /** Wraps the given reader; size becomes the reported length. */
+    public SeekReaderCharSequence(SeekReader reader, int size) {
+        this.reader = reader;
+        this.size = size;
+    }
+
+    /** @return the size passed to the constructor */
+    public int length() {
+        return size;
+    }
+
+    /** Seeks the reader to index and returns the single character read there. */
+    public char charAt(int index) {
+        if ((index < 0) || (index >= length())) {
+            throw new IndexOutOfBoundsException(Integer.toString(index));
+        }
+        try {
+            reader.position(index);
+            int r = reader.read();
+            if (r < 0) {
+                throw new IllegalStateException("EOF");
+            }
+            // Return the character just read; reading again would yield index + 1.
+            return (char) r;
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /** @return a CharSubSequence over this sequence for the given range */
+    public CharSequence subSequence(int start, int end) {
+        return new CharSubSequence(this, start, end);
+    }
+
+    /** Rewinds the reader to 0 and collects every character until EOF. */
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        try {
+            reader.position(0);
+            for (int ch = reader.read(); ch >= 0; ch = reader.read()) {
+                sb.append((char)ch);
+            }
+            return sb.toString();
+        } catch (IOException e) {
+            throw new IllegalStateException(e);
+        }
+    }
+}
diff --git a/src/main/java/org/archive/io/SinkHandlerLogThread.java b/src/main/java/org/archive/io/SinkHandlerLogThread.java
new file mode 100644
index 00000000..0070785e
--- /dev/null
+++ b/src/main/java/org/archive/io/SinkHandlerLogThread.java
@@ -0,0 +1,34 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+
+/**
+ * Implemented by threads that provide extra information: a name, the name
+ * of the current processor, and a serial number.
+ * TODO: rename class, rename getCurrentProcessorName()
+ */
+public interface SinkHandlerLogThread {
+
+ String getName();
+ String getCurrentProcessorName();
+ int getSerialNumber();
+
+}
diff --git a/src/main/java/org/archive/io/UTF8Bytes.java b/src/main/java/org/archive/io/UTF8Bytes.java
new file mode 100644
index 00000000..c280b08d
--- /dev/null
+++ b/src/main/java/org/archive/io/UTF8Bytes.java
@@ -0,0 +1,37 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.UnsupportedEncodingException;
+
+/**
+ * Marker Interface for instances that can be serialized as UTF8 bytes.
+ * TODO: Do we need a UTF8Stream Marker Interface?
+ * @author stack
+ * @version $Date$ $Version$
+ */
+public interface UTF8Bytes {
+    /** Name of the UTF-8 charset. */
+    String UTF8 = "UTF-8";
+    /**
+     * @return Instance as UTF-8 bytes.
+     * @throws UnsupportedEncodingException
+     */
+    byte[] getUTF8Bytes() throws UnsupportedEncodingException;
+}
diff --git a/src/main/java/org/archive/io/WriterPool.java b/src/main/java/org/archive/io/WriterPool.java
new file mode 100644
index 00000000..2dc385a1
--- /dev/null
+++ b/src/main/java/org/archive/io/WriterPool.java
@@ -0,0 +1,343 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.LinkedList;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+
+/**
+ * Pool of Writers.
+ *
+ * Abstract. Subclasses implement {@link #makeWriter()} to create the
+ * concrete {@link WriterPoolMember} implementations held by the pool.
+ *
+ * @author stack
+ */
+public abstract class WriterPool {
+ private final Logger logger = Logger.getLogger(this.getClass().getName());
+
+ /**
+ * Used to generate unique filename sequences.
+ */
+ final protected AtomicInteger serialNo;
+
+ /**
+ * Default maximum active number of files in the pool.
+ */
+ public static final int DEFAULT_MAX_ACTIVE = 1;
+
+ /** Assumed largest possible value of maxActive; pool will have this
+ * maximum capacity, so dynamic changes beyond this number won't work. */
+ protected static final int LARGEST_MAX_ACTIVE = 255;
+
+ /**
+ * Maximum time to wait on a free file before considering
+ * making a new one (if not already at max)
+ */
+ public static final int DEFAULT_MAX_WAIT_FOR_IDLE = 500;
+
+ /**
+ * File settings.
+ * Keep in data structure rather than as individual values.
+ */
+ protected final WriterPoolSettings settings;
+
+ /** maximum number of writers to create at a time*/
+ protected int maxActive;
+ /** maximum ms to wait before considering creation of a writer */
+ protected int maxWait;
+ /** current count of active writers; only read/mutated in synchronized blocks */
+ protected int currentActive = 0;
+    /**
+     * Round-robin queue of idle writers available for checkout. The element
+     * type is declared so that results of {@code poll()}/{@code take()} and
+     * for-each loops over drained writers type-check without casts.
+     */
+    protected BlockingQueue<WriterPoolMember> availableWriters;
+
+ /** system time when writer was last wanted (because one was not ready in time) */
+ protected long lastWriterNeededTime;
+ /** system time when writer was last 'rolled over' (imminent creation of new file) */
+ protected long lastWriterRolloverTime;
+
+    /**
+     * Constructor.
+     *
+     * Logs the initial configuration and creates the queue of available
+     * writers with a fixed capacity of {@link #LARGEST_MAX_ACTIVE}. No writer
+     * is created here; writers are made on demand via {@link #makeWriter()}.
+     *
+     * @param serial Used to generate unique filename sequences
+     * @param settings Settings for this pool.
+     * @param poolMaximumActive maximum number of writers to have active at
+     * one time
+     * @param poolMaximumWait maximum milliseconds to wait for an idle writer
+     * before considering creation of a new one
+     */
+    public WriterPool(final AtomicInteger serial,
+            final WriterPoolSettings settings,
+            final int poolMaximumActive, final int poolMaximumWait) {
+        logger.info("Initial configuration:" +
+                " prefix=" + settings.getPrefix() +
+                ", template=" + settings.getTemplate() +
+                ", compress=" + settings.getCompress() +
+                ", maxSize=" + settings.getMaxFileSizeBytes() +
+                ", maxActive=" + poolMaximumActive +
+                ", maxWait=" + poolMaximumWait);
+        this.settings = settings;
+        this.maxActive = poolMaximumActive;
+        this.maxWait = poolMaximumWait;
+        // second argument 'true' selects the fair (FIFO) access policy
+        this.availableWriters =
+            new ArrayBlockingQueue<WriterPoolMember>(LARGEST_MAX_ACTIVE, true);
+        this.serialNo = serial;
+    }
+
+    /**
+     * Check out a {@link WriterPoolMember}.
+     *
+     * Waits up to {@code maxWait} ms for an idle writer; on timeout, tries to
+     * create a new one (allowed only while below {@code maxActive}), and
+     * repeats until a writer is obtained.
+     *
+     * This method should be followed by a call to
+     * {@link #returnFile(WriterPoolMember)} or
+     * {@link #invalidateFile(WriterPoolMember)} else pool starts leaking.
+     *
+     * If the calling thread is interrupted while waiting, the wait continues
+     * (a writer is still returned) but the thread's interrupt status is
+     * restored before this method returns.
+     *
+     * @return Writer checked out of a pool of files or created
+     * @throws IOException Problem getting Writer from pool (Converted
+     * from Exception to IOException so this pool can live as a good citizen
+     * down in depths of ARCSocketFactory).
+     */
+    public WriterPoolMember borrowFile()
+    throws IOException {
+        WriterPoolMember writer = null;
+        boolean interrupted = false;
+        try {
+            while (writer == null) {
+                try {
+                    writer = availableWriters.poll(maxWait, TimeUnit.MILLISECONDS);
+                } catch (InterruptedException e) {
+                    // Proceed, but remember the interrupt. Re-asserting it
+                    // right here would make every following poll() throw
+                    // immediately and turn this loop into a busy spin.
+                    interrupted = true;
+                }
+                if (writer == null) {
+                    writer = makeNewWriterIfAppropriate();
+                }
+            }
+        } finally {
+            if (interrupted) {
+                Thread.currentThread().interrupt();
+            }
+        }
+        return writer;
+    }
+
+    /**
+     * Create a new writer instance, if still below maxActive count.
+     * Remember times to help make later decision when writer should
+     * be discarded.
+     *
+     * The active count is raised before {@link #makeWriter()} runs; if
+     * creation fails with an unchecked exception the count is lowered again
+     * so the failed attempt does not permanently consume a pool slot.
+     *
+     * @return WriterPoolMember or null if already at max
+     */
+    protected synchronized WriterPoolMember makeNewWriterIfAppropriate() {
+        long now = System.currentTimeMillis();
+        // record demand even when no writer can be created
+        lastWriterNeededTime = now;
+        if (currentActive >= maxActive) {
+            return null;
+        }
+        currentActive++;
+        lastWriterRolloverTime = now;
+        try {
+            return makeWriter();
+        } catch (RuntimeException e) {
+            currentActive--;
+            throw e;
+        }
+    }
+
+ /**
+ * Factory hook supplied by concrete pools. Invoked from
+ * {@link #makeNewWriterIfAppropriate()} while the pool monitor is held.
+ *
+ * @return new WriterPoolMember of appropriate type
+ */
+ protected abstract WriterPoolMember makeWriter();
+
+    /**
+     * Retire a writer for good: release its slot in the active count and
+     * close it. The writer is not put back into the pool.
+     *
+     * @param writer writer to close and drop
+     * @throws IOException if closing the writer fails
+     */
+    public synchronized void destroyWriter(WriterPoolMember writer) throws IOException {
+        // slot is released first, so it is freed even when close() throws
+        this.currentActive -= 1;
+        writer.close();
+    }
+    /**
+     * Hand a borrowed writer back to the pool.
+     *
+     * The writer is normally re-queued for reuse. It is instead destroyed
+     * when its current file is oversize and no borrower has had to wait for
+     * a writer since the last rollover, i.e. the instance looks superfluous.
+     * An oversize writer that is still in demand is re-queued, and the
+     * rollover time is updated because its next use starts a new file.
+     *
+     * @param writer Writer to return to the pool.
+     * @throws IOException Problem returning File to pool.
+     */
+    public void returnFile(WriterPoolMember writer)
+    throws IOException {
+        synchronized (this) {
+            if (writer.isOversize()) {
+                boolean demandSinceRollover =
+                    lastWriterNeededTime > lastWriterRolloverTime;
+                if (!demandSinceRollover) {
+                    // nobody timed out waiting since last rollover: retire
+                    destroyWriter(writer);
+                    return;
+                }
+                // keep the instance; its next write rolls to a fresh file
+                lastWriterRolloverTime = System.currentTimeMillis();
+            }
+        }
+        boolean requeued = availableWriters.offer(writer);
+        if (!requeued) {
+            logger.log(Level.WARNING, "writer unreturnable to available pool; closing early");
+            destroyWriter(writer);
+        }
+    }
+
+    /**
+     * Close and discard a writer that experienced a potentially-corrupting
+     * error, then rename its file with
+     * {@link WriterPoolMember#INVALID_SUFFIX} so it gets attention.
+     *
+     * A writer that has no file is simply destroyed. A failed rename is
+     * logged as a warning rather than silently ignored.
+     *
+     * @param f writer with problem
+     * @throws IOException if the writer cannot be closed; the original
+     * exception is kept as the cause
+     */
+    public synchronized void invalidateFile(WriterPoolMember f)
+    throws IOException {
+        try {
+            destroyWriter(f);
+        } catch (Exception e) {
+            // Convert exception, keeping the original for diagnosis.
+            throw new IOException(e.getMessage(), e);
+        }
+        // It'll have been closed. Rename with an '.invalid' suffix so it
+        // gets attention.
+        File file = f.getFile();
+        if (file == null) {
+            // nothing on disk to flag
+            return;
+        }
+        File invalid = new File(file.getAbsoluteFile() +
+                WriterPoolMember.INVALID_SUFFIX);
+        if (!file.renameTo(invalid)) {
+            logger.log(Level.WARNING,
+                "Failed rename of " + file + " to " + invalid);
+        }
+    }
+
+    /**
+     * Count of writers currently checked out: all live writers minus those
+     * sitting idle in the queue.
+     *
+     * @return Number of {@link WriterPoolMember}s checked out of pool.
+     * @throws java.lang.UnsupportedOperationException
+     */
+    public synchronized int getNumActive()
+    throws UnsupportedOperationException {
+        final int idle = getNumIdle();
+        return currentActive - idle;
+    }
+
+    /**
+     * Count of writers waiting in the available queue.
+     *
+     * @return Number of {@link WriterPoolMember} instances still in the pool.
+     * @throws java.lang.UnsupportedOperationException
+     */
+    public int getNumIdle()
+    throws UnsupportedOperationException {
+        final int idleCount = availableWriters.size();
+        return idleCount;
+    }
+
+    /**
+     * Close all {@link WriterPoolMember}s in pool.
+     *
+     * Drains every writer (see {@link #drainAllWriters()}) and destroys each
+     * one. A failure closing one writer is logged and does not stop the
+     * remaining writers from being closed.
+     */
+    public void close() {
+        Collection<WriterPoolMember> writers = drainAllWriters();
+        for (WriterPoolMember writer : writers) {
+            try {
+                destroyWriter(writer);
+            } catch (IOException e) {
+                logger.log(Level.WARNING, "problem closing writer", e);
+            }
+        }
+    }
+
+    /**
+     * Accessor for the settings object this pool was constructed with.
+     *
+     * @return Returns settings.
+     */
+    public WriterPoolSettings getSettings() {
+        return settings;
+    }
+
+    /**
+     * One-line summary of pool occupancy, of the form
+     * {@code Active A of max M, idle I}.
+     *
+     * @return State of the pool string
+     */
+    protected String getPoolState() {
+        return "Active " + getNumActive()
+            + " of max " + maxActive
+            + ", idle " + getNumIdle();
+    }
+
+    /**
+     * Exposes the counter passed to the constructor, from which file
+     * serial numbers are drawn.
+     *
+     * @return the serial number generator
+     */
+    public AtomicInteger getSerialNo() {
+        return this.serialNo;
+    }
+
+    /**
+     * Drains all the active writers from {@link #availableWriters}, blocking to
+     * wait for any writers currently in use to become available.
+     *
+     * <p>
+     * When finished with writers, call availableWriters.addAll(...) to put them
+     * back into the rotation.
+     *
+     * <p>
+     * If the waiting thread is interrupted, the writers collected so far are
+     * returned and the thread's interrupt status is restored.
+     *
+     * <p>
+     * NOTE(review): this method holds the pool monitor while blocked in
+     * {@code take()}, and {@link #returnFile(WriterPoolMember)} enters
+     * {@code synchronized(this)} before offering a writer back. A borrowed
+     * writer therefore looks unable to be returned while a drain is waiting
+     * for it -- verify.
+     *
+     * @return all the active writers
+     */
+    protected synchronized Collection<WriterPoolMember> drainAllWriters() {
+        LinkedList<WriterPoolMember> writers = new LinkedList<WriterPoolMember>();
+        availableWriters.drainTo(writers);
+
+        // anything not idle is checked out; wait for each to come back
+        while (writers.size() < currentActive) {
+            try {
+                WriterPoolMember w = availableWriters.take();
+                writers.add(w);
+            } catch (InterruptedException e) {
+                logger.severe("caught " + e + " while waiting for writers to free up; returning only "
+                        + writers.size() + " of " + currentActive + " active writers");
+                Thread.currentThread().interrupt();
+                break;
+            }
+        }
+
+        return writers;
+    }
+
+    /**
+     * Flush every writer in the pool.
+     *
+     * Drains all writers (waiting for borrowed ones to be returned), flushes
+     * each, and puts them back into the available queue. A failure flushing
+     * one writer is logged and does not prevent the others from being
+     * flushed. The writers are re-queued even if an unexpected exception
+     * escapes the loop.
+     */
+    public void flush() {
+        Collection<WriterPoolMember> writers = drainAllWriters();
+        try {
+            for (WriterPoolMember writer : writers) {
+                try {
+                    writer.flush();
+                } catch (IOException e) {
+                    logger.log(Level.WARNING, "problem flushing writer " + writer, e);
+                }
+            }
+        } finally {
+            availableWriters.addAll(writers);
+        }
+    }
+
+    /**
+     * Report the file and current position of every writer in the pool.
+     *
+     * Drains all writers (waiting for borrowed ones to be returned), records
+     * one JSON object with keys {@code file} and {@code position} per writer,
+     * then puts the writers back into the available queue. The writers are
+     * re-queued even when building the JSON fails.
+     *
+     * @return array with one entry per writer
+     * @throws JSONException if a value cannot be added to the JSON structure
+     */
+    public JSONArray jsonStatus() throws JSONException {
+        Collection<WriterPoolMember> writers = drainAllWriters();
+        try {
+            JSONArray ja = new JSONArray();
+            for (WriterPoolMember w : writers) {
+                JSONObject jo = new JSONObject();
+                jo.put("file", w.getFile());
+                jo.put("position", w.getPosition());
+                ja.put(jo);
+            }
+            return ja;
+        } finally {
+            availableWriters.addAll(writers);
+        }
+    }
+}
diff --git a/src/main/java/org/archive/io/WriterPoolMember.java b/src/main/java/org/archive/io/WriterPoolMember.java
new file mode 100644
index 00000000..6ea6b295
--- /dev/null
+++ b/src/main/java/org/archive/io/WriterPoolMember.java
@@ -0,0 +1,487 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Properties;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.logging.Logger;
+import java.util.zip.GZIPOutputStream;
+
+import org.archive.util.ArchiveUtils;
+import org.archive.util.FileUtils;
+import org.archive.util.PropertyUtils;
+
+
+
+/**
+ * Member of {@link WriterPool}.
+ * Implements rotating off files, file naming with some guarantee of
+ * uniqueness, and position in file. Subclass to pick up functionality for a
+ * particular Writer type.
+ * @author stack
+ * @version $Date$ $Revision$
+ */
+public abstract class WriterPoolMember implements ArchiveFileConstants {
+ private final Logger logger = Logger.getLogger(this.getClass().getName());
+
+ public static final String UTF8 = "UTF-8";
+
+ /**
+ * Default archival-aggregate filename template.
+ *
+ * Under usual assumptions -- hostnames aren't shared among crawling hosts;
+ * processes have unique PIDs and admin ports; timestamps inside one process
+ * don't repeat (see UniqueTimestampService); clocks are generally
+ * accurate -- will generate a unique name.
+ *
+ * Stands for Internet Archive Heritrix.
+ */
+ public static final String DEFAULT_TEMPLATE =
+ "${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}";
+
+ /**
+ * Default for file prefix.
+ */
+ public static final String DEFAULT_PREFIX = "WEB";
+
+ /**
+ * Reference to file we're currently writing.
+ */
+ protected File f = null;
+
+ /** Output stream for file. */
+ protected OutputStream out = null;
+ /** Counting stream for metering */
+ protected MiserOutputStream countOut = null;
+
+ /** reusable buffer for recycling scenarios */
+ protected byte[] rebuf;
+
+ protected WriterPoolSettings settings;
+ private final String extension;
+
+ /**
+ * Creation date for the current file.
+ * Set by {@link #createFile()}.
+ */
+ protected String currentTimestamp = "UNSET!!!";
+
+ protected String currentBasename;
+
+ /**
+ * A running sequence used making unique file names.
+ */
+ final private AtomicInteger serialNo;
+
+ /**
+ * Directories round-robin index.
+ */
+ protected static int roundRobinIndex = 0;
+
+ /**
+ * NumberFormat instance for formatting serial number.
+ *
+ * Pads serial number with zeros.
+ */
+ protected static NumberFormat serialNoFormatter = new DecimalFormat("00000");
+
+
+ /**
+ * Buffer to reuse writing streams.
+ */
+ protected final byte [] scratchbuffer = new byte[4 * 1024];
+
+
+ /**
+ * Constructor.
+ * Takes a stream. Use with caution: the supplied stream is written to
+ * directly rather than to a file opened by {@link #createFile()}.
+ *
+ * The stream is used as-is when it already is a MiserOutputStream,
+ * otherwise it is wrapped in one so that bytes written can be counted.
+ * The file extension is left null.
+ *
+ * @param serialNo used to create unique filename sequences
+ * @param out Where to write.
+ * @param file File the out is connected to.
+ * @param settings Settings to write with (compression, flushing, etc.).
+ * @throws IOException
+ */
+ protected WriterPoolMember(AtomicInteger serialNo,
+ final OutputStream out, final File file,
+ final WriterPoolSettings settings)
+ throws IOException {
+ this(serialNo, settings, null);
+ this.countOut = (out instanceof MiserOutputStream)
+ ? (MiserOutputStream)out
+ : new MiserOutputStream(out, settings.getFrequentFlushes());
+ this.out = this.countOut;
+ this.f = file;
+ }
+
+ /**
+ * Constructor.
+ *
+ * Only records its arguments; no file is opened until
+ * {@link #checkSize()} or {@link #preWriteRecordTasks()} runs.
+ *
+ * @param serialNo used to create unique filename sequences
+ * @param settings Settings supplying prefix, filename template, output
+ * directories, compression and maximum file size.
+ * @param extension Extension to give file.
+ */
+ public WriterPoolMember(AtomicInteger serialNo,
+ final WriterPoolSettings settings, final String extension) {
+ this.settings = settings;
+ this.extension = extension;
+ this.serialNo = serialNo;
+ }
+
+ /**
+ * Call this method just before/after any significant write.
+ *
+ * Call at the end of the writing of a record or just before we start
+ * writing a new record. Will close current file and open a new file
+ * if file size has passed the configured maximum size.
+ *
+ * <p>Creates and opens a file if none already open. One use of this method
+ * then is after construction, call this method to add the metadata, then
+ * call {@link #getPosition()} to find offset of first record.
+ *
+ * TODO: perhaps this should be called checkForNewOpen? because it also
+ * handles initial open, even when not rolling oversize
+ *
+ * @exception IOException
+ */
+ public void checkSize() throws IOException {
+ if (this.out == null || isOversize()) {
+ createFile();
+ }
+ }
+
+    /**
+     * Tell whether the bytes written so far exceed the configured maximum
+     * file size. A configured maximum of -1 disables the check.
+     *
+     * @return boolean true if file has reached target size and due to be closed
+     */
+    public boolean isOversize() {
+        if (settings.getMaxFileSizeBytes() == -1) {
+            return false;
+        }
+        return this.getPosition() > settings.getMaxFileSizeBytes();
+    }
+
+    /**
+     * Start a new file.
+     * Generates a fresh basename, picks the next output directory and opens
+     * a file there, closing whatever file was open before. The name carries
+     * the compressed-file extension when compression is on and always ends
+     * in the "occupied" suffix while open. Usually called from
+     * {@link #checkSize()}.
+     *
+     * @return Name of file created.
+     * @throws IOException
+     */
+    protected String createFile() throws IOException {
+        generateNewBasename();
+        String compressionSuffix =
+            settings.getCompress() ? DOT_COMPRESSED_FILE_EXTENSION : "";
+        String name = currentBasename + '.' + this.extension
+            + compressionSuffix + OCCUPIED_SUFFIX;
+        File dir = getNextDirectory(settings.calcOutputDirs());
+        File target = new File(dir, name);
+        return createFile(target);
+    }
+
+    /**
+     * Close any currently open file and open {@code file} for writing.
+     * Output goes through a buffered stream (the buffer is allocated once
+     * and reused across files) wrapped in a byte-counting stream.
+     *
+     * @param file file to open
+     * @return Name (without directory) of the file opened.
+     * @throws IOException if the previous file cannot be closed or the new
+     * one cannot be opened
+     */
+    protected String createFile(final File file) throws IOException {
+        // rotate off whatever was open before switching to the new file
+        close();
+        this.f = file;
+        final FileOutputStream rawOut = new FileOutputStream(this.f);
+        if (rebuf == null) {
+            rebuf = new byte[settings.getWriteBufferSize()];
+        }
+        final RecyclingFastBufferedOutputStream buffered =
+            new RecyclingFastBufferedOutputStream(rawOut, rebuf);
+        this.countOut =
+            new MiserOutputStream(buffered, settings.getFrequentFlushes());
+        this.out = this.countOut;
+        logger.fine("Opened " + this.f.getAbsolutePath());
+        return this.f.getName();
+    }
+
+ /**
+ * Pick the directory the next file should be written to.
+ *
+ * Tries the directory at the shared (static) round-robin index first. If
+ * that one is unusable and there is more than one candidate, falls back
+ * to the first usable directory in list order.
+ *
+ * @param dirs List of File objects that point at directories.
+ * @return Next directory to write a file to. If more
+ * than one, it tries to round-robin through each in turn.
+ * @throws IOException if no directory in the list is usable
+ */
+ protected File getNextDirectory(List dirs)
+ throws IOException {
+ // wrap the shared index around once it runs off the end of the list
+ if (WriterPoolMember.roundRobinIndex >= dirs.size()) {
+ WriterPoolMember.roundRobinIndex = 0;
+ }
+ File d = null;
+ try {
+ d = checkWriteable((File)dirs.
+ get(WriterPoolMember.roundRobinIndex));
+ } catch (IndexOutOfBoundsException e) {
+ // Dirs list might be altered underneath us.
+ // If so, we get this exception -- just keep on going.
+ }
+ if (d == null && dirs.size() > 1) {
+ // preferred directory unusable: take first usable one; the
+ // round-robin index is left unchanged in this case
+ for (Iterator i = dirs.iterator(); d == null && i.hasNext();) {
+ d = checkWriteable((File)i.next());
+ }
+ } else {
+ // advance for the next caller (also runs when the sole/only
+ // candidate was unusable, just before the exception below)
+ WriterPoolMember.roundRobinIndex++;
+ }
+ if (d == null) {
+ throw new IOException("Directories unusable.");
+ }
+ return d;
+ }
+
+    /**
+     * Verify that a directory can be written to (delegating to
+     * {@code FileUtils.ensureWriteableDirectory}).
+     *
+     * @param d directory to check; may be null
+     * @return {@code d} if usable, or null if {@code d} was null or the
+     * check failed (a warning is logged on failure)
+     */
+    protected File checkWriteable(File d) {
+        if (d != null) {
+            try {
+                FileUtils.ensureWriteableDirectory(d);
+            } catch (IOException e) {
+                logger.warning("Directory " + d.getPath() + " is not" +
+                    " writeable or cannot be created: " + e.getMessage());
+                return null;
+            }
+        }
+        return d;
+    }
+
+    /**
+     * Generate a new basename by interpolating values in the configured
+     * template. Values come from local state, other configured values, and
+     * global system properties. The recommended default template will
+     * generate a unique basename under reasonable assumptions.
+     *
+     * Sets {@link #currentBasename} and {@link #currentTimestamp}.
+     */
+    protected void generateNewBasename() {
+        Properties localProps = new Properties();
+        localProps.setProperty("prefix", settings.getPrefix());
+        // Lock on WriterPoolMember.class, not getClass(): the formatter is a
+        // static shared by every subclass, so a per-subclass lock would not
+        // serialize access to it.
+        synchronized (WriterPoolMember.class) {
+            // ensure that serialNo and timestamp are minted together (never inverted sort order)
+            String paddedSerialNumber = WriterPoolMember.serialNoFormatter.format(serialNo.getAndIncrement());
+            String timestamp17 = ArchiveUtils.getUnique17DigitDate();
+            String timestamp14 = ArchiveUtils.getUnique14DigitDate();
+            currentTimestamp = timestamp17;
+            localProps.setProperty("serialno", paddedSerialNumber);
+            localProps.setProperty("timestamp17", timestamp17);
+            localProps.setProperty("timestamp14", timestamp14);
+        }
+        currentBasename = PropertyUtils.interpolateWithProperties(settings.getTemplate(),
+                localProps, System.getProperties());
+    }
+
+
+    /**
+     * Get the file name
+     *
+     * When compression is on, a trailing compressed-file extension (or that
+     * extension followed by the "occupied" suffix) is removed. The length
+     * removed is taken from the constants themselves. Without compression
+     * the name is returned unchanged.
+     *
+     * @return the filename, as if uncompressed
+     */
+    protected String getBaseFilename() {
+        String name = this.f.getName();
+        if (!settings.getCompress()) {
+            return name;
+        }
+        if (name.endsWith(DOT_COMPRESSED_FILE_EXTENSION)) {
+            return name.substring(0,
+                name.length() - DOT_COMPRESSED_FILE_EXTENSION.length());
+        }
+        String occupiedCompressed =
+            DOT_COMPRESSED_FILE_EXTENSION + OCCUPIED_SUFFIX;
+        if (name.endsWith(occupiedCompressed)) {
+            return name.substring(0,
+                name.length() - occupiedCompressed.length());
+        }
+        return name;
+    }
+
+    /**
+     * Accessor for the file currently (or most recently) written.
+     *
+     * Used by junit test to test for creation and when {@link WriterPool} wants
+     * to invalidate a file.
+     *
+     * @return The current file; null if none has been set.
+     */
+    public File getFile() {
+        return f;
+    }
+
+ /**
+ * Pre write tasks.
+ *
+ * Has side effects. Will open a new file if none is open yet (size-based
+ * rollover is done by {@link #checkSize()}, not here).
+ * If we're writing compressed files, it will wrap output stream with a
+ * GZIP writer with side effect that GZIP header is written out on the
+ * stream. {@link #postWriteRecordTasks()} undoes the wrapping.
+ *
+ * @exception IOException
+ */
+ protected void preWriteRecordTasks()
+ throws IOException {
+ if (this.out == null) {
+ createFile();
+ }
+ if (settings.getCompress()) {
+ // Wrap stream in GZIP Writer.
+ // The below construction immediately writes the GZIP 'default'
+ // header out on the underlying stream.
+ this.out = new CompressedStream(this.out);
+ }
+ }
+
+    /**
+     * Post record write tasks.
+     * When compressing, completes the current GZIP member, flushes it,
+     * releases the deflater, and restores the stream that was wrapped by
+     * {@link #preWriteRecordTasks()} so any subsequent checks get good
+     * reading. Does nothing when compression is off.
+     *
+     * @exception IOException
+     */
+    protected void postWriteRecordTasks()
+    throws IOException {
+        if (!settings.getCompress()) {
+            return;
+        }
+        CompressedStream gz = (CompressedStream) this.out;
+        gz.finish();
+        gz.flush();
+        gz.end();
+        this.out = gz.getWrappedStream();
+    }
+
+    /**
+     * Position in raw output (typically, physical file), as reported by the
+     * counting stream. Used making accounting of bytes written.
+     *
+     * @return Position in final media (assuming all flushing completes);
+     * 0 when no counting stream exists yet
+     */
+    public long getPosition() {
+        if (countOut == null) {
+            return 0L;
+        }
+        return this.countOut.getCount();
+    }
+
+ /** @return true if the settings ask for compressed output */
+ public boolean isCompressed() {
+ return settings.getCompress();
+ }
+
+ /**
+ * Write all of {@code b} to the current output stream.
+ * A stream must already be open.
+ */
+ protected void write(final byte [] b) throws IOException {
+ this.out.write(b);
+ }
+
+ /** Flush the current output stream. A stream must already be open. */
+ protected void flush() throws IOException {
+ this.out.flush();
+ }
+
+ /**
+ * Write {@code len} bytes of {@code b}, starting at {@code off}, to the
+ * current output stream. A stream must already be open.
+ */
+ protected void write(byte[] b, int off, int len) throws IOException {
+ this.out.write(b, off, len);
+ }
+
+ /** Write a single byte to the current output stream. */
+ protected void write(int b) throws IOException {
+ this.out.write(b);
+ }
+
+ /**
+ * Copy bytes from the provided InputStream to the target file/stream being
+ * written.
+ *
+ * At most {@code recordLength} bytes are written. Reading stops once at
+ * least {@code recordLength} bytes have been read or the stream ends; the
+ * last read may pull more bytes from the stream than are written.
+ *
+ * @return number of bytes read from {@code is} (equal to
+ * {@code recordLength} whenever {@code enforceLength} is true and no
+ * exception is thrown)
+ * @param is
+ * InputStream to copy bytes from
+ * @param recordLength
+ * expected number of bytes to copy
+ * @param enforceLength
+ * whether to throw an exception if the number of bytes read
+ * differs from {@code recordLength}
+ * @throws IOException on read/write failure, or on a length mismatch
+ * when {@code enforceLength} is set
+ */
+ protected long copyFrom(final InputStream is, final long recordLength,
+ boolean enforceLength) throws IOException {
+ int read = scratchbuffer.length;
+ long tot = 0;
+ while ((tot < recordLength)
+ && (read = is.read(scratchbuffer)) != -1) {
+ int write = read;
+ // never write more than enforced length
+ write = (int) Math.min(write, recordLength - tot);
+ // total counts bytes READ, so an over-long final read shows up
+ // as a mismatch below
+ tot += read;
+ write(scratchbuffer, 0, write);
+ }
+ if (enforceLength && tot != recordLength) {
+ // throw exception if desired for read vs. declared mismatches
+ throw new IOException("Read " + tot + " but expected "
+ + recordLength);
+ }
+
+ return tot;
+ }
+
+    /**
+     * Close the current output stream, if any, and finalize the file name.
+     *
+     * A file whose name ends in the "occupied" suffix is renamed to the name
+     * without that suffix, replacing any file already there. The file
+     * reference is switched to the final name only when the rename
+     * succeeded, so {@link #getFile()} keeps pointing at a file that exists.
+     * Calling this with no open stream is a no-op.
+     *
+     * @throws IOException if closing the stream fails
+     */
+    public void close() throws IOException {
+        if (this.out == null) {
+            return;
+        }
+        this.out.close();
+        this.out = null;
+        if (this.f == null || !this.f.exists()) {
+            return;
+        }
+        String path = this.f.getAbsolutePath();
+        if (path.endsWith(OCCUPIED_SUFFIX)) {
+            File finalFile = new File(path.substring(0,
+                    path.length() - OCCUPIED_SUFFIX.length()));
+            if (finalFile.exists() && !finalFile.delete()) {
+                logger.warning("Failed delete of " + finalFile);
+            }
+            if (this.f.renameTo(finalFile)) {
+                this.f = finalFile;
+            } else {
+                logger.warning("Failed rename of " + path);
+            }
+        }
+
+        logger.fine("Closed " + this.f.getAbsolutePath() +
+                ", size " + this.f.length());
+    }
+
+    /**
+     * @return stream currently written to; null when nothing is open
+     */
+    protected OutputStream getOutputStream() {
+        return out;
+    }
+
+    /**
+     * An override so we get access to underlying output stream.
+     * and offer an end() that does not accompany closing underlying
+     * stream.
+     *
+     * Declared static: it uses no state of the enclosing writer, so it need
+     * not carry a hidden reference to it.
+     * @author stack
+     */
+    private static class CompressedStream extends GZIPOutputStream {
+        public CompressedStream(OutputStream out)
+        throws IOException {
+            super(out);
+        }
+
+        /**
+         * @return Reference to stream being compressed.
+         */
+        OutputStream getWrappedStream() {
+            return this.out;
+        }
+
+        /**
+         * Release the deflater's native process resources,
+         * which otherwise would not occur until either
+         * finalization or DeflaterOutputStream.close()
+         * (which would also close underlying stream).
+         */
+        public void end() {
+            def.end();
+        }
+    }
+}
diff --git a/src/main/java/org/archive/io/WriterPoolSettings.java b/src/main/java/org/archive/io/WriterPoolSettings.java
new file mode 100644
index 00000000..d0805cdc
--- /dev/null
+++ b/src/main/java/org/archive/io/WriterPoolSettings.java
@@ -0,0 +1,39 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.File;
+import java.util.List;
+
+/**
+ * Settings object for a {@link WriterPool}.
+ * Used creating {@link WriterPoolMember}s.
+ * @author stack
+ * @version $Date$, $Revision$
+ */
+public interface WriterPoolSettings {
+ /** @return maximum size of a file in bytes; -1 means no limit */
+ public long getMaxFileSizeBytes();
+ /** @return prefix substituted for the {@code prefix} filename variable */
+ public String getPrefix();
+ /** @return filename template interpolated to produce each basename */
+ public String getTemplate();
+ /** @return directories (File objects) that files may be written to */
+ public List calcOutputDirs();
+ /** @return true if output is to be compressed */
+ public boolean getCompress();
+ /** @return metadata entries (presumably written at the head of each
+ * file by concrete writers -- confirm against implementations) */
+ public List getMetadata();
+ /** @return flag passed to the counting output stream of each writer */
+ public boolean getFrequentFlushes();
+ /** @return size in bytes of each writer's reusable output buffer */
+ public int getWriteBufferSize();
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/arc/ARC2WCDX.java b/src/main/java/org/archive/io/arc/ARC2WCDX.java
new file mode 100644
index 00000000..19010131
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARC2WCDX.java
@@ -0,0 +1,243 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io.arc;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.zip.GZIPOutputStream;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HeaderGroup;
+import org.apache.commons.httpclient.util.DateParseException;
+import org.apache.commons.httpclient.util.DateUtil;
+import org.archive.io.ArchiveRecord;
+import org.archive.util.ArchiveUtils;
+import org.archive.util.SURT;
+
+/**
+ * Create a 'Wide' CDX from an ARC. Takes one argument, the path to the ARC.
+ * Writes .wcdx.gz in same directory.
+ *
+ * @author gojomo
+ */
+public class ARC2WCDX {
+ final public static String WCDX_VERSION="0.1";
+
+    /**
+     * Command-line entry point.
+     *
+     * @param args first element is the path to the ARC to index; any
+     * further elements are ignored
+     * @throws IOException if the ARC cannot be opened or closed
+     */
+    public static void main(String[] args) throws IOException {
+        if (args == null || args.length < 1) {
+            // fail with a usage hint rather than an index exception
+            System.err.println("Usage: java " + ARC2WCDX.class.getName()
+                + " <path-to-arc>");
+            System.exit(1);
+            return;
+        }
+        String arcFilename = args[0];
+        createWcdx(arcFilename);
+    }
+
+    /**
+     * Open the named ARC, write its wide CDX, and close the ARC again.
+     * The reader is closed even if CDX generation throws.
+     *
+     * @param arcFilename path to the ARC
+     * @return two-element array: path of the WCDX file and the number of
+     * records written
+     * @throws IOException if the ARC cannot be opened or closed
+     */
+    public static Object[] createWcdx(String arcFilename) throws IOException {
+        ARCReader reader = ARCReaderFactory.get(arcFilename);
+        try {
+            return createWcdx(reader);
+        } finally {
+            reader.close();
+        }
+    }
+
+ /**
+ * Write a gzip-compressed 'wide' CDX for every record of the given reader.
+ *
+ * The output path is the reader identifier with a trailing
+ * {@code .arc}/{@code .arc.gz} replaced by {@code .wcdx.gz}. Lines are
+ * written to that path plus {@code .open}, and the file is renamed to the
+ * final path once all records were processed. The first line is a legend
+ * naming the space-delimited fields; each following line describes one
+ * record.
+ *
+ * An IOException or RuntimeException during processing is swallowed; the
+ * {@code .open} file is left behind (created if necessary) as the only
+ * indicator of the error.
+ *
+ * NOTE(review): the rename happens before the writer is closed in the
+ * finally block, and the result of renameTo() is not checked -- confirm
+ * this works on all target platforms.
+ *
+ * @param reader reader over the ARC to index; digesting is switched on
+ * @return two-element array: path of the final WCDX file and the number
+ * of record lines written
+ */
+ public static Object[] createWcdx(ARCReader reader) {
+ reader.setDigest(true);
+
+ String wcdxPath = reader.getReaderIdentifier().replaceAll("\\.arc(\\.gz)?$",".wcdx.gz");
+ File wcdxFile = new File(wcdxPath+".open");
+ PrintStream writer = null;
+ long count = 0;
+ try {
+ writer = new PrintStream(new GZIPOutputStream(new FileOutputStream(wcdxFile)));
+
+ // write header: legend + timestamp
+ StringBuilder legend = new StringBuilder();
+ appendField(legend,"CDX");
+ appendField(legend,"surt-uri");
+ appendField(legend,"b"); // ARC timestamp
+ appendField(legend,"http-date");
+ appendField(legend,"s"); // status code
+ appendField(legend,"m"); // media type
+ appendField(legend,"sha1"); // content sha1
+ appendField(legend,"g"); // ARC name
+ appendField(legend,"V"); // start offset
+ appendField(legend,"end-offset"); // TODO: implement
+ appendField(legend,"n"); // ARC record length TODO: verify
+ appendField(legend,"http-content-length");
+ appendField(legend,"http-last-modified");
+ appendField(legend,"http-expires");
+ appendField(legend,"http-etag");
+ appendField(legend,"http-location");
+ appendField(legend,"e"); // IP
+ appendField(legend,"a"); // original URL
+ // WCDX version+creation time: crude version control
+ appendField(legend,WCDX_VERSION+"@"+ArchiveUtils.get14DigitDate());
+ writer.println(legend.toString());
+
+ Iterator iter = reader.iterator();
+ count = 0;
+ while(iter.hasNext()) {
+ ARCRecord record = (ARCRecord) iter.next();
+ // closed before its fields are read -- presumably so the record
+ // is consumed to the end and its digest is available; confirm
+ record.close();
+ ARCRecordMetaData h = (ARCRecordMetaData) record.getHeader();
+ Header[] httpHeaders = record.getHttpHeaders();
+ if(httpHeaders==null) {
+ httpHeaders = new Header[0];
+ }
+ HeaderGroup hg = new HeaderGroup();
+ hg.setHeaders(httpHeaders);
+ StringBuilder builder = new StringBuilder();
+
+ // SURT-form URI
+ appendField(builder,SURT.fromURI(h.getUrl()));
+ // record timestamp ('b')
+ appendField(builder,h.getDate());
+ // http header date
+ appendTimeField(builder,hg.getFirstHeader("Date"));
+ // response code ('s')
+ appendField(builder,h.getStatusCode());
+ // media type ('m')
+ appendField(builder,h.getMimetype());
+ // content checksum (like 'c', but here Base32 SHA1)
+ appendField(builder,record.getDigestStr());
+ // arc name ('g')
+ appendField(builder,reader.getFileName());
+ // compressed start offset ('V')
+ appendField(builder,h.getOffset());
+
+ // compressed end offset (?)
+// appendField(builder,
+// reader.getInputStream() instanceof RepositionableStream
+// ? ((GzippedInputStream)reader.getInputStream()).vPosition()
+// : "-");
+ // TODO; leave unavail for now
+ appendField(builder, "-");
+
+ // uncompressed (declared in ARC headerline) record length
+ appendField(builder,h.getLength());
+ // http header content-length
+ appendField(builder,hg.getFirstHeader("Content-Length"));
+
+ // http header mod-date
+ appendTimeField(builder,hg.getFirstHeader("Last-Modified"));
+ // http header expires
+ appendTimeField(builder,hg.getFirstHeader("Expires"));
+
+ // http header etag
+ appendField(builder,hg.getFirstHeader("ETag"));
+ // http header redirect ('Location' header?)
+ appendField(builder,hg.getFirstHeader("Location"));
+ // ip ('e')
+ appendField(builder,h.getIp());
+ // original URI
+ appendField(builder,h.getUrl());
+ // TODO MAYBE - a title from inside content?
+
+ writer.println(builder.toString());
+ count++;
+ }
+ // all records written: drop the '.open' marker from the name
+ wcdxFile.renameTo(new File(wcdxPath));
+ } catch (IOException e) {
+ // soldier on: but leave '.open' wcdx file as indicator of error
+ if(!wcdxFile.exists()) {
+ try {
+ wcdxFile.createNewFile();
+ } catch (IOException e1) {
+ // TODO Auto-generated catch block
+ throw new RuntimeException(e1);
+ }
+ }
+ } catch (RuntimeException e) {
+ // soldier on: but leave '.open' wcdx file as indicator of error
+ if(!wcdxFile.exists()) {
+ try {
+ wcdxFile.createNewFile();
+ } catch (IOException e1) {
+ // TODO Auto-generated catch block
+ throw new RuntimeException(e1);
+ }
+ }
+ } finally {
+ if(writer!=null) {
+ writer.close();
+ }
+ }
+
+ // the final path is reported even when processing failed part-way
+ return new Object[] {wcdxPath, count};
+ }
+
+    /**
+     * Append one space-delimited field to a CDX line.
+     *
+     * A delimiter is written first unless the builder is still empty. For a
+     * {@code Header} the trimmed header value is used. A null object, a
+     * header without a value, or an empty string is written as {@code -}.
+     *
+     * @param builder line being assembled
+     * @param obj field value, a Header, or null
+     */
+    protected static void appendField(StringBuilder builder, Object obj) {
+        if (builder.length() > 0) {
+            // prepend with delimiter
+            builder.append(' ');
+        }
+        if (obj instanceof Header) {
+            // a header may carry no value at all; treat that as absent
+            String value = ((Header) obj).getValue();
+            obj = (value == null) ? null : value.trim();
+        }
+
+        builder.append((obj == null || obj.toString().length() == 0) ? "-" : obj);
+    }
+
+    /**
+     * Append one space-delimited time field to a CDX line.
+     *
+     * A delimiter is written first unless the builder is still empty. A null
+     * object, or a header without a value, is written as {@code -}. For a
+     * {@code Header} the value is parsed as an HTTP date and written as a
+     * 14-digit timestamp; an unparseable value is written as {@code e}. Any
+     * other object is appended as-is.
+     *
+     * @param builder line being assembled
+     * @param obj a Header holding an HTTP date, another value, or null
+     */
+    protected static void appendTimeField(StringBuilder builder, Object obj) {
+        if (builder.length() > 0) {
+            // prepend with delimiter
+            builder.append(' ');
+        }
+        if (obj == null) {
+            builder.append("-");
+            return;
+        }
+        if (obj instanceof Header) {
+            String raw = ((Header) obj).getValue();
+            if (raw == null) {
+                // header present but valueless: same as absent
+                builder.append("-");
+                return;
+            }
+            String s = raw.trim();
+            try {
+                Date date = DateUtil.parseDate(s);
+                String d = ArchiveUtils.get14DigitDate(date);
+                // 209x is rewritten to 199x -- presumably compensating for
+                // two-digit years parsed into the wrong century; confirm
+                if (d.startsWith("209")) {
+                    d = "199" + d.substring(3);
+                }
+                obj = d;
+            } catch (DateParseException e) {
+                builder.append('e');
+                return;
+            }
+        }
+        builder.append(obj);
+    }
+}
+
+//'wide' CDX
+//a original url
+//b timestamp
+//s resp code
+//m type
+//? content md5 (full 'k'? 'c'?
+//g arc name
+//V compressed start offset
+//? compressed length
+//n? uncompressed length
+//? mod date
+//? expires
+//? server 'date' hdr
+//? etag
+//r redirect ('Location'?)
+//e ip
+//MAYBE:
+//? TITLE from HTML or other format?
+
+
diff --git a/src/main/java/org/archive/io/arc/ARCConstants.java b/src/main/java/org/archive/io/arc/ARCConstants.java
new file mode 100644
index 00000000..c44cfef7
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCConstants.java
@@ -0,0 +1,29 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io.arc;
+
+
+/**
+ * Constants used by ARC files and in ARC file processing.
+ *
+ * @author stack
+ * @deprecated
+ */
+public interface ARCConstants extends org.archive.format.arc.ARCConstants {
+}
diff --git a/src/main/java/org/archive/io/arc/ARCLocation.java b/src/main/java/org/archive/io/arc/ARCLocation.java
new file mode 100644
index 00000000..c6c64437
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCLocation.java
@@ -0,0 +1,37 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io.arc;
+
+/**
+ * Datastructure to hold ARC record location.
+ * Used by wayback machine.
+ * @author stack
+ */
+public interface ARCLocation {
+ /**
+ * @return Returns the ARC filename. Can be full path to ARC, URL to an
+ * ARC or just the portion of an ARC name that is unique to a collection.
+ */
+ public String getName();
+
+ /**
+ * @return Returns the offset into the ARC.
+ */
+ public long getOffset();
+}
diff --git a/src/main/java/org/archive/io/arc/ARCReader.java b/src/main/java/org/archive/io/arc/ARCReader.java
new file mode 100644
index 00000000..7f85cc2a
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCReader.java
@@ -0,0 +1,553 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.arc;
+
+import java.io.ByteArrayOutputStream;
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.logging.Logger;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.cli.PosixParser;
+import org.archive.io.ArchiveReader;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.ArchiveRecordHeader;
+import org.archive.io.RecoverableIOException;
+import org.archive.io.WriterPoolMember;
+import org.archive.util.ArchiveUtils;
+
+
+/**
+ * Get an iterator on an ARC file or get a record by absolute position.
+ *
+ * ARC files are described here:
+ * Arc
+ * File Format.
+ *
+ *
This class knows how to parse an ARC file. Pass it a file path
+ * or an URL to an ARC. It can parse ARC Version 1 and 2.
+ *
+ *
Iterator returns ARCRecord
+ * though {@link Iterator#next()} is returning
+ * java.lang.Object. Cast the return.
+ *
+ *
Profiling java.io vs. memory-mapped ByteBufferInputStream shows the
+ * latter slightly slower -- but not by much. TODO: Test more. Just
+ * change {@link #getInputStream(File, long)}.
+ *
+ * @author stack
+ * @version $Date$ $Revision$
+ */
+public abstract class ARCReader extends ArchiveReader
+implements ARCConstants, Closeable {
+ private final Logger logger = Logger.getLogger(ARCReader.class.getName());
+
+ /**
+ * Set to true if we are aligned on first record of Archive file.
+ * We used to depend on offset. If offset was zero, then we were
+ * aligned on first record. This is no longer necessarily the case when
+ * Reader is created at an offset into an Archive file: The offset is zero
+ * but it's relative to where we started reading.
+ */
+ private boolean alignedOnFirstRecord = true;
+
+ private boolean parseHttpHeaders = true;
+
+ protected ARCReader() {
+ super();
+ }
+
+ /**
+ * Skip over any trailing new lines at end of the record so we're lined up
+ * ready to read the next.
+ * @param record Record just read; its header, if available, is added to the error message.
+ * @throws IOException If reading fails, or a byte other than LINE_SEPARATOR is read from a stream without mark support.
+ */
+ protected void gotoEOR(ArchiveRecord record) throws IOException {
+ if (getIn().available() <= 0) {
+ return;
+ }
+
+ // Remove any trailing LINE_SEPARATOR
+ int c = -1;
+ while (getIn().available() > 0) {
+ if (getIn().markSupported()) {
+ getIn().mark(1);
+ }
+ c = getIn().read();
+ if (c != -1) {
+ if (c == LINE_SEPARATOR) {
+ continue;
+ }
+ if (getIn().markSupported()) {
+ // We've overread. We're probably in next record. There is
+ // no way of telling for sure. It may be dross at end of
+ // current record. Backup.
+ getIn().reset();
+ break;
+ }
+ ArchiveRecordHeader h = (getCurrentRecord() != null)?
+ record.getHeader(): null;
+ throw new IOException("Read " + (char)c +
+ " when only " + LINE_SEPARATOR + " expected. " +
+ getReaderIdentifier() + ((h != null)?
+ h.getHeaderFields().toString(): ""));
+ }
+ }
+ }
+
+ /**
+ * Create new arc record.
+ *
+ * Encapsulate housekeeping that has to do w/ creating a new record.
+ *
+ *
Call this method at end of constructor to read in the
+ * arcfile header. Will be problems reading subsequent arc records
+ * if you don't since arcfile header has the list of metadata fields for
+ * all records that follow.
+ *
+ *
When parsing through ARCs writing out CDX info, we spend about
+ * 38% of CPU in here -- about 30% of which is in getTokenizedHeaderLine
+ * -- of which 16% is reading.
+ *
+ * @param is InputStream to use.
+ * @param offset Absolute offset into arc file.
+ * @return An arc record.
+ * @throws IOException
+ */
+ protected ARCRecord createArchiveRecord(InputStream is, long offset)
+ throws IOException {
+ try {
+ String version = super.getVersion();
+ ARCRecord record = new ARCRecord(is, getReaderIdentifier(), offset,
+ isDigest(), isStrict(), isParseHttpHeaders(),
+ isAlignedOnFirstRecord(), version);
+ if (version != null && super.getVersion() == null)
+ super.setVersion(version);
+ currentRecord(record);
+ } catch (IOException e) {
+ if (e instanceof RecoverableIOException) {
+ // Don't mess with RecoverableIOExceptions. Let them out.
+ throw e;
+ }
+ IOException newE = new IOException(e.getMessage() + " (Offset " +
+ offset + ").");
+ newE.setStackTrace(e.getStackTrace());
+ throw newE;
+ }
+ return (ARCRecord)getCurrentRecord();
+ }
+
+ /**
+ * Returns version of this ARC file. Usually read from first record of ARC.
+ * If we're reading without having first read the first record -- e.g.
+ * random access into middle of an ARC -- then version will not have been
+ * set. For now, we return a default, version 1.1. Later, if more than
+ * just one version of ARC, we could look at such as the meta line to see
+ * what version of ARC this is.
+ * @return Version of this ARC file.
+ */
+ public String getVersion() {
+ return (super.getVersion() == null)? "1.1": super.getVersion();
+ }
+
+ protected boolean isAlignedOnFirstRecord() {
+ return alignedOnFirstRecord;
+ }
+
+ protected void setAlignedOnFirstRecord(boolean alignedOnFirstRecord) {
+ this.alignedOnFirstRecord = alignedOnFirstRecord;
+ }
+
+ /**
+ * @return Returns the parseHttpHeaders.
+ */
+ public boolean isParseHttpHeaders() {
+ return this.parseHttpHeaders;
+ }
+
+ /**
+ * @param parse The parseHttpHeaders to set.
+ */
+ public void setParseHttpHeaders(boolean parse) {
+ this.parseHttpHeaders = parse;
+ }
+
+ public String getFileExtension() {
+ return ARC_FILE_EXTENSION;
+ }
+
+ public String getDotFileExtension() {
+ return DOT_ARC_FILE_EXTENSION;
+ }
+
+ protected boolean output(final String format)
+ throws IOException, java.text.ParseException {
+ boolean result = super.output(format);
+ if(!result && (format.equals(NOHEAD) || format.equals(HEADER))) {
+ throw new IOException(format +
+ " format only supported for single Records");
+ }
+ return result;
+ }
+
+ public boolean outputRecord(final String format) throws IOException {
+ boolean result = super.outputRecord(format);
+ if (result) {
+ return result;
+ }
+ if (format.equals(NOHEAD)) {
+ // No point digesting if dumping content.
+ setDigest(false);
+ ARCRecord r = (ARCRecord) get();
+ r.skipHttpHeader();
+ r.dump();
+ result = true;
+ } else if (format.equals(HEADER)) {
+ // No point digesting if dumping content.
+ setDigest(false);
+ ARCRecord r = (ARCRecord) get();
+ r.dumpHttpHeader();
+ result = true;
+ }
+
+ return result;
+ }
+
+ public void dump(final boolean compress)
+ throws IOException, java.text.ParseException {
+ // No point digesting if we're doing a dump.
+ setDigest(false);
+ boolean firstRecord = true;
+ ARCWriter writer = null;
+ for (Iterator ii = iterator(); ii.hasNext();) {
+ ARCRecord r = (ARCRecord)ii.next();
+ // We're to dump the arc on stdout.
+ // Get the first record's data if any.
+ ARCRecordMetaData meta = r.getMetaData();
+ if (firstRecord) {
+ firstRecord = false;
+ // Get an ARCWriter.
+ ByteArrayOutputStream baos =
+ new ByteArrayOutputStream(r.available());
+ // This is slow but done only once at top of ARC.
+ while (r.available() > 0) {
+ baos.write(r.read());
+ }
+ List listOfMetadata = new ArrayList();
+ listOfMetadata.add(baos.toString(WriterPoolMember.UTF8));
+ // Assume getArc returns full path to file. ARCWriter
+ // or new File will complain if it is otherwise.
+ List outDirs = new ArrayList();
+ WriterPoolSettingsData settings =
+ new WriterPoolSettingsData("","",-1L,compress,outDirs,listOfMetadata);
+ writer = new ARCWriter(new AtomicInteger(), System.out,
+ new File(meta.getArc()), settings);
+ continue;
+ }
+
+ writer.write(meta.getUrl(), meta.getMimetype(), meta.getIp(),
+ ArchiveUtils.parse14DigitDate(meta.getDate()).getTime(),
+ (int)meta.getLength(), r);
+ }
+ // System.out.println(System.currentTimeMillis() - start);
+ }
+
+ /**
+ * @return an ArchiveReader that will delete a local file on close. Used
+ * when we bring Archive files local and need to clean up afterward.
+ */
+ public ARCReader getDeleteFileOnCloseReader(final File f) {
+ final ARCReader d = this;
+ return new ARCReader() {
+ private final ARCReader delegate = d;
+ private File archiveFile = f;
+
+ public void close() throws IOException {
+ try {
+ this.delegate.close();
+ } finally { // always remove the local copy, even if closing the delegate threw
+ if (this.archiveFile != null && this.archiveFile.exists()) {
+ this.archiveFile.delete();
+ }
+ this.archiveFile = null;
+ }
+ }
+ public ArchiveRecord get(long o) throws IOException {
+ return this.delegate.get(o);
+ }
+
+ public boolean isDigest() {
+ return this.delegate.isDigest();
+ }
+
+ public boolean isStrict() {
+ return this.delegate.isStrict();
+ }
+
+ public Iterator iterator() {
+ return this.delegate.iterator();
+ }
+
+ public void setDigest(boolean d) {
+ this.delegate.setDigest(d);
+ }
+
+ public void setStrict(boolean s) {
+ this.delegate.setStrict(s);
+ }
+
+ public List validate() throws IOException {
+ return this.delegate.validate();
+ }
+
+ @Override
+ public ArchiveRecord get() throws IOException {
+ return this.delegate.get();
+ }
+
+ @Override
+ public String getVersion() {
+ return this.delegate.getVersion();
+ }
+
+ @Override
+ public List validate(int noRecords) throws IOException {
+ return this.delegate.validate(noRecords);
+ }
+
+ @Override
+ protected ARCRecord createArchiveRecord(InputStream is,
+ long offset)
+ throws IOException {
+ return this.delegate.createArchiveRecord(is, offset);
+ }
+
+ @Override
+ protected void gotoEOR(ArchiveRecord record) throws IOException {
+ this.delegate.gotoEOR(record);
+ }
+
+ @Override
+ public void dump(boolean compress)
+ throws IOException, java.text.ParseException {
+ this.delegate.dump(compress);
+ }
+
+ @Override
+ public String getDotFileExtension() {
+ return this.delegate.getDotFileExtension();
+ }
+
+ @Override
+ public String getFileExtension() {
+ return this.delegate.getFileExtension();
+ }
+ };
+ }
+
+ // Static methods follow.
+
+ /**
+ *
+ * @param formatter Help formatter instance.
+ * @param options Usage options.
+ * @param exitCode Exit code.
+ */
+ private static void usage(HelpFormatter formatter, Options options,
+ int exitCode) {
+ formatter.printHelp("java org.archive.io.arc.ARCReader" +
+ " [--digest=true|false] \\\n" +
+ " [--format=cdx|cdxfile|dump|gzipdump|header|nohead]" +
+ " [--offset=#] \\\n[--strict] [--parse] ARC_FILE|ARC_URL",
+ options);
+ System.exit(exitCode);
+ }
+
+ /**
+ * Write out the arcfile.
+ *
+ * @param reader Reader on the arcfile to write out.
+ * @param format Format to use outputting.
+ * @throws IOException If the reader does not support the format, or its output fails.
+ * @throws java.text.ParseException Passed through from the reader's output method.
+ */
+ protected static void output(ARCReader reader, String format)
+ throws IOException, java.text.ParseException {
+ if (!reader.output(format)) {
+ throw new IOException("Unsupported format: " + format);
+ }
+ }
+
+ /**
+ * Generate a CDX index file for an ARC file.
+ *
+ * @param urlOrPath The ARC file to generate a CDX index for
+ * @throws IOException
+ * @throws java.text.ParseException
+ */
+ public static void createCDXIndexFile(String urlOrPath)
+ throws IOException, java.text.ParseException {
+ ARCReader r = ARCReaderFactory.get(urlOrPath);
+ r.setStrict(false);
+ r.setParseHttpHeaders(true);
+ r.setDigest(true);
+ output(r, CDX_FILE);
+ }
+
+ /**
+ * Command-line interface to ARCReader.
+ *
+ * Here is the command-line interface:
+ *
+ * usage: java org.archive.io.arc.ARCReader [--offset=#] ARCFILE
+ * -h,--help Prints this message and exits.
+ * -o,--offset Outputs record at this offset into arc file.
+ *
+ *
See in $HERITRIX_HOME/bin/arcreader for a script that'll
+ * take care of classpaths and the calling of ARCReader.
+ *
+ *
Outputs using a pseudo-CDX format as described here:
+ * CDX
+ * Legend and here
+ * Example.
+ * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
+ * Hash is hard-coded straight SHA-1 hash of content.
+ *
+ * @param args Command-line arguments.
+ * @throws ParseException Failed parse of the command line.
+ * @throws IOException
+ * @throws java.text.ParseException
+ */
+ @SuppressWarnings("unchecked")
+ public static void main(String [] args)
+ throws ParseException, IOException, java.text.ParseException {
+ Options options = getOptions();
+ options.addOption(new Option("p","parse", false, "Parse headers."));
+ PosixParser parser = new PosixParser();
+ CommandLine cmdline = parser.parse(options, args, false);
+ List<String> cmdlineArgs = cmdline.getArgList();
+ Option [] cmdlineOptions = cmdline.getOptions();
+ HelpFormatter formatter = new HelpFormatter();
+
+ // If no args, print help.
+ if (cmdlineArgs.size() <= 0) {
+ usage(formatter, options, 0);
+ }
+
+ // Now look at options passed.
+ long offset = -1;
+ boolean digest = false;
+ boolean strict = false;
+ boolean parse = false;
+ String format = CDX;
+ for (int i = 0; i < cmdlineOptions.length; i++) {
+ switch(cmdlineOptions[i].getId()) {
+ case 'h':
+ usage(formatter, options, 0);
+ break;
+
+ case 'o':
+ offset =
+ Long.parseLong(cmdlineOptions[i].getValue());
+ break;
+
+ case 's':
+ strict = true;
+ break;
+
+ case 'p':
+ parse = true;
+ break;
+
+ case 'd':
+ digest = getTrueOrFalse(cmdlineOptions[i].getValue());
+ break;
+
+ case 'f':
+ format = cmdlineOptions[i].getValue().toLowerCase();
+ boolean match = false;
+ // List of supported formats.
+ final String [] supportedFormats =
+ {CDX, DUMP, GZIP_DUMP, HEADER, NOHEAD, CDX_FILE};
+ for (int ii = 0; ii < supportedFormats.length; ii++) {
+ if (supportedFormats[ii].equals(format)) {
+ match = true;
+ break;
+ }
+ }
+ if (!match) {
+ usage(formatter, options, 1);
+ }
+ break;
+
+ default:
+ throw new RuntimeException("Unexpected option: " +
+ (char)cmdlineOptions[i].getId());
+ }
+ }
+
+ if (offset >= 0) {
+ if (cmdlineArgs.size() != 1) {
+ System.out.println("Error: Pass one arcfile only.");
+ usage(formatter, options, 1);
+ }
+ ARCReader arc = ARCReaderFactory.get(cmdlineArgs.get(0),
+ offset);
+ arc.setStrict(strict);
+ // We must parse headers if we need to skip them.
+ if (format.equals(NOHEAD) || format.equals(HEADER)) {
+ parse = true;
+ }
+ arc.setParseHttpHeaders(parse);
+ outputRecord(arc, format);
+ } else {
+ for (String urlOrPath : cmdlineArgs) {
+ try {
+ ARCReader r = ARCReaderFactory.get(urlOrPath);
+ r.setStrict(strict);
+ r.setParseHttpHeaders(parse);
+ r.setDigest(digest);
+ output(r, format);
+ } catch (RuntimeException e) {
+ // Write out name of file we failed on to help with
+ // debugging, then print the stack trace and exit with a
+ // non-zero status. NOTE: when fed a bunch of ARCs this
+ // stops at the first bad one; ARCs listed after it are
+ // not processed.
+ System.err.println("Exception processing " + urlOrPath +
+ ": " + e.getMessage());
+ e.printStackTrace(System.err);
+ System.exit(1);
+ }
+ }
+ }
+ }
+}
diff --git a/src/main/java/org/archive/io/arc/ARCReaderFactory.java b/src/main/java/org/archive/io/arc/ARCReaderFactory.java
new file mode 100644
index 00000000..e7dc1625
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCReaderFactory.java
@@ -0,0 +1,454 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.arc;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Iterator;
+import java.util.logging.Level;
+
+import org.archive.io.ArchiveReader;
+import org.archive.io.ArchiveReaderFactory;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.ArchiveRecordHeader;
+import org.archive.util.FileUtils;
+import org.archive.util.zip.GZIPMembersInputStream;
+import org.archive.util.zip.GzipHeader;
+import org.archive.util.zip.NoGzipMagicException;
+
+import com.google.common.io.CountingInputStream;
+
+
+/**
+ * Factory that returns an ARCReader.
+ *
+ * Can handle compressed and uncompressed ARCs.
+ *
+ * @author stack
+ */
+public class ARCReaderFactory extends ArchiveReaderFactory
+implements ARCConstants {
+ /**
+ * This factory instance.
+ */
+ private static final ARCReaderFactory factory = new ARCReaderFactory();
+
+ /**
+ * Shutdown any access to default constructor.
+ */
+ protected ARCReaderFactory() {
+ super();
+ }
+
+ public static ARCReader get(String arcFileOrUrl)
+ throws MalformedURLException, IOException {
+ return (ARCReader)ARCReaderFactory.factory.
+ getArchiveReader(arcFileOrUrl);
+ }
+
+ public static ARCReader get(String arcFileOrUrl, final long offset)
+ throws MalformedURLException, IOException {
+ return (ARCReader)ARCReaderFactory.factory.
+ getArchiveReader(arcFileOrUrl, offset);
+ }
+
+ public static ARCReader get(final File f) throws IOException {
+ return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f);
+ }
+
+ public static ARCReader get(final File f, final long offset)
+ throws IOException {
+ return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f, offset);
+ }
+
+ protected ArchiveReader getArchiveReader(final File f, final long offset)
+ throws IOException {
+ return getArchiveReader(f, true, offset);
+ }
+
+ /**
+ * @param f An arcfile to read.
+ * @param skipSuffixTest Set to true to skip the test that a compressed
+ * ARC has the proper suffix, e.g. to open ARCs with the .open or
+ * otherwise suffix. Pass false to have the suffix tested.
+ * @param offset Have returned ARCReader set to start reading at passed
+ * offset.
+ * @return An ARCReader.
+ * @throws IOException If the file is not readable or is not an ARC file.
+ */
+ public static ARCReader get(final File f,
+ final boolean skipSuffixTest, final long offset)
+ throws IOException {
+ return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f,
+ skipSuffixTest, offset);
+ }
+
+ protected ArchiveReader getArchiveReader(final File arcFile,
+ final boolean skipSuffixTest, final long offset)
+ throws IOException {
+ boolean compressed = testCompressedARCFile(arcFile, skipSuffixTest);
+ if (!compressed) {
+ if (!FileUtils.isReadableWithExtensionAndMagic(arcFile,
+ ARC_FILE_EXTENSION, ARC_MAGIC_NUMBER)) {
+ throw new IOException(arcFile.getAbsolutePath() +
+ " is not an Internet Archive ARC file.");
+ }
+ }
+ return compressed?
+ (ARCReader)ARCReaderFactory.factory.
+ new CompressedARCReader(arcFile, offset):
+ (ARCReader)ARCReaderFactory.factory.
+ new UncompressedARCReader(arcFile, offset);
+ }
+
+ public static ArchiveReader get(final String s, final InputStream is,
+ final boolean atFirstRecord)
+ throws IOException {
+ return ARCReaderFactory.factory.getArchiveReader(s, is,
+ atFirstRecord);
+ }
+
+ protected ArchiveReader getArchiveReader(final String arc,
+ final InputStream is, final boolean atFirstRecord)
+ throws IOException {
+
+ // We do this mark() reset() stuff, wrapping in a BufferedInputStream if
+ // necessary to make it work, because testCompressedARCStream() consumes
+ // some bytes from the input stream
+ InputStream possiblyWrapped;
+ if (is.markSupported()) {
+ possiblyWrapped = is;
+ } else {
+ possiblyWrapped = new BufferedInputStream(is);
+ }
+
+ possiblyWrapped.mark(100);
+ boolean compressed = testCompressedARCStream(possiblyWrapped);
+ possiblyWrapped.reset();
+
+ if (compressed) {
+ return new CompressedARCReader(arc, possiblyWrapped, atFirstRecord);
+ } else {
+ return new UncompressedARCReader(arc, possiblyWrapped);
+ }
+ }
+
+ /**
+ * Get an ARCReader aligned at offset. This version of get
+ * will not bring the ARC local but will try to stream across the net making
+ * an HTTP 1.1 Range request on remote http server (RFC 2616 Section 14.35).
+ *
+ * @param arcUrl HTTP URL for an ARC (All ARCs considered remote).
+ * @param offset Offset into ARC at which to start fetching.
+ * @return An ARCReader aligned at offset.
+ * @throws IOException
+ */
+ public static ARCReader get(final URL arcUrl, final long offset)
+ throws IOException {
+ return (ARCReader)ARCReaderFactory.factory.getArchiveReader(arcUrl,
+ offset);
+ }
+
+ /**
+ * Get an ARCReader.
+ * Pulls the ARC local into wherever the System Property
+ * java.io.tmpdir points. It then hands back an ARCReader that
+ * points at this local copy. A close on this ARCReader instance will
+ * remove the local copy.
+ * @param arcUrl A URL that points at an ARC.
+ * @return An ARCReader.
+ * @throws IOException
+ */
+ public static ARCReader get(final URL arcUrl)
+ throws IOException {
+ return (ARCReader)ARCReaderFactory.factory.getArchiveReader(arcUrl);
+ }
+
+ /**
+ * @param arcFile File to test.
+ * @return True if arcFile is compressed ARC.
+ * @throws IOException
+ */
+ public boolean isCompressed(File arcFile) throws IOException {
+ return testCompressedARCFile(arcFile);
+ }
+
+ /**
+ * Check file is compressed and in ARC GZIP format.
+ *
+ * @param arcFile File to test for being a GZIP compressed Internet
+ * Archive ARC file.
+ *
+ * @return True if this is an Internet Archive GZIP'd ARC file (It begins
+ * w/ the Internet Archive GZIP header and has the
+ * COMPRESSED_ARC_FILE_EXTENSION suffix).
+ *
+ * @exception IOException If file does not exist or is not readable.
+ */
+ public static boolean testCompressedARCFile(File arcFile)
+ throws IOException {
+ return testCompressedARCFile(arcFile, false);
+ }
+
+ /**
+ * Check file is compressed and in ARC GZIP format.
+ *
+ * @param arcFile File to test for being a GZIP compressed Internet
+ * Archive ARC file.
+ * @param skipSuffixCheck Set to true if we're not to test on the
+ * '.arc.gz' suffix.
+ *
+ * @return True if this is an Internet Archive GZIP'd ARC file (It begins
+ * w/ the Internet Archive GZIP header).
+ *
+ * @exception IOException If file does not exist or is not readable.
+ */
+ public static boolean testCompressedARCFile(File arcFile,
+ boolean skipSuffixCheck)
+ throws IOException {
+ boolean compressedARCFile = false;
+ FileUtils.assertReadable(arcFile);
+ if(!skipSuffixCheck && !arcFile.getName().toLowerCase()
+ .endsWith(COMPRESSED_ARC_FILE_EXTENSION)) {
+ return compressedARCFile;
+ }
+
+ final InputStream is = new FileInputStream(arcFile);
+ try {
+ compressedARCFile = testCompressedARCStream(is);
+ } finally {
+ is.close();
+ }
+ return compressedARCFile;
+ }
+
+ public static boolean isARCSuffix(final String arcName) {
+ // True when the lower-cased name ends with either dotted ARC extension; a null name never matches.
+ if (arcName == null) {
+ return false;
+ }
+ final String lowered = arcName.toLowerCase();
+ return lowered.endsWith(DOT_COMPRESSED_ARC_FILE_EXTENSION) || lowered.endsWith(DOT_ARC_FILE_EXTENSION);
+ }
+
+ /**
+ * Tests passed stream is gzip stream by reading in the HEAD.
+ * Does not reposition the stream. That is left up to the caller.
+ * @param is An InputStream.
+ * @return True if compressed stream.
+ * @throws IOException
+ */
+ public static boolean testCompressedARCStream(final InputStream is)
+ throws IOException {
+ boolean compressedARCFile = false;
+ GzipHeader gh = null;
+ try {
+ gh = new GzipHeader(is);
+ } catch (NoGzipMagicException e) {
+ return false;
+ }
+
+ byte[] fextra = gh.getFextra();
+ // Now make sure following bytes are IA GZIP comment.
+ // First check length. ARC_GZIP_EXTRA_FIELD includes length
+ // so subtract two and start compare to ARC_GZIP_EXTRA_FIELD
+ // at +2.
+ // some Alexa ARC files gzip extra fields have changed slightly
+ // after the first two bytes, so we'll just look for the 'LX'
+ // extension for valid IA ARC files.
+ if (fextra != null) {
+ if (fextra.length >= ARC_GZIP_EXTRA_FIELD.length - 2) {
+ if (fextra[0] == ARC_GZIP_EXTRA_FIELD[2] &&
+ fextra[1] == ARC_GZIP_EXTRA_FIELD[3]) {
+ compressedARCFile = true;
+ }
+ }
+ } else {
+ // Some old arcs don't have an extra header at all, but they're still compressed
+ compressedARCFile = true;
+ }
+
+ return compressedARCFile;
+ }
+
+ /**
+ * Uncompressed arc file reader.
+ * @author stack
+ */
+ public class UncompressedARCReader extends ARCReader {
+ /**
+ * Constructor.
+ * @param f Uncompressed arcfile to read.
+ * @throws IOException
+ */
+ public UncompressedARCReader(final File f)
+ throws IOException {
+ this(f, 0);
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param f Uncompressed arcfile to read.
+ * @param offset Offset at which to position ARCReader.
+ * @throws IOException
+ */
+ public UncompressedARCReader(final File f, final long offset)
+ throws IOException {
+ // Arc file has been tested for existence by time it has come
+ // to here.
+ setIn(new CountingInputStream(getInputStream(f, offset)));
+ getIn().skip(offset);
+ initialize(f.getAbsolutePath());
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param f Uncompressed arc to read.
+ * @param is InputStream.
+ */
+ public UncompressedARCReader(final String f, final InputStream is) {
+ // Arc file has been tested for existence by time it has come
+ // to here.
+ setIn(new CountingInputStream(is));
+ initialize(f);
+ }
+ }
+
+ /**
+ * Compressed arc file reader.
+ *
+ * @author stack
+ */
+ public class CompressedARCReader extends ARCReader {
+
+ /**
+ * Constructor.
+ *
+ * @param f
+ * Compressed arcfile to read.
+ * @throws IOException
+ */
+ public CompressedARCReader(final File f) throws IOException {
+ this(f, 0);
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param f Compressed arcfile to read.
+ * @param offset Position at where to start reading file.
+ * @throws IOException
+ */
+ public CompressedARCReader(final File f, final long offset)
+ throws IOException {
+ // Arc file has been tested for existence by time it has come
+ // to here.
+ setIn(new GZIPMembersInputStream(getInputStream(f, offset)));
+ ((GZIPMembersInputStream)getIn()).compressedSeek(offset);
+ setCompressed((offset == 0)); // TODO: does this make sense???
+ initialize(f.getAbsolutePath());
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param f Compressed arcfile.
+ * @param is InputStream to use.
+ * @throws IOException
+ */
+ public CompressedARCReader(final String f, final InputStream is,
+ final boolean atFirstRecord)
+ throws IOException {
+ // Arc file has been tested for existence by time it has come
+ // to here.
+ setIn(new GZIPMembersInputStream(is));
+ setCompressed(true);
+ setAlignedOnFirstRecord(atFirstRecord);
+ initialize(f);
+ }
+
+ /**
+ * Get record at passed offset.
+ *
+ * @param offset
+ * Byte index into arcfile at which a record starts.
+ * @return An ARCRecord reference.
+ * @throws IOException
+ */
+ public ARCRecord get(long offset) throws IOException {
+ cleanupCurrentRecord();
+ ((GZIPMembersInputStream)getIn()).compressedSeek(offset);
+ return createArchiveRecord(getIn(), offset);
+ }
+
+ public Iterator iterator() {
+ /**
+ * Override ARCRecordIterator so can base returned iterator on
+ * GzippedInputStream iterator.
+ */
+ return new ArchiveRecordIterator() {
+ private GZIPMembersInputStream gis =
+ (GZIPMembersInputStream)getIn();
+
+ private Iterator gzipIterator = this.gis.memberIterator();
+
+ protected boolean innerHasNext() {
+ return this.gzipIterator.hasNext();
+ }
+
+ protected ArchiveRecord innerNext() throws IOException {
+ InputStream is = this.gzipIterator.next();
+ return createArchiveRecord(is, Math.max(gis.getCurrentMemberStart(), gis.getCurrentMemberEnd()));
+ }
+ };
+ }
+
+ protected void gotoEOR(ArchiveRecord rec) throws IOException {
+ int c;
+ while ((c = getIn().read())==LINE_SEPARATOR);
+ if(c==-1) {
+ return;
+ }
+ long skipped = 1;
+ while (getIn().read()>-1) {
+ skipped++;
+ }
+ // Report on system error the number of unexpected characters
+ // at the end of this record.
+ ArchiveRecordHeader meta = (getCurrentRecord() != null)?
+ rec.getHeader(): null;
+ String message = "Record STARTING at " +
+ ((GZIPMembersInputStream)getIn()).getCurrentMemberStart() +
+ " has " + skipped + " trailing byte(s): " +
+ ((meta != null)? meta.toString(): "");
+ if (isStrict()) {
+ throw new IOException(message);
+ }
+ logStdErr(Level.WARNING, message);
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java
new file mode 100644
index 00000000..21bea07c
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCRecord.java
@@ -0,0 +1,835 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.arc;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.StatusLine;
+import org.apache.commons.httpclient.util.EncodingUtil;
+import org.apache.commons.lang.StringUtils;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.ArchiveRecordHeader;
+import org.archive.io.RecoverableIOException;
+import org.archive.util.InetAddressUtil;
+import org.archive.util.LaxHttpParser;
+import org.archive.util.TextUtils;
+
+/**
+ * An ARC file record.
+ * Does not compass the ARCRecord metadata line, just the record content.
+ * @author stack
+ */
+public class ARCRecord extends ArchiveRecord implements ARCConstants {
+ /**
+ * Http status line object.
+ *
+ * May be null if record is not http.
+ */
+ private StatusLine httpStatus = null;
+
+ /**
+ * Http header bytes.
+ *
+ * If non-null and bytes available, give out its contents before we
+ * go back to the underlying stream.
+ */
+ private InputStream httpHeaderStream = null;
+
+ /**
+ * Http headers.
+ *
+ * Only populated after reading of headers.
+ */
+ private Header [] httpHeaders = null;
+
+ /**
+ * Array of field names.
+ *
+ * Used to initialize headerFieldNameKeys.
+ */
+ private final String [] headerFieldNameKeysArray = {
+ URL_FIELD_KEY,
+ IP_HEADER_FIELD_KEY,
+ DATE_FIELD_KEY,
+ MIMETYPE_FIELD_KEY,
+ LENGTH_FIELD_KEY
+ };
+
+ /**
+ * An array of the header field names found in the ARC file header on
+ * the 3rd line.
+ *
+ * We used to read these in from the arc file first record 3rd line but
+ * now we hardcode them for sake of improved performance.
+ */
+ private final List headerFieldNameKeys =
+ Arrays.asList(this.headerFieldNameKeysArray);
+
+ /**
+ * Http header bytes read while trying to read http header
+ */
+ public long httpHeaderBytesRead = -1;
+
+ /**
+ * record length from metadata line
+ */
+ public long recordDeclaredLength;
+
+ /**
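+ * compressed size of the record; being a primitive long it cannot be
+ * null (presumably left at 0 if source was not compressed -- TODO confirm)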
+ */
+ public long compressedBytes;
+
+ /**
+ * actual payload data (not including trailing newline),
+ * should match record-declared-length
+ */
+ public long uncompressedBytes;
+
+ /**
+ * content-length header value, iff HTTP and present; being a primitive
+ * long it cannot be null (presumably 0 otherwise -- TODO confirm)
+ */
+ public long httpPayloadDeclaredLength;
+
+ /**
+ * actual http payload length, should match http-payload-declared-length
+ */
+ public long httpPayloadActualLength;
+
+ /**
+ * errors encountered reading record
+ */
+ public List errors = new ArrayList();
+
+ /**
+ * verbatim ARC record header string
+ */
+ private String headerString;
+ public String getHeaderString() {
+ return this.headerString;
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param in Stream cue'd up to be at the start of the record this instance
+ * is to represent.
+ * @param metaData Meta data.
+ * @throws IOException
+ */
+ public ARCRecord(InputStream in, ArchiveRecordHeader metaData)
+ throws IOException {
+ this(in, metaData, 0, true, false, true);
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param in Stream cue'd up to be at the start of the record this instance
+ * is to represent.
+ * @param metaData Meta data.
+ * @param bodyOffset Offset into the body. Usually 0.
+ * @param digest True if we're to calculate digest for this record. Not
+ * digesting saves about ~15% of cpu during an ARC parse.
+ * @param strict Be strict parsing (Parsing stops if ARC inproperly
+ * formatted).
+ * @param parseHttpHeaders True if we are to parse HTTP headers. Costs
+ * about ~20% of CPU during an ARC parse.
+ * @throws IOException
+ */
+ public ARCRecord(InputStream in, ArchiveRecordHeader metaData,
+ int bodyOffset, boolean digest, boolean strict,
+ final boolean parseHttpHeaders)
+ throws IOException {
+ super(in, metaData, bodyOffset, digest, strict);
+ if (parseHttpHeaders) {
+ this.httpHeaderStream = readHttpHeader();
+ }
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param in Stream cue'd up to be at the start of the records metadata
+ * this instance is to represent.
+ * @param identifier Identifier for this the hosting Reader.
+ * @param offset Current offset into in (Used to keep
+ * position properly aligned). Usually 0.
+ * @param digest True if we're to calculate digest for this record. Not
+ * digesting saves about ~15% of cpu during an ARC parse.
+ * @param strict Be strict parsing (Parsing stops if ARC inproperly
+ * formatted).
+ * @param parseHttpHeaders True if we are to parse HTTP headers. Costs
+ * about ~20% of CPU during an ARC parse.
+ * @param isAlignedOnFirstRecord True if this is the first record to be
+ * read from an archive
+ * @param version Version information to be returned to the
+ * ARCReader constructing this record
+ *
+ * @throws IOException
+ */
+ public ARCRecord(InputStream in, final String identifier,
+ final long offset, boolean digest, boolean strict,
+ final boolean parseHttpHeaders,
+ final boolean isAlignedOnFirstRecord, String version)
+ throws IOException {
+ super(in, null, 0, digest, strict);
+ setHeader(parseHeaders(in, identifier, offset, strict, isAlignedOnFirstRecord, version));
+ if (parseHttpHeaders) {
+ this.httpHeaderStream = readHttpHeader();
+ }
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param in Stream cue'd up to be at the start of the records metadata
+ * this instance is to represent.
+ * @param identifier Identifier for this the hosting Reader.
+ * @param offset Current offset into in (Used to keep
+ * position properly aligned). Usually 0.
+ * @param digest True if we're to calculate digest for this record. Not
+ * digesting saves about ~15% of cpu during an ARC parse.
+ * @param strict Be strict parsing (Parsing stops if ARC inproperly
+ * formatted).
+ * @param parseHttpHeaders True if we are to parse HTTP headers. Costs
+ * about ~20% of CPU during an ARC parse.
+ *
+ * @throws IOException
+ */
+ public ARCRecord(InputStream in, final String identifier,
+ final long offset, boolean digest, boolean strict,
+ final boolean parseHttpHeaders)
+ throws IOException {
+ this(in, identifier, offset, digest, strict, parseHttpHeaders,
+ false, null);
+ }
+
+ private ArchiveRecordHeader parseHeaders(final InputStream in,
+ final String identifier, final long offset, final boolean strict,
+ final boolean isAlignedOnFirstRecord, String version)
+ throws IOException {
+
+ ArrayList firstLineValues = new ArrayList(20);
+ getTokenizedHeaderLine(in, firstLineValues);
+
+ int bodyOffset = 0;
+ if (offset == 0 && isAlignedOnFirstRecord) {
+ // If offset is zero and we were aligned at first record on
+ // creation (See #alignedOnFirstRecord for more on this), then no
+ // records have been read yet and we're reading our first one, the
+ // record of ARC file meta info. Its special. In ARC versions
+ // 1.x, first record has three lines of meta info. We've just read
+ // the first line. There are two more. The second line has misc.
+ // info. We're only interested in the first field, the version
+ // number. The third line is the list of field names. Here's what
+ // ARC file version 1.x meta content looks like:
+ //
+ // filedesc://testIsBoundary-JunitIAH200401070157520.arc 0.0.0.0 \\
+ // 20040107015752 text/plain 77
+ // 1 0 InternetArchive
+ // URL IP-address Archive-date Content-type Archive-length
+ //
+ ArrayList secondLineValues = new ArrayList(20);
+ bodyOffset += getTokenizedHeaderLine(in, secondLineValues);
+ version = ((String)secondLineValues.get(0) +
+ "." + (String)secondLineValues.get(1));
+ // Just read over the 3rd line. We used to parse it and use
+ // values found here but now we just hardcode them to avoid
+ // having to read this 3rd line even for random arc file accesses.
+ bodyOffset += getTokenizedHeaderLine(in, null);
+ // this.position = bodyOffset;
+ }
+ setBodyOffset(bodyOffset);
+
+ return computeMetaData(this.headerFieldNameKeys, firstLineValues, version, offset, identifier);
+ }
+
+ /**
+  * Get a record header line as list of tokens.
+  *
+  * We keep reading till we find a LINE_SEPARATOR or we reach the end
+  * of file w/o finding a LINE_SEPARATOR or the line length is crazy.
+  *
+  * The first tokenized line read through a given record is remembered as
+  * that record's header string; later calls (the 2nd and 3rd lines of the
+  * ARC file's first record) leave it untouched.
+  *
+  * @param stream InputStream to read from.
+  * @param list Empty list that gets filled w/ string tokens, or null to
+  * read past the line without collecting tokens.
+  * @return Count of characters read.
+  * @exception IOException If problem reading stream or no line separator
+  * found or EOF before EOL or we didn't get minimum header fields.
+  */
+ private int getTokenizedHeaderLine(final InputStream stream,
+         List list) throws IOException {
+     // Preallocate usual line size.
+     StringBuilder buffer = new StringBuilder(2048 + 20);
+     int read = 0;
+     int previous = -1;
+     for (int c = -1; true;) {
+         previous = c;
+         c = stream.read();
+         if (c == -1) {
+             throw new RecoverableIOException("Hit EOF before header EOL.");
+         }
+         c &= 0xff;
+         read++;
+         if (read > MAX_HEADER_LINE_LENGTH) {
+             throw new IOException("Header line longer than max allowed " +
+                 " -- " + String.valueOf(MAX_HEADER_LINE_LENGTH) +
+                 " -- or passed buffer doesn't contain a line (Read: " +
+                 buffer.length() + "). Here's" +
+                 " some of what was read: " +
+                 buffer.substring(0, Math.min(buffer.length(), 256)));
+         }
+
+         if (c == LINE_SEPARATOR) {
+             if (buffer.length() == 0) {
+                 // Empty line at start of buffer. Skip it and try again.
+                 continue;
+             }
+
+             if (list != null) {
+                 list.add(buffer.toString());
+             }
+             // LOOP TERMINATION.
+             break;
+         } else if (c == HEADER_FIELD_SEPARATOR) {
+             if (!isStrict() && previous == HEADER_FIELD_SEPARATOR) {
+                 // Early ARCs sometimes had multiple spaces between fields.
+                 continue;
+             }
+             if (list != null) {
+                 list.add(buffer.toString());
+             }
+             // reset to empty
+             buffer.setLength(0);
+         } else {
+             buffer.append((char)c);
+         }
+     }
+
+     // List must have at least 3 elements in it and no more than 100. If
+     // it has other than this, then bogus parse.
+     if (list != null && (list.size() < 3 || list.size() > 100)) {
+         throw new IOException("Unparseable header line: " + list);
+     }
+
+     // Save the header string once only. Without this guard the 2nd line
+     // of the ARC file's first record would replace it, and the 3rd line
+     // (read with a null list) would reset it to null.
+     if (list != null && this.headerString == null) {
+         this.headerString = StringUtils.join(list," ");
+     }
+
+     return read;
+ }
+
+ /**
+  * Compute metadata fields.
+  *
+  * Here we check the meta field has right number of items in it. When not
+  * strict, a mismatch first triggers repair attempts: spaces in the URL,
+  * a space inside the mimetype before "charset=", or an empty mimetype.
+  * A successful repair is noted on stderr.
+  *
+  * @param keys Keys to use composing headerFields map.
+  * @param values Values to set into the headerFields map.
+  * @param v The version of this ARC file.
+  * @param offset Offset into arc file.
+  * @param identifier Identifier passed through to the returned
+  * ARCRecordMetaData.
+  *
+  * @return Metadata structure for this record.
+  *
+  * @exception IOException If no. of keys doesn't match no. of values
+  * (after any repair attempt).
+  */
+ private ARCRecordMetaData computeMetaData(List keys,
+         List values, String v, long offset, final String identifier)
+         throws IOException {
+     if (keys.size() != values.size()) {
+         List originalValues = values;
+         if (!isStrict()) {
+             values = fixSpaceInURL(values, keys.size());
+             // If values still doesn't match key size, try and do
+             // further repair.
+             if (keys.size() != values.size()) {
+                 // Early ARCs had a space in mimetype.
+                 if (values.size() == (keys.size() + 1) &&
+                         ((String) values.get(4)).toLowerCase()
+                             .startsWith("charset=")) {
+                     List nuvalues = new ArrayList(keys.size());
+                     nuvalues.add(0, values.get(0));
+                     nuvalues.add(1, values.get(1));
+                     nuvalues.add(2, values.get(2));
+                     // Rejoin the two halves of the split mimetype.
+                     nuvalues.add(3, (String) values.get(3) +
+                         (String) values.get(4));
+                     nuvalues.add(4, values.get(5));
+                     values = nuvalues;
+                 } else if ((values.size() + 1) == keys.size() &&
+                         isLegitimateIPValue((String) values.get(1)) &&
+                         isDate((String) values.get(2)) &&
+                         isNumber((String) values.get(3))) {
+                     // Mimetype is empty.
+                     List nuvalues = new ArrayList(keys.size());
+                     nuvalues.add(0, values.get(0));
+                     nuvalues.add(1, values.get(1));
+                     nuvalues.add(2, values.get(2));
+                     nuvalues.add(3, "-");
+                     nuvalues.add(4, values.get(3));
+                     values = nuvalues;
+                 }
+             }
+         }
+         if (keys.size() != values.size()) {
+             throw new IOException("Size of field name keys does" +
+                 " not match count of field values: " + values);
+         }
+         // Note that field was fixed on stderr. A separator follows the
+         // level name so the output does not read "WARNINGFixed ...".
+         System.err.println(Level.WARNING.toString() +
+             " Fixed spaces in metadata line at " +
+             "offset " + offset +
+             " Original: " + originalValues + ", New: " + values);
+     }
+
+     Map headerFields = new HashMap(keys.size() + 2);
+     for (int i = 0; i < keys.size(); i++) {
+         headerFields.put(keys.get(i), values.get(i));
+     }
+
+     // Add a check for tabs in URLs. If any, replace with '%09'.
+     // See https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966,
+     // [ 1010966 ] crawl.log has URIs with spaces in them.
+     String url = (String)headerFields.get(URL_FIELD_KEY);
+     if (url != null && url.indexOf('\t') >= 0) {
+         headerFields.put(URL_FIELD_KEY,
+             TextUtils.replaceAll("\t", url, "%09"));
+     }
+
+     headerFields.put(VERSION_FIELD_KEY, v);
+     // Long.valueOf rather than the deprecated Long constructor.
+     headerFields.put(ABSOLUTE_OFFSET_KEY, Long.valueOf(offset));
+
+     return new ARCRecordMetaData(identifier, headerFields);
+ }
+
+ /**
+ * Fix space in URLs.
+ * The ARCWriter used to write into the ARC URLs with spaces in them.
+ * See [ 1010966 ]
+ * crawl.log has URIs with spaces in them.
+ * This method does fix up on such headers converting all spaces found
+ * to '%20'.
+ * @param values List of metadata values.
+ * @param requiredSize Expected size of resultant values list.
+ * @return New list if we successfully fixed up values or original if
+ * fixup failed.
+ */
+ private List fixSpaceInURL(List values, int requiredSize) {
+ // Do validity check. 3rd from last is a date of 14 numeric
+ // characters. The 4th from last is IP, all before the IP
+ // should be concatenated together with a '%20' joiner.
+ // In the below, '4' is 4th field from end which has the IP.
+ if (!(values.size() > requiredSize) || values.size() < 4) {
+ return values;
+ }
+ // Test 3rd field is valid date.
+ if (!isDate((String) values.get(values.size() - 3))) {
+ return values;
+ }
+
+ // Test 4th field is valid IP.
+ if (!isLegitimateIPValue((String) values.get(values.size() - 4))) {
+ return values;
+ }
+
+ List newValues = new ArrayList(requiredSize);
+ StringBuffer url = new StringBuffer();
+ for (int i = 0; i < (values.size() - 4); i++) {
+ if (i > 0) {
+ url.append("%20");
+ }
+ url.append(values.get(i));
+ }
+ newValues.add(url.toString());
+ for (int i = values.size() - 4; i < values.size(); i++) {
+ newValues.add(values.get(i));
+ }
+ return newValues;
+ }
+
+ /**
+  * @param date Candidate date string.
+  * @return True if passed string is exactly 14 characters, all digits.
+  */
+ private boolean isDate(final String date) {
+     return date.length() == 14 && isNumber(date);
+ }
+
+ /**
+  * @param n String to test.
+  * @return True if passed string is non-empty and made up of digits only.
+  */
+ private boolean isNumber(final String n) {
+     // An empty token is not a number; without this check the loop below
+     // never runs and the method would answer true.
+     if (n.length() == 0) {
+         return false;
+     }
+     for (int i = 0; i < n.length(); i++) {
+         if (!Character.isDigit(n.charAt(i))) {
+             return false;
+         }
+     }
+     return true;
+ }
+
+ /**
+  * @param ip Candidate value from the IP field.
+  * @return True if value is the "-" placeholder or matches
+  * InetAddressUtil.IPV4_QUADS.
+  */
+ private boolean isLegitimateIPValue(final String ip) {
+     // "-" is accepted in place of an address.
+     return "-".equals(ip)
+         || InetAddressUtil.IPV4_QUADS.matcher(ip).matches();
+ }
+
+ /**
+ * Skip over the http header if one is present.
+ *
+ * Subsequent reads will get the body.
+ *
+ * <p>Calling this method in the midst of reading the header
+ * will make for strange results. Otherwise, it is safe to call
+ * at any time, though before reading any of the arc record
+ * content is the only time that it makes sense.
+ *
+ * <p>After calling this method, you can call
+ * {@link #getHttpHeaders()} to get the read http header.
+ *
+ * @throws IOException
+ */
+ public void skipHttpHeader() throws IOException {
+     if (this.httpHeaderStream == null) {
+         // No buffered header to skip.
+         return;
+     }
+     // Drain the buffered header through this record's own read so that
+     // position bookkeeping is applied. read() nulls out httpHeaderStream
+     // once it is exhausted, hence the null test in the loop condition.
+     for (int remaining = this.httpHeaderStream.available();
+             this.httpHeaderStream != null &&
+             (remaining = this.httpHeaderStream.available()) > 0;) {
+         // Expected to run once only: one buffer, one read.
+         byte [] scratch = new byte[remaining];
+         read(scratch, 0, remaining);
+     }
+ }
+
+ /**
+  * Write the buffered http header, if any remains unread, to STDOUT.
+  *
+  * The bytes are pulled through this record's own read method, so they
+  * are consumed from the record as they are written.
+  *
+  * @throws IOException
+  */
+ public void dumpHttpHeader() throws IOException {
+ if (this.httpHeaderStream == null) {
+ return;
+ }
+ // Dump the httpHeaderStream to STDOUT
+ for (int available = this.httpHeaderStream.available();
+ this.httpHeaderStream != null
+ && (available = this.httpHeaderStream.available()) > 0;) {
+ // We should be in this loop only once and should do this
+ // buffer allocation once.
+ byte[] buffer = new byte[available];
+ // The read nulls out httpHeaderStream when done with it so
+ // need check for null in the loop control line.
+ int read = read(buffer, 0, available);
+ System.out.write(buffer, 0, read);
+ }
+ }
+
+ /**
+ * Read http header if present. Technique borrowed from HttpClient HttpParse
+ * class. set errors when found.
+ *
+ * @return ByteArrayInputStream with the http header in it or null if no
+ * http header.
+ * @throws IOException
+ */
+ private InputStream readHttpHeader() throws IOException {
+
+ // this can be helpful when simply iterating over records,
+ // looking for problems.
+ Logger logger = Logger.getLogger(this.getClass().getName());
+ ArchiveRecordHeader h = this.getHeader();
+
+ // If judged a record that doesn't have an http header, return
+ // immediately.
+ // (Judgement is: url starts with "http" and declared length exceeds
+ // MIN_HTTP_HEADER_LENGTH.)
+ String url = getHeader().getUrl();
+ if(!url.startsWith("http") ||
+ getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
+ return null;
+ }
+
+ String statusLine;
+ byte[] statusBytes;
+ int eolCharCount = 0;
+ // Running byte count of the lines skipped ahead of the status line.
+ int errOffset = 0;
+
+ // Read status line, skipping any errant http headers found before it
+ // This allows a larger number of 'corrupt' arcs -- where headers were accidentally
+ // inserted before the status line to be readable
+ while (true) {
+ statusBytes = LaxHttpParser.readRawLine(getIn());
+ eolCharCount = getEolCharsCount(statusBytes);
+ // A line without a trailing '\n' (or a null line) ends the search.
+ if (eolCharCount <= 0) {
+ throw new RecoverableIOException(
+ "Failed to read http status where one was expected: "
+ + ((statusBytes == null) ? "" : new String(statusBytes)));
+ }
+
+ // Decode the line minus its end-of-line characters.
+ statusLine = EncodingUtil.getString(statusBytes, 0,
+ statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
+
+ // If a null or DELETED break immediately
+ if ((statusLine == null) || statusLine.startsWith("DELETED")) {
+ break;
+ }
+
+ // If it's actually the status line, break, otherwise continue skipping any
+ // previous header values
+ if (!statusLine.contains(":") && StatusLine.startsWithHTTP(statusLine)) {
+ break;
+ }
+
+ // Add bytes read to error "offset" to add to position
+ errOffset += statusBytes.length;
+ }
+
+ // Skipped lines are not kept; only the record position accounts for them.
+ if (errOffset > 0) {
+ this.incrementPosition(errOffset);
+ }
+
+ // NOTE(review): the loop above only exits by break on a null line, a
+ // DELETED line, or a line accepted by StatusLine.startsWithHTTP, so the
+ // else branch below looks unreachable; and were statusLine ever null the
+ // startsWith("DELETED") call below would throw NullPointerException --
+ // confirm intent.
+ if ((statusLine == null) ||
+ !StatusLine.startsWithHTTP(statusLine)) {
+ if (statusLine.startsWith("DELETED")) {
+ // Some old ARCs have deleted records like following:
+ // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202
+ // DELETED_TIME=20000425001133_DELETER=Kurt_REASON=alexalist
+ // (follows ~29K spaces)
+ // For now, throw a RecoverableIOException so if iterating over
+ // records, we keep going. TODO: Later make a legitimate
+ // ARCRecord from the deleted record rather than throw
+ // exception.
+ throw new DeletedARCRecordIOException(statusLine);
+ } else {
+ this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_INVALID);
+ }
+ }
+
+ // A status line that fails to parse is recorded as an error rather than
+ // rethrown; httpStatus then stays null and getStatusCode() returns -1.
+ try {
+ this.httpStatus = new StatusLine(statusLine);
+ } catch(IOException e) {
+ logger.warning(e.getMessage() + " at offset: " + h.getOffset());
+ this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_EXCEPTION);
+ }
+
+ // Save off all bytes read. Keep them as bytes rather than
+ // convert to strings so we don't have to worry about encodings
+ // though this should never be a problem doing http headers since
+ // its all supposed to be ascii.
+ ByteArrayOutputStream baos =
+ new ByteArrayOutputStream(statusBytes.length + 4 * 1024);
+ baos.write(statusBytes);
+
+ // Now read rest of the header lines looking for the separation
+ // between header and body.
+ // NOTE(review): httpHeaderBytesRead is initialized to -1 and the status
+ // line length is only added in the truncated branch below, not on the
+ // normal path -- confirm the counter's intended meaning.
+ for (byte [] lineBytes = null; true;) {
+ lineBytes = LaxHttpParser.readRawLine(getIn());
+ eolCharCount = getEolCharsCount(lineBytes);
+ if (eolCharCount <= 0) {
+ // No end-of-line: either the input ran out mid-header (recorded as
+ // a truncation error and the record marked at end) or it is a
+ // genuine read failure.
+ if (getIn().available() == 0) {
+ httpHeaderBytesRead += statusBytes.length;
+ logger.warning("HTTP header truncated at offset: " + h.getOffset());
+ this.errors.add(ArcRecordErrors.HTTP_HEADER_TRUNCATED);
+ this.setEor(true);
+ break;
+ } else {
+ throw new IOException("Failed reading http headers: " +
+ ((lineBytes != null)? new String(lineBytes): null));
+ }
+ } else {
+ httpHeaderBytesRead += lineBytes.length;
+ }
+ // Save the bytes read.
+ baos.write(lineBytes);
+ // A line holding nothing but its end-of-line characters is the blank
+ // line that separates header from body.
+ if ((lineBytes.length - eolCharCount) <= 0) {
+ // We've finished reading the http header.
+ break;
+ }
+ }
+
+ byte [] headerBytes = baos.toByteArray();
+ // Save off where body starts.
+ this.getMetaData().setContentBegin(headerBytes.length);
+ ByteArrayInputStream bais =
+ new ByteArrayInputStream(headerBytes);
+ if (!bais.markSupported()) {
+ throw new IOException("ByteArrayInputStream does not support mark");
+ }
+ // Mark the start so the stream can be rewound after parsing.
+ bais.mark(headerBytes.length);
+ // Read the status line. Don't let it into the parseHeaders function.
+ // It doesn't know what to do with it.
+ bais.read(statusBytes, 0, statusBytes.length);
+ this.httpHeaders = LaxHttpParser.parseHeaders(bais,
+ ARCConstants.DEFAULT_ENCODING);
+ this.getMetaData().setStatusCode(Integer.toString(getStatusCode()));
+ // Rewind to the mark so the returned stream replays the whole header,
+ // status line included.
+ bais.reset();
+ return bais;
+ }
+
+ private static class DeletedARCRecordIOException
+ extends RecoverableIOException {
+ private static final long serialVersionUID = 1L;
+
+ public DeletedARCRecordIOException(final String reason) {
+ super(reason);
+ }
+ }
+
+ /**
+ * Return status code for this record.
+ *
+ * This method will return -1 until the http header has been read.
+ * @return Status code.
+ */
+ public int getStatusCode() {
+ return (this.httpStatus == null)? -1: this.httpStatus.getStatusCode();
+ }
+
+ /**
+ * @param bytes Array of bytes to examine for an EOL.
+ * @return Count of end-of-line characters or zero if none.
+ */
+ private int getEolCharsCount(byte [] bytes) {
+     // No trailing '\n' (or nothing at all) means no end-of-line.
+     if (bytes == null || bytes.length < 1
+             || bytes[bytes.length - 1] != '\n') {
+         return 0;
+     }
+     // A '\r' immediately before the '\n' makes it a two character EOL.
+     boolean precededByCr = bytes.length >= 2
+         && bytes[bytes.length - 2] == '\r';
+     return precededByCr ? 2 : 1;
+ }
+
+ /**
+ * @return Meta data for this record.
+ */
+ public ARCRecordMetaData getMetaData() {
+ return (ARCRecordMetaData)getHeader();
+ }
+
+ /**
+ * @return http headers (Only available after header has been read).
+ */
+ public Header [] getHttpHeaders() {
+ return this.httpHeaders;
+ }
+
+ /**
+ * @return ArcRecordErrors encountered when reading
+ */
+ public List getErrors() {
+ return this.errors;
+ }
+
+ /**
+ * @return true if ARC record errors found
+ */
+ public boolean hasErrors() {
+ return !this.errors.isEmpty();
+ }
+
+ /**
+ * @return Next character in this ARCRecord's content else -1 if at end of
+ * this record.
+ * @throws IOException
+ */
+ public int read() throws IOException {
+     boolean headerPending = this.httpHeaderStream != null
+         && this.httpHeaderStream.available() > 0;
+     if (!headerPending) {
+         // Nothing buffered: serve from the underlying record stream.
+         return super.read();
+     }
+     // Buffered http header bytes are handed out first.
+     int next = this.httpHeaderStream.read();
+     // Drop the header stream as soon as it is exhausted.
+     if (this.httpHeaderStream.available() <= 0) {
+         this.httpHeaderStream = null;
+     }
+     incrementPosition();
+     return next;
+ }
+
+ /**
+  * Read up to length bytes of this record's content into the passed
+  * array. Buffered http header bytes, if any remain, are returned before
+  * reads go to the underlying stream; a single call never spans both.
+  *
+  * @param b Buffer to read into.
+  * @param offset Index in b at which to start storing bytes.
+  * @param length Maximum number of bytes to read.
+  * @return Count of bytes read, 0 if length is zero while header bytes
+  * remain, else whatever the superclass read returns.
+  * @throws IOException
+  */
+ public int read(byte [] b, int offset, int length) throws IOException {
+     if (this.httpHeaderStream == null ||
+             this.httpHeaderStream.available() <= 0) {
+         return super.read(b, offset, length);
+     }
+     if (length == 0) {
+         // A zero-length request reads nothing. Report 0 rather than -1
+         // (header bytes are still pending, so this is not end of record)
+         // and leave the position untouched.
+         return 0;
+     }
+     // If http header, return bytes from it before we go to underlying
+     // stream.
+     int read = this.httpHeaderStream.read(b, offset,
+         Math.min(length, this.httpHeaderStream.available()));
+     // If done with the header stream, null it out.
+     if (this.httpHeaderStream.available() <= 0) {
+         this.httpHeaderStream = null;
+     }
+     // Only advance by bytes actually delivered; never move backwards.
+     if (read > 0) {
+         incrementPosition(read);
+     }
+     return read;
+ }
+
+ /**
+ * @return Offset at which the body begins (Only known after
+ * header has been read) or -1 if none or if we haven't read
+ * headers yet. Usually length of HTTP headers (does not include ARC
+ * metadata line length).
+ */
+ public int getBodyOffset() {
+ return this.getMetaData().getContentBegin();
+ }
+
+ @Override
+ protected String getIp4Cdx(ArchiveRecordHeader h) {
+ String result = null;
+ if (h instanceof ARCRecordMetaData) {
+ result = ((ARCRecordMetaData)h).getIp();
+ }
+ return (result != null)? result: super.getIp4Cdx(h);
+ }
+
+ @Override
+ protected String getStatusCode4Cdx(ArchiveRecordHeader h) {
+ String result = null;
+ if (h instanceof ARCRecordMetaData) {
+ result = ((ARCRecordMetaData) h).getStatusCode();
+ }
+ return (result != null) ? result: super.getStatusCode4Cdx(h);
+ }
+
+ @Override
+ protected String getDigest4Cdx(ArchiveRecordHeader h) {
+ String result = null;
+ if (h instanceof ARCRecordMetaData) {
+ result = ((ARCRecordMetaData) h).getDigest();
+ }
+ return (result != null) ? result: super.getDigest4Cdx(h);
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/arc/ARCRecordMetaData.java b/src/main/java/org/archive/io/arc/ARCRecordMetaData.java
new file mode 100644
index 00000000..3f617041
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCRecordMetaData.java
@@ -0,0 +1,267 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.arc;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+
+import org.archive.io.ArchiveRecordHeader;
+
+
+/**
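+ * Holds an ARC record's meta data. The header fields are supplied at
+ * construction; the digest, status code and content-begin offset are set
+ * afterwards through setters, so instances are not strictly immutable.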
+ *
+ * @author stack
+ */
+public class ARCRecordMetaData implements ArchiveRecordHeader, ARCConstants {
+ /**
+ * Map of record header fields.
+ *
+ * We store all in a hashmap. This way we can hold version 1 or
+ * version 2 record meta data.
+ *
+ * <p>Keys are lowercase.
+ */
+ protected Map headerFields = null;
+
+ /**
+ * Digest for the record.
+ *
+ * Only available after the record has been read in totality.
+ */
+ private String digest = null;
+
+ /**
+ * Status for this request.
+ *
+ * There may be no status.
+ */
+ private String statusCode = null;
+
+ /**
+ * The arc this metadata came out.
+ * Descriptive String, either path or URL.
+ */
+ private String arc = null;
+
+ private int contentBegin = 0;
+
+ /**
+ * Shut down the default constructor.
+ */
+ protected ARCRecordMetaData() {
+ super();
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param arc The arc file this metadata came out of.
+ * @param headerFields Hash of meta fields.
+ *
+ * @throws IOException
+ */
+ public ARCRecordMetaData(final String arc, Map headerFields)
+         throws IOException {
+     // Refuse construction unless every required version 1 field is there.
+     for (Object required : REQUIRED_VERSION_1_HEADER_FIELDS) {
+         testRequiredField(headerFields, (String) required);
+     }
+     this.headerFields = headerFields;
+     this.arc = arc;
+ }
+
+ /**
+ * Test required field is present in hash.
+ *
+ * @param fields Map of fields.
+ * @param requiredField Field to test for.
+ *
+ * @exception IOException If required field is not present.
+ */
+ protected void testRequiredField(Map fields, String requiredField)
+         throws IOException {
+     if (fields.containsKey(requiredField)) {
+         // Present: nothing to complain about.
+         return;
+     }
+     throw new IOException("Required field " + requiredField +
+         " not in meta data.");
+ }
+
+ /**
+ * Get the time when the record was harvested.
+ *
+ * Returns the date in Heritrix 14 digit time format (UTC). See the
+ * {@link org.archive.util.ArchiveUtils} class for converting to Java
+ * dates.
+ *
+ * @return Header date in Heritrix 14 digit format.
+ * @see org.archive.util.ArchiveUtils#parse14DigitDate(String)
+ */
+ public String getDate() {
+ return (String) this.headerFields.get(DATE_FIELD_KEY);
+ }
+
+ /**
+ * @return Return length of the record.
+ */
+ public long getLength() {
+ return Long.parseLong((String)this.headerFields.
+ get(LENGTH_FIELD_KEY));
+ }
+
+ /**
+ * @return Return Content-Length of the contents of the record
+ * Same as record length for arcs? TODO
+ */
+ public long getContentLength() {
+ return getLength();
+ }
+
+ /**
+ * @return Header url.
+ */
+ public String getUrl() {
+ return (String)this.headerFields.get(URL_FIELD_KEY);
+ }
+
+ /**
+ * @return IP.
+ */
+ public String getIp()
+ {
+ return (String)this.headerFields.get(IP_HEADER_FIELD_KEY);
+ }
+
+ /**
+ * @return mimetype The mimetype that is in the ARC metaline -- NOT the http
+ * content-type content.
+ */
+ public String getMimetype() {
+ return (String)this.headerFields.get(MIMETYPE_FIELD_KEY);
+ }
+
+ /**
+ * @return Arcfile version.
+ */
+ public String getVersion() {
+ return (String)this.headerFields.get(VERSION_FIELD_KEY);
+ }
+
+ /**
+ * @return Offset into arcfile at which this record begins.
+ */
+ public long getOffset() {
+ return ((Long)this.headerFields.get(ABSOLUTE_OFFSET_KEY)).longValue();
+ }
+
+ /**
+ * @param key Key to use looking up field value.
+ * @return value for passed key of null if no such entry.
+ */
+ public Object getHeaderValue(String key) {
+ return this.headerFields.get(key);
+ }
+
+ /**
+ * @return Header field name keys.
+ */
+ public Set getHeaderFieldKeys()
+ {
+ return this.headerFields.keySet();
+ }
+
+ /**
+ * @return Map of header fields.
+ */
+ public Map getHeaderFields() {
+ return this.headerFields;
+ }
+
+ /**
+ * @return Returns identifier for ARC.
+ */
+ public String getArc() {
+ return this.arc;
+ }
+
+ /**
+ * @return Convenience method that does a
+ * return new File(this.arc) (Be aware this.arc is not always
+ * full path to an ARC file -- may be an URL). Test
+ * returned file for existence.
+ */
+ public File getArcFile() {
+ return new File(this.arc);
+ }
+
+ /**
+ * @return Returns the digest.
+ */
+ public String getDigest() {
+ return this.digest;
+ }
+
+ /**
+ * @param d The digest to set.
+ */
+ public void setDigest(String d) {
+ this.digest = d;
+ }
+
+ /**
+ * @return Returns the statusCode. May be null.
+ */
+ public String getStatusCode() {
+ return this.statusCode;
+ }
+
+ /**
+ * @param statusCode The statusCode to set.
+ */
+ public void setStatusCode(String statusCode) {
+ this.statusCode = statusCode;
+ }
+
+ public String toString() {
+ return ((this.arc != null)? this.arc: "") +
+ ": " +
+ ((this.headerFields != null)? this.headerFields.toString(): "");
+ }
+
+ public String getReaderIdentifier() {
+ return this.getArc();
+ }
+
+ public String getRecordIdentifier() {
+ return getDate() + "/" + getUrl();
+ }
+
+ public int getContentBegin() {
+ return this.contentBegin;
+ }
+
+ protected void setContentBegin(final int offset) {
+ this.contentBegin = offset;
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/arc/ARCUtils.java b/src/main/java/org/archive/io/arc/ARCUtils.java
new file mode 100644
index 00000000..985457e2
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCUtils.java
@@ -0,0 +1,240 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io.arc;
+
+import it.unimi.dsi.fastutil.io.RepositionableStream;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.net.URISyntaxException;
+
+import org.archive.url.UsableURI;
+import org.archive.util.zip.GzipHeader;
+import org.archive.util.zip.NoGzipMagicException;
+
+public class ARCUtils implements ARCConstants {
+ /**
+  * Pulls the bare file name out of a filesystem path or a URI.
+  *
+  * @param pathOrUri Path or URI to extract arc filename from.
+  * @return Extracted arc file name: the last name element of the path, or
+  * of the URI's path component when UsableURI.hasScheme accepts the
+  * argument.
+  * @throws URISyntaxException If the argument is treated as a URI but
+  * cannot be parsed as one.
+  */
+ public static String parseArcFilename(final String pathOrUri)
+ throws URISyntaxException {
+     final String path = UsableURI.hasScheme(pathOrUri)
+         ? new URI(pathOrUri).getPath()
+         : pathOrUri;
+     return new File(path).getName();
+ }
+
+ /**
+  * Shorthand for the compressed-ARC test with the suffix check enabled.
+  *
+  * @param arcFile File to test.
+  * @return True if arcFile is compressed ARC.
+  * @throws IOException If the file cannot be checked or read.
+  */
+ public static boolean isCompressed(File arcFile) throws IOException {
+     return testCompressedARCFile(arcFile, false);
+ }
+
+ /**
+  * Check file is compressed and in ARC GZIP format, including the check
+  * on the file name suffix.
+  *
+  * @param arcFile File to test if its Internet Archive ARC file
+  * GZIP compressed.
+  *
+  * @return True if this is an Internet Archive GZIP'd ARC file (It begins
+  * w/ the Internet Archive GZIP header and has the
+  * COMPRESSED_ARC_FILE_EXTENSION suffix).
+  *
+  * @exception IOException If file does not exist or is not readable.
+  */
+ public static boolean testCompressedARCFile(File arcFile)
+ throws IOException {
+     final boolean skipSuffixCheck = false;
+     return testCompressedARCFile(arcFile, skipSuffixCheck);
+ }
+
+ /**
+  * Check file is compressed and in ARC GZIP format.
+  *
+  * The file is first checked for existence and readability. Unless the
+  * suffix check is skipped, a name that does not end (case-insensitively)
+  * with COMPRESSED_ARC_FILE_EXTENSION yields false without the file being
+  * opened.
+  *
+  * @param arcFile File to test if its Internet Archive ARC file
+  * GZIP compressed.
+  * @param skipSuffixCheck Set to true if we're not to test on the
+  * '.arc.gz' suffix.
+  *
+  * @return True if this is an Internet Archive GZIP'd ARC file (It begins
+  * w/ the Internet Archive GZIP header).
+  *
+  * @exception IOException If file does not exist or is not readable.
+  */
+ public static boolean testCompressedARCFile(File arcFile,
+         boolean skipSuffixCheck)
+ throws IOException {
+     isReadable(arcFile);
+     if (!skipSuffixCheck) {
+         final String lowerName = arcFile.getName().toLowerCase();
+         if (!lowerName.endsWith(COMPRESSED_ARC_FILE_EXTENSION)) {
+             return false;
+         }
+     }
+
+     final InputStream in = new FileInputStream(arcFile);
+     try {
+         return testCompressedARCStream(in);
+     } finally {
+         in.close();
+     }
+ }
+
+ /**
+  * Tests passed stream is an Internet Archive gzip stream by reading in
+  * the gzip header and comparing its extra field against
+  * ARC_GZIP_EXTRA_FIELD.
+  * Does not reposition the stream. That is left up to the caller.
+  *
+  * @param is An InputStream.
+  * @return True if the stream starts with a gzip header whose extra field
+  * equals ARC_GZIP_EXTRA_FIELD minus its first two bytes.
+  * @throws IOException
+  */
+ public static boolean testCompressedARCStream(final InputStream is)
+ throws IOException {
+     final GzipHeader header;
+     try {
+         header = new GzipHeader(is);
+     } catch (NoGzipMagicException e) {
+         // No gzip magic: not compressed at all.
+         return false;
+     }
+
+     // ARC_GZIP_EXTRA_FIELD includes its own two length bytes, so the
+     // header's extra field is compared against it starting at offset 2.
+     final byte[] extra = header.getFextra();
+     if (extra == null || ARC_GZIP_EXTRA_FIELD.length - 2 != extra.length) {
+         return false;
+     }
+     for (int i = 0; i < extra.length; i++) {
+         if (extra[i] != ARC_GZIP_EXTRA_FIELD[i + 2]) {
+             return false;
+         }
+     }
+     return true;
+ }
+
+ /**
+  * Tests passed stream is gzip stream by reading in the HEAD.
+  * Puts the stream back at its starting position when done, whether or
+  * not the test succeeds.
+  *
+  * @param rs A repositionable stream; it is cast to InputStream, so it
+  * must be one.
+  * @return True if compressed stream.
+  * @throws IOException
+  */
+ public static boolean testCompressedRepositionalStream(
+         final RepositionableStream rs)
+ throws IOException {
+     final long startPosition = rs.position();
+     try {
+         return testCompressedStream((InputStream) rs);
+     } finally {
+         rs.position(startPosition);
+     }
+ }
+
+ /**
+  * Tests passed stream is gzip stream by reading in the HEAD.
+  * Does not reposition the stream; whatever the header parse consumed
+  * stays consumed.
+  *
+  * @param is An InputStream.
+  * @return True if a gzip header could be read from the stream, false if
+  * the gzip magic was missing.
+  * @throws IOException
+  */
+ public static boolean testCompressedStream(final InputStream is)
+ throws IOException {
+     try {
+         // Only the parse outcome matters; the header itself is discarded.
+         new GzipHeader(is);
+     } catch (NoGzipMagicException e) {
+         return false;
+     }
+     return true;
+ }
+
+ /**
+  * Check file is uncompressed ARC file.
+  *
+  * The file qualifies when its lower-cased name ends with
+  * ARC_FILE_EXTENSION and its leading bytes equal ARC_MAGIC_NUMBER,
+  * ignoring case.
+  *
+  * @param arcFile
+  * File to test if its Internet Archive ARC file uncompressed.
+  *
+  * @return True if this is an Internet Archive ARC file.
+  *
+  * @exception IOException
+  * If file does not exist or is not readable.
+  */
+ public static boolean testUncompressedARCFile(File arcFile)
+ throws IOException {
+     isReadable(arcFile);
+     if (!arcFile.getName().toLowerCase().endsWith(ARC_FILE_EXTENSION)) {
+         return false;
+     }
+
+     final int magicLength = ARC_MAGIC_NUMBER.length();
+     final byte[] head = new byte[magicLength];
+     int filled = 0;
+     final FileInputStream fis = new FileInputStream(arcFile);
+     try {
+         // A single read() may return fewer bytes than requested, so keep
+         // reading until the buffer is full or the file ends.
+         while (filled < magicLength) {
+             final int count = fis.read(head, filled, magicLength - filled);
+             if (count < 0) {
+                 break;
+             }
+             filled += count;
+         }
+     } finally {
+         // The one and only close of the stream.
+         fis.close();
+     }
+     if (filled != magicLength) {
+         // File is shorter than the magic number.
+         return false;
+     }
+
+     // Widen byte by byte to chars for the case-insensitive comparison.
+     final StringBuilder beginStr = new StringBuilder(magicLength);
+     for (int i = 0; i < magicLength; i++) {
+         beginStr.append((char) head[i]);
+     }
+     return beginStr.toString().equalsIgnoreCase(ARC_MAGIC_NUMBER);
+ }
+
+
+ /**
+ * @param arcFile File to test.
+ * @exception IOException If file does not exist or is not unreadable.
+ */
+ private static void isReadable(File arcFile) throws IOException {
+ if (!arcFile.exists()) {
+ throw new FileNotFoundException(arcFile.getAbsolutePath() +
+ " does not exist.");
+ }
+
+ if (!arcFile.canRead()) {
+ throw new FileNotFoundException(arcFile.getAbsolutePath() +
+ " is not readable.");
+ }
+ }
+}
diff --git a/src/main/java/org/archive/io/arc/ARCWriter.java b/src/main/java/org/archive/io/arc/ARCWriter.java
new file mode 100644
index 00000000..b5825d50
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCWriter.java
@@ -0,0 +1,459 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.arc;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.Closeable;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Iterator;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.archive.io.ReplayInputStream;
+import org.archive.io.WriterPoolMember;
+import org.archive.io.WriterPoolSettings;
+import org.archive.util.ArchiveUtils;
+import org.archive.util.DevUtils;
+import org.archive.util.MimetypeUtils;
+
+
+/**
+ * Write ARC files.
+ *
+ * Assumption is that the caller is managing access to this ARCWriter ensuring
+ * only one thread of control accessing this ARC file instance at any one time.
+ *
+ *
ARC files are described here:
+ * Arc
+ * File Format. This class does version 1 of the ARC file format. It also
+ * writes version 1.1 which is version 1 with data stuffed into the body of the
+ * first arc record in the file, the arc file meta record itself.
+ *
+ *
An ARC file is three lines of meta data followed by an optional 'body' and
+ * then a couple of '\n' and then: record, '\n', record, '\n', record, etc.
+ * If we are writing compressed ARC files, then each of the ARC file records is
+ * individually gzipped and concatenated together to make up a single ARC file.
+ * In GZIP terms, each ARC record is a GZIP member of a total gzip'd
+ * file.
+ *
+ *
The GZIPping of the ARC file meta data is exceptional. It is GZIPped
+ * w/ an extra GZIP header, a special Internet Archive (IA) extra header field
+ * (e.g. FEXTRA is set in the GZIP header FLG field and an extra field is
+ * appended to the GZIP header). The extra field has little in it but its
+ * presence denotes this GZIP as an Internet Archive gzipped ARC. See RFC1952
+ * to learn about the GZIP header structure.
+ *
+ * <p>This class then does its GZIPping in the following fashion. Each GZIP
+ * member is written w/ a new instance of GZIPOutputStream -- actually
+ * ARCWriterGZIPOutputStream so we can get access to the underlying stream.
+ * The underlying stream stays open across GZIPoutputStream instantiations.
+ * For the 'special' GZIPing of the ARC file meta data, we cheat by catching the
+ * GZIPOutputStream output into a byte array, manipulating it adding the
+ * IA GZIP header, before writing to the stream.
+ *
+ *
I tried writing a resettable GZIPOutputStream and could make it work w/
+ * the SUN JDK but the IBM JDK threw NPE inside in the deflate.reset -- its zlib
+ * native call doesn't seem to like the notion of resetting -- so I gave up on
+ * it.
+ *
+ * <p>Because of such as the above and troubles with GZIPInputStream, we should
+ * write our own GZIP*Streams, ones that are resettable and conscious of gzip
+ * members.
+ *
+ *
This class will write until we hit >= maxSize. The check is done at
+ * record boundary. Records do not span ARC files. We will then close current
+ * file and open another and then continue writing.
+ *
+ *
TESTING: Here is how to test that produced ARC files are good
+ * using the
+ * alexa
+ * ARC c-tools:
+ *
+ * Examine the produced cdx file to make sure it makes sense. Search
+ * for 'no-type 0'. If found, then we're opening a gzip record w/o data to
+ * write. This is bad.
+ *
+ *
You can also do gzip -t FILENAME and it will tell you if the
+ * ARC makes sense to GZIP.
+ *
+ *
While being written, ARCs have a '.open' suffix appended.
+ *
+ * @author stack
+ */
+public class ARCWriter extends WriterPoolMember implements ARCConstants, Closeable {
+ // Class-wide logger, named after this class.
+ private static final Logger logger =
+ Logger.getLogger(ARCWriter.class.getName());
+
+ /**
+ * Metadata line pattern: five whitespace-free fields separated by single
+ * spaces, optionally followed by LINE_SEPARATOR. validateMetaLine rejects
+ * any metadata line that does not match it in full.
+ */
+ private static final Pattern METADATA_LINE_PATTERN =
+ Pattern.compile("^\\S+ \\S+ \\S+ \\S+ \\S+(" + LINE_SEPARATOR + "?)$");
+
+
+ /**
+ * Constructor.
+ * Takes a stream. Use with caution. There is no upperbound check on size.
+ * Will just keep writing.
+ *
+ * The ARC file meta data record is written to the stream right away, dated
+ * with the value returned by ArchiveUtils.get14DigitDate().
+ *
+ * @param serialNo used to generate unique file name sequences
+ * @param out Where to write.
+ * @param arc File the out is connected to.
+ * @param settings all creation parameters. Its metadata may be null;
+ * otherwise it is a list of File and/or String objects.
+ * @throws IOException
+ */
+ public ARCWriter(final AtomicInteger serialNo, final PrintStream out,
+ final File arc, final WriterPoolSettings settings)
+ throws IOException {
+ super(serialNo, out, arc, settings);
+ writeFirstRecord(ArchiveUtils.get14DigitDate());
+ }
+
+ /**
+  * Constructor that hands file handling to the superclass, naming
+  * ARC_FILE_EXTENSION as the extension to use. No record is written here;
+  * the first record is written when {@link #createFile()} runs.
+  *
+  * @param serialNo used to generate unique file name sequences
+  * @param settings all creation parameters
+  */
+ public ARCWriter(final AtomicInteger serialNo, final WriterPoolSettings settings) {
+     super(serialNo, settings, ARC_FILE_EXTENSION);
+ }
+
+ protected String createFile()
+ throws IOException {
+ String name = super.createFile();
+ writeFirstRecord(currentTimestamp);
+ return name;
+ }
+
+ private void writeFirstRecord(final String ts)
+ throws IOException {
+ write(generateARCFileMetaData(ts));
+ }
+
+ /**
+ * Write out the ARCMetaData.
+ *
+ *
Generate ARC file meta data. Currently we only do version 1 of the
+ * ARC file formats or version 1.1 when metadata has been supplied (We
+ * write it into the body of the first record in the arc file).
+ *
+ *
Version 1 metadata looks roughly like this:
+ *
+ *
If compress is set, then we generate a header that has been gzipped
+ * in the Internet Archive manner. Such a gzipping enables the FEXTRA
+ * flag in the FLG field of the gzip header. It then appends an extra
+ * header field: '8', '0', 'L', 'X', '0', '0', '0', '0'. The first two
+ * bytes are the length of the field and the last 6 bytes the Internet
+ * Archive header. To learn about GZIP format, see RFC1952. To learn
+ * about the Internet Archive extra header field, read the source for
+ * av_ziparc which can be found at
+ * alexa/vista/alexa-tools-1.2/src/av_ziparc.cc.
+ *
+ *
We do things in this roundabout manner because the java
+ * GZIPOutputStream does not give access to GZIP header fields.
+ *
+ * @param date Date to put into the ARC metadata; if 17-digit will be
+ * truncated to traditional 14-digits
+ *
+ * @return Byte array filled w/ the arc header.
+ * @throws IOException
+ */
+ private byte [] generateARCFileMetaData(String date)
+ throws IOException {
+ // Longer (e.g. 17-digit) dates are cut back to their first 14 characters.
+ if(date!=null && date.length()>14) {
+ date = date.substring(0,14);
+ }
+ int metadataBodyLength = getMetadataLength();
+ // If metadata body, then the minor part of the version is '1' rather
+ // than '0'.
+ String metadataHeaderLinesTwoAndThree =
+ getMetadataHeaderLinesTwoAndThree("1 " +
+ ((metadataBodyLength > 0)? "1": "0"));
+ // Declared record length: metadata body length plus the encoded bytes of
+ // header lines two and three (that string begins with a LINE_SEPARATOR).
+ int recordLength = metadataBodyLength +
+ metadataHeaderLinesTwoAndThree.getBytes(DEFAULT_ENCODING).length;
+ String metadataHeaderStr = ARC_MAGIC_NUMBER + getBaseFilename() +
+ " 0.0.0.0 " + date + " text/plain " + recordLength +
+ metadataHeaderLinesTwoAndThree;
+ // recordLength is only the initial buffer capacity here; the stream
+ // grows as needed.
+ ByteArrayOutputStream metabaos =
+ new ByteArrayOutputStream(recordLength);
+ // Write the metadata header.
+ metabaos.write(metadataHeaderStr.getBytes(DEFAULT_ENCODING));
+ // Write the metadata body, if anything to write.
+ if (metadataBodyLength > 0) {
+ writeMetaData(metabaos);
+ }
+
+ // Write out a LINE_SEPARATOR to end this record.
+ metabaos.write(LINE_SEPARATOR);
+
+ // Now get bytes of all just written and compress if flag set.
+ byte [] bytes = metabaos.toByteArray();
+
+ if(isCompressed()) {
+ // GZIP the header but catch the gzipping into a byte array so we
+ // can add the special IA GZIP header to the product. After
+ // manipulations, write to the output stream (The JAVA GZIP
+ // implementation does not give access to GZIP header. It
+ // produces a 'default' header only). We can get away w/ these
+ // manipulations because the GZIP 'default' header doesn't
+ // do the 'optional' CRC'ing of the header.
+ byte [] gzippedMetaData = ArchiveUtils.gzip(bytes);
+ // Byte 3 is the GZIP FLG byte; anything but zero means the header
+ // already carries optional fields this code does not handle.
+ if (gzippedMetaData[3] != 0) {
+ throw new IOException("The GZIP FLG header is unexpectedly " +
+ " non-zero. Need to add smarter code that can deal " +
+ " when already extant extra GZIP header fields.");
+ }
+ // Set the GZIP FLG header to '4' which says that the GZIP header
+ // has extra fields. Then insert the Alexa {'L', 'X', '0', '0', '0',
+ // '0'} 'extra' field. The IA GZIP header will also set byte
+ // 9 (zero-based), the OS byte, to 3 (Unix). We'll do the same.
+ gzippedMetaData[3] = 4;
+ gzippedMetaData[9] = 3;
+ byte [] assemblyBuffer = new byte[gzippedMetaData.length +
+ ARC_GZIP_EXTRA_FIELD.length];
+ // '10' in the below is a pointer past the following bytes of the
+ // GZIP header: ID1 ID2 CM FLG + MTIME(4-bytes) XFL OS. See
+ // RFC1952 for explanation of the abbreviations just used.
+ // Layout: first 10 header bytes, then the extra field, then the rest
+ // of the gzipped data.
+ System.arraycopy(gzippedMetaData, 0, assemblyBuffer, 0, 10);
+ System.arraycopy(ARC_GZIP_EXTRA_FIELD, 0, assemblyBuffer, 10,
+ ARC_GZIP_EXTRA_FIELD.length);
+ System.arraycopy(gzippedMetaData, 10, assemblyBuffer,
+ 10 + ARC_GZIP_EXTRA_FIELD.length, gzippedMetaData.length - 10);
+ bytes = assemblyBuffer;
+ }
+ return bytes;
+ }
+
+ /**
+  * Builds lines two and three of the ARC file meta data header. The result
+  * starts with a LINE_SEPARATOR and each of the two lines ends with one.
+  *
+  * @param version Version text placed at the start of line two.
+  * @return The two header lines as a single string.
+  */
+ public String getMetadataHeaderLinesTwoAndThree(String version) {
+     return new StringBuilder()
+         .append(LINE_SEPARATOR)
+         .append(version)
+         .append(" InternetArchive")
+         .append(LINE_SEPARATOR)
+         .append("URL IP-address Archive-date Content-type Archive-length")
+         .append(LINE_SEPARATOR)
+         .toString();
+ }
+
+ /**
+ * Write all metadata to passed baos.
+ *
+ * @param baos Byte array to write to.
+ * @throws UnsupportedEncodingException
+ * @throws IOException
+ */
+ private void writeMetaData(ByteArrayOutputStream baos)
+ throws UnsupportedEncodingException, IOException {
+ if (settings.getMetadata() == null) {
+ return;
+ }
+
+ for (Iterator i = settings.getMetadata().iterator();
+ i.hasNext();) {
+ Object obj = i.next();
+ if (obj instanceof String) {
+ baos.write(((String)obj).getBytes(DEFAULT_ENCODING));
+ } else if (obj instanceof File) {
+ InputStream is = null;
+ try {
+ is = new BufferedInputStream(
+ new FileInputStream((File)obj));
+ byte [] buffer = new byte[4096];
+ for (int read = -1; (read = is.read(buffer)) != -1;) {
+ baos.write(buffer, 0, read);
+ }
+ } finally {
+ if (is != null) {
+ is.close();
+ }
+ }
+ } else if (obj != null) {
+ logger.severe("Unsupported metadata type: " + obj);
+ }
+ }
+ return;
+ }
+
+ /**
+ * Adds up the sizes of all metadata entries: DEFAULT_ENCODING byte length
+ * for Strings, File.length() for Files. Entries of any other type,
+ * including null, are logged as unsupported and add nothing.
+ *
+ * @return 0 when the settings carry no metadata; otherwise the summed
+ * sizes minus one, because the running total starts at -1.
+ * @throws UnsupportedEncodingException
+ */
+ private int getMetadataLength()
+ throws UnsupportedEncodingException {
+ // NOTE(review): starting at -1 makes every non-null result one less than
+ // the summed sizes (and -1 for an empty list). Possibly deliberate, to
+ // offset the leading LINE_SEPARATOR that generateARCFileMetaData counts
+ // in the header lines -- confirm before changing, since this value feeds
+ // the record length written to the ARC.
+ int result = -1;
+ if (settings.getMetadata() == null) {
+ result = 0;
+ } else {
+ for (Iterator i = settings.getMetadata().iterator();
+ i.hasNext();) {
+ Object obj = i.next();
+ if (obj instanceof String) {
+ result += ((String)obj).getBytes(DEFAULT_ENCODING).length;
+ } else if (obj instanceof File) {
+ // File.length() is a long; the compound assignment narrows it to
+ // int without complaint.
+ result += ((File)obj).length();
+ } else {
+ // Also reached for null entries, which writeMetaData skips silently.
+ logger.severe("Unsupported metadata type: " + obj);
+ }
+ }
+ }
+ return result;
+ }
+
+ /**
+  * Writes a record whose content is the current content of the passed
+  * byte array stream. The declared length is not enforced on this path.
+  *
+  * @deprecated use input-stream version directly instead
+  */
+ public void write(String uri, String contentType, String hostIP,
+         long fetchBeginTimeStamp, long recordLength,
+         ByteArrayOutputStream baos)
+ throws IOException {
+     final ByteArrayInputStream content =
+         new ByteArrayInputStream(baos.toByteArray());
+     write(uri, contentType, hostIP, fetchBeginTimeStamp, recordLength,
+         content, false);
+ }
+
+ /**
+  * Writes a record from the passed stream with the declared length
+  * enforced.
+  */
+ public void write(String uri, String contentType, String hostIP,
+         long fetchBeginTimeStamp, long recordLength, InputStream in)
+ throws IOException {
+     final boolean enforceLength = true;
+     write(uri, contentType, hostIP, fetchBeginTimeStamp, recordLength, in,
+         enforceLength);
+ }
+
+ /**
+  * Write a record with the given metadata/content.
+  *
+  * The metadata line goes out first, then the content copied from the
+  * stream, then a closing LINE_SEPARATOR. The post-write tasks run even
+  * when writing fails.
+  *
+  * @param uri
+  * URI for metadata-line
+  * @param contentType
+  * MIME content-type for metadata-line
+  * @param hostIP
+  * IP for metadata-line
+  * @param fetchBeginTimeStamp
+  * timestamp for metadata-line
+  * @param recordLength
+  * length for metadata-line; also may be enforced
+  * @param in
+  * source InputStream for record content
+  * @param enforceLength
+  * whether to enforce the declared length; should be true
+  * unless intentionally writing bad records for testing
+  * @throws IOException
+  * on a write problem, or when a ReplayInputStream still has
+  * unread material after the copy
+  */
+ public void write(String uri, String contentType, String hostIP,
+         long fetchBeginTimeStamp, long recordLength, InputStream in,
+         boolean enforceLength) throws IOException {
+     preWriteRecordTasks();
+     try {
+         final String metaLine = getMetaLine(uri, contentType, hostIP,
+             fetchBeginTimeStamp, recordLength);
+         write(metaLine.getBytes(UTF8));
+         copyFrom(in, recordLength, enforceLength);
+         if (in instanceof ReplayInputStream) {
+             // All recorded material must have been consumed by now;
+             // leftovers mean something is wrong.
+             final long leftover = ((ReplayInputStream) in).remaining();
+             if (leftover != 0) {
+                 final String message = "Gap between expected and actual: "
+                     + leftover + LINE_SEPARATOR + DevUtils.extraInfo()
+                     + " writing arc "
+                     + this.getFile().getAbsolutePath();
+                 DevUtils.warnHandle(new Throwable(message), message);
+                 throw new IOException(message);
+             }
+         }
+         write(LINE_SEPARATOR);
+     } finally {
+         postWriteRecordTasks();
+     }
+ }
+
+ /**
+ * @param uri
+ * @param contentType
+ * @param hostIP
+ * @param fetchBeginTimeStamp
+ * @param recordLength
+ * @return Metadata line for an ARCRecord made of passed components.
+ * @exception IOException
+ */
+ protected String getMetaLine(String uri, String contentType, String hostIP,
+ long fetchBeginTimeStamp, long recordLength)
+ throws IOException {
+ if (fetchBeginTimeStamp <= 0) {
+ throw new IOException("Bogus fetchBeginTimestamp: " +
+ Long.toString(fetchBeginTimeStamp));
+ }
+
+ return validateMetaLine(createMetaline(uri, hostIP,
+ ArchiveUtils.get14DigitDate(fetchBeginTimeStamp),
+ MimetypeUtils.truncate(contentType),
+ Long.toString(recordLength)));
+ }
+
+ /**
+  * Joins the five metadata fields with HEADER_FIELD_SEPARATOR, in the
+  * order uri, hostIP, timeStamp, mimetype, recordLength, and ends the line
+  * with LINE_SEPARATOR. No validation is done here.
+  */
+ public String createMetaline(String uri, String hostIP,
+         String timeStamp, String mimetype, String recordLength) {
+     final StringBuilder line = new StringBuilder();
+     line.append(uri).append(HEADER_FIELD_SEPARATOR);
+     line.append(hostIP).append(HEADER_FIELD_SEPARATOR);
+     line.append(timeStamp).append(HEADER_FIELD_SEPARATOR);
+     line.append(mimetype).append(HEADER_FIELD_SEPARATOR);
+     line.append(recordLength).append(LINE_SEPARATOR);
+     return line.toString();
+ }
+
+ /**
+ * Test that the metadata line is valid before writing.
+ * @param metaLineStr
+ * @throws IOException
+ * @return The passed in metaline.
+ */
+ protected String validateMetaLine(String metaLineStr)
+ throws IOException {
+ if (metaLineStr.length() > MAX_METADATA_LINE_LENGTH) {
+ throw new IOException("Metadata line too long ("
+ + metaLineStr.length() + ">" + MAX_METADATA_LINE_LENGTH
+ + "): " + metaLineStr);
+ }
+ Matcher m = METADATA_LINE_PATTERN.matcher(metaLineStr);
+ if (!m.matches()) {
+ throw new IOException("Metadata line doesn't match expected" +
+ " pattern: " + metaLineStr);
+ }
+ return metaLineStr;
+ }
+}
diff --git a/src/main/java/org/archive/io/arc/ARCWriterPool.java b/src/main/java/org/archive/io/arc/ARCWriterPool.java
new file mode 100644
index 00000000..b55b3ed4
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCWriterPool.java
@@ -0,0 +1,69 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io.arc;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.archive.io.WriterPool;
+import org.archive.io.WriterPoolMember;
+import org.archive.io.WriterPoolSettings;
+
+
+/**
+ * A pool of ARCWriters.
+ *
+ * @author stack
+ */
+public class ARCWriterPool extends WriterPool {
+ /**
+ * Constructor
+ *
+ * @param settings Settings for this pool.
+ * @param poolMaximumActive
+ * @param poolMaximumWait
+ */
+ public ARCWriterPool(final WriterPoolSettings settings,
+ final int poolMaximumActive, final int poolMaximumWait) {
+ this(new AtomicInteger(), settings, poolMaximumActive, poolMaximumWait);
+ }
+
+ /**
+ * Constructor
+ *
+ * @param serial Used to generate unique filename sequences
+ * @param settings Settings for this pool.
+ * @param poolMaximumActive
+ * @param poolMaximumWait
+ */
+ public ARCWriterPool(final AtomicInteger serial,
+ final WriterPoolSettings settings,
+ final int poolMaximumActive, final int poolMaximumWait) {
+ super(serial, settings, poolMaximumActive, poolMaximumWait);
+ }
+
+ /* (non-Javadoc)
+ * @see org.archive.io.WriterPool#makeWriter()
+ */
+ protected WriterPoolMember makeWriter() {
+ return new ARCWriter(serialNo, settings);
+ }
+
+
+
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/arc/WriterPoolSettingsData.java b/src/main/java/org/archive/io/arc/WriterPoolSettingsData.java
new file mode 100644
index 00000000..7396f2d8
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/WriterPoolSettingsData.java
@@ -0,0 +1,80 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io.arc;
+
+import java.io.File;
+import java.util.List;
+
+import org.archive.io.WriterPoolSettings;
+
+ /**
+  * Plain data holder implementing {@link WriterPoolSettings}: six values
+  * are supplied at construction, while frequent flushing (on) and the
+  * write buffer size (16 KiB) keep their field defaults.
+  */
+ public class WriterPoolSettingsData implements WriterPoolSettings {
+     protected long maxFileSizeBytes;
+     protected String prefix;
+     protected String template;
+     protected List outputDirs;
+     protected boolean compress;
+     protected List metadata;
+     protected boolean frequentFlushes = true;
+     protected int writeBufferSize = 16 * 1024;
+
+     /**
+      * Stores the passed values as given; the lists are not copied.
+      */
+     public WriterPoolSettingsData(String prefix, String template,
+             long maxFileSizeBytes, boolean compress, List outputDirs,
+             List metadata) {
+         this.prefix = prefix;
+         this.template = template;
+         this.maxFileSizeBytes = maxFileSizeBytes;
+         this.compress = compress;
+         this.outputDirs = outputDirs;
+         this.metadata = metadata;
+     }
+
+     @Override
+     public boolean getCompress() {
+         return this.compress;
+     }
+
+     @Override
+     public long getMaxFileSizeBytes() {
+         return this.maxFileSizeBytes;
+     }
+
+     @Override
+     public List getMetadata() {
+         return this.metadata;
+     }
+
+     /** Returns the output directory list exactly as supplied. */
+     @Override
+     public List calcOutputDirs() {
+         return this.outputDirs;
+     }
+
+     @Override
+     public String getPrefix() {
+         return this.prefix;
+     }
+
+     @Override
+     public String getTemplate() {
+         return this.template;
+     }
+
+     @Override
+     public boolean getFrequentFlushes() {
+         return this.frequentFlushes;
+     }
+
+     @Override
+     public int getWriteBufferSize() {
+         return this.writeBufferSize;
+     }
+ }
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/package.html b/src/main/java/org/archive/io/package.html
new file mode 100644
index 00000000..d1798b80
--- /dev/null
+++ b/src/main/java/org/archive/io/package.html
@@ -0,0 +1,9 @@
+
+
+
+org.archive.io.arc package
+
+
+ARC file reading and writing.
+
+
diff --git a/src/main/java/org/archive/io/warc/WARCConstants.java b/src/main/java/org/archive/io/warc/WARCConstants.java
new file mode 100644
index 00000000..83cc8a6d
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/WARCConstants.java
@@ -0,0 +1,24 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.warc;
+
+ /**
+ * Empty extension of {@link org.archive.format.warc.WARCConstants} that adds
+ * no members of its own.
+ *
+ * @deprecated use {@link org.archive.format.warc.WARCConstants} directly.
+ */
+ @Deprecated
+ public interface WARCConstants extends org.archive.format.warc.WARCConstants {
+ }
diff --git a/src/main/java/org/archive/io/warc/WARCReader.java b/src/main/java/org/archive/io/warc/WARCReader.java
new file mode 100644
index 00000000..a34854ef
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/WARCReader.java
@@ -0,0 +1,287 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.warc;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.cli.PosixParser;
+import org.apache.commons.lang.NotImplementedException;
+import org.archive.io.ArchiveReader;
+import org.archive.io.ArchiveRecord;
+
+/**
+ * WARCReader.
+ * Go via {@link WARCReaderFactory} to get instance.
+ * @author stack
+ * @version $Date: 2006-11-27 18:03:03 -0800 (Mon, 27 Nov 2006) $ $Version$
+ */
+public class WARCReader extends ArchiveReader implements WARCConstants {
+ /**
+ * Does nothing beyond invoking the superclass constructor. Being
+ * protected, it is reachable only from subclasses and this package.
+ */
+ protected WARCReader() {
+ super();
+ }
+
+ /**
+ * Runs the superclass initialization and then sets this reader's version
+ * to {@code WARC_VERSION}.
+ *
+ * @param i Passed unchanged to {@code super.initialize}; presumably the
+ * reader identifier (TODO confirm against ArchiveReader).
+ */
+ @Override
+ protected void initialize(String i) {
+ super.initialize(i);
+ setVersion(WARC_VERSION);
+ }
+
+    /**
+     * Consume the record terminator so the underlying stream is lined up on
+     * the start of the next record.
+     *
+     * @param record Record just read; it must have no bytes left available.
+     * @throws IOException If the record still reports available bytes, or if
+     * the next four bytes of the reader's stream are not CRLF CRLF.
+     */
+    protected void gotoEOR(ArchiveRecord record) throws IOException {
+        final boolean exhausted = record.available() == 0;
+        if (!exhausted) {
+            throw new IOException("Record should be exhausted before coming " +
+                "in here");
+        }
+
+        // A record is terminated by two consecutive CRLF pairs.
+        for (int pair = 0; pair < 2; pair++) {
+            for (int idx = 0; idx < 2; idx++) {
+                readExpectedChar(getIn(), CRLF.charAt(idx));
+            }
+        }
+    }
+
+    /**
+     * Read a single byte from the stream and verify it is the expected one.
+     *
+     * @param is Stream to read one byte from.
+     * @param expected Value the byte read must equal.
+     * @throws IOException If the stream is already at its end, or if the byte
+     * read differs from <code>expected</code>.
+     */
+    protected void readExpectedChar(final InputStream is, final int expected)
+    throws IOException {
+        final int c = is.read();
+        if (c == expected) {
+            return;
+        }
+        if (c < 0) {
+            // read() signals end of stream with -1; report that explicitly
+            // instead of printing it as the character 'ffffffff'.
+            throw new IOException("Unexpected end of stream (Expecting " +
+                Integer.toHexString(expected) + ")");
+        }
+        throw new IOException("Unexpected character " +
+            Integer.toHexString(c) + "(Expecting " +
+            Integer.toHexString(expected) + ")");
+    }
+
+    /**
+     * Build a {@link WARCRecord} over the passed stream and register it as
+     * this reader's current record.
+     *
+     * @param is InputStream the new record reads from.
+     * @param offset Absolute offset into the WARC file.
+     * @return Whatever <code>currentRecord</code> hands back for the new
+     * record, cast to WARCRecord.
+     * @throws IOException If constructing the record fails.
+     */
+    protected WARCRecord createArchiveRecord(InputStream is, long offset)
+    throws IOException {
+        final WARCRecord fresh = new WARCRecord(is, getReaderIdentifier(),
+            offset, isDigest(), isStrict());
+        return (WARCRecord) currentRecord(fresh);
+    }
+
+    /**
+     * Print every record of this reader to standard out: first the record
+     * header's <code>toString()</code>, then the record's own dump, then an
+     * empty line.
+     *
+     * @param compress Not consulted by this implementation.
+     * @throws IOException
+     * @throws java.text.ParseException
+     */
+    @Override
+    public void dump(boolean compress)
+    throws IOException, java.text.ParseException {
+        // The iterator must be typed: with a raw Iterator, next() yields
+        // Object and the assignment below does not compile.
+        for (final Iterator<ArchiveRecord> i = iterator(); i.hasNext();) {
+            final ArchiveRecord r = i.next();
+            System.out.println(r.getHeader().toString());
+            r.dump();
+            System.out.println();
+        }
+    }
+
+
+ /**
+ * Not implemented for WARC readers.
+ *
+ * @param f Ignored.
+ * @return Never returns normally.
+ * @throws NotImplementedException Always.
+ */
+ @Override
+ public ArchiveReader getDeleteFileOnCloseReader(final File f) {
+ throw new NotImplementedException("TODO");
+ }
+
+ /**
+ * @return The {@code DOT_WARC_FILE_EXTENSION} constant.
+ */
+ @Override
+ public String getDotFileExtension() {
+ return DOT_WARC_FILE_EXTENSION;
+ }
+
+ /**
+ * @return The {@code WARC_FILE_EXTENSION} constant.
+ */
+ @Override
+ public String getFileExtension() {
+ return WARC_FILE_EXTENSION;
+ }
+
+ // Static methods follow. Mostly for command-line processing.
+
+    /**
+     * Print the command-line help and terminate the JVM.
+     *
+     * @param formatter Help formatter instance.
+     * @param options Usage options.
+     * @param exitCode Exit code handed to <code>System.exit</code>; this
+     * method does not return.
+     */
+    private static void usage(HelpFormatter formatter, Options options,
+            int exitCode) {
+        // This class lives in org.archive.io.warc; the help text previously
+        // named the org.archive.io.arc package.
+        formatter.printHelp("java org.archive.io.warc.WARCReader" +
+            " [--digest=true|false] \\\n" +
+            " [--format=cdx|cdxfile|dump|gzipdump]" +
+            " [--offset=#] \\\n[--strict] [--parse] WARC_FILE|WARC_URL",
+            options);
+        System.exit(exitCode);
+    }
+
+    /**
+     * Write out the content of the passed reader in the requested format.
+     *
+     * @param reader Reader whose content is output.
+     * @param format Format to use outputting.
+     * @throws IOException If the reader reports the format as unsupported,
+     * or on a failure while outputting.
+     * @throws java.text.ParseException
+     */
+    protected static void output(WARCReader reader, String format)
+    throws IOException, java.text.ParseException {
+        final boolean handled = reader.output(format);
+        if (handled) {
+            return;
+        }
+        throw new IOException("Unsupported format: " + format);
+    }
+
+    /**
+     * Generate a CDX index file for a WARC file.
+     *
+     * The reader is opened via {@link WARCReaderFactory#get(String)}, switched
+     * to non-strict parsing with digesting on, and output in the
+     * <code>CDX_FILE</code> format.
+     *
+     * @param urlOrPath The WARC file (path or URL) to generate a CDX index for.
+     * @throws IOException
+     * @throws java.text.ParseException
+     */
+    public static void createCDXIndexFile(String urlOrPath)
+    throws IOException, java.text.ParseException {
+        final WARCReader indexSource = WARCReaderFactory.get(urlOrPath);
+        indexSource.setStrict(false);
+        indexSource.setDigest(true);
+        output(indexSource, CDX_FILE);
+    }
+
+    /**
+     * Command-line interface to WARCReader.
+     *
+     * <p>Here is the command-line interface:
+     * <pre>
+     * usage: java org.archive.io.warc.WARCReader [--offset=#] WARC_FILE
+     *  -h,--help      Prints this message and exits.
+     *  -o,--offset    Outputs record at this offset into the WARC file.
+     * </pre>
+     *
+     * <p>Outputs using a pseudo-CDX format as described here:
+     * CDX Legend and here Example.
+     * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
+     * Hash is hard-coded straight SHA-1 hash of content.
+     *
+     * <p>Options acted on here are <code>h</code> (help), <code>o</code>
+     * (offset), <code>s</code> (strict), <code>d</code> (digest) and
+     * <code>f</code> (format). With a non-negative offset exactly one file
+     * must be passed and only the record at that offset is output; otherwise
+     * each argument is opened and output in turn. Several paths end in
+     * <code>System.exit</code> (help, bad format, wrong argument count, a
+     * RuntimeException while processing a file).
+     *
+     * @param args Command-line arguments.
+     * @throws ParseException Failed parse of the command line.
+     * @throws IOException
+     * @throws java.text.ParseException
+     */
+    public static void main(String [] args)
+    throws ParseException, IOException, java.text.ParseException {
+        final Options options = getOptions();
+        final PosixParser parser = new PosixParser();
+        final CommandLine cmdline = parser.parse(options, args, false);
+        @SuppressWarnings("unchecked")
+        final List<String> cmdlineArgs = cmdline.getArgList();
+        final Option [] cmdlineOptions = cmdline.getOptions();
+        final HelpFormatter formatter = new HelpFormatter();
+
+        // If no args, print help.
+        if (cmdlineArgs.size() <= 0) {
+            usage(formatter, options, 0);
+        }
+
+        // Now look at options passed.
+        long offset = -1;
+        boolean digest = false;
+        boolean strict = false;
+        String format = CDX;
+        for (int i = 0; i < cmdlineOptions.length; i++) {
+            switch(cmdlineOptions[i].getId()) {
+                case 'h':
+                    usage(formatter, options, 0);
+                    break;
+
+                case 'o':
+                    offset =
+                        Long.parseLong(cmdlineOptions[i].getValue());
+                    break;
+
+                case 's':
+                    strict = true;
+                    break;
+
+                case 'd':
+                    digest = getTrueOrFalse(cmdlineOptions[i].getValue());
+                    break;
+
+                case 'f':
+                    format = cmdlineOptions[i].getValue().toLowerCase();
+                    boolean match = false;
+                    // List of supported formats.
+                    final String [] supportedFormats =
+                        {CDX, DUMP, GZIP_DUMP, CDX_FILE};
+                    for (final String supported : supportedFormats) {
+                        if (supported.equals(format)) {
+                            match = true;
+                            break;
+                        }
+                    }
+                    if (!match) {
+                        usage(formatter, options, 1);
+                    }
+                    break;
+
+                default:
+                    // Show the option letter itself; the id was previously
+                    // concatenated as a number (with a stray unary plus).
+                    throw new RuntimeException("Unexpected option: " +
+                        (char) cmdlineOptions[i].getId());
+            }
+        }
+
+        if (offset >= 0) {
+            if (cmdlineArgs.size() != 1) {
+                System.out.println("Error: Pass one arcfile only.");
+                usage(formatter, options, 1);
+            }
+            // NOTE(review): the digest option is not applied on this path.
+            WARCReader r = WARCReaderFactory.get(
+                new File(cmdlineArgs.get(0)), offset);
+            r.setStrict(strict);
+            outputRecord(r, format);
+        } else {
+            for (final String urlOrPath : cmdlineArgs) {
+                try {
+                    WARCReader r = WARCReaderFactory.get(urlOrPath);
+                    r.setStrict(strict);
+                    r.setDigest(digest);
+                    output(r, format);
+                } catch (RuntimeException e) {
+                    // Write out name of file we failed on to help with
+                    // debugging, print the stack trace, then exit with
+                    // status 1: remaining files are NOT processed.
+                    System.err.println("Exception processing " + urlOrPath +
+                        ": " + e.getMessage());
+                    e.printStackTrace(System.err);
+                    System.exit(1);
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/warc/WARCReaderFactory.java b/src/main/java/org/archive/io/warc/WARCReaderFactory.java
new file mode 100644
index 00000000..9c6c7e77
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/WARCReaderFactory.java
@@ -0,0 +1,307 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.warc;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Iterator;
+
+import org.archive.io.ArchiveReader;
+import org.archive.io.ArchiveReaderFactory;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.warc.WARCConstants;
+import org.archive.util.ArchiveUtils;
+import org.archive.util.FileUtils;
+import org.archive.util.zip.GZIPMembersInputStream;
+
+import com.google.common.io.CountingInputStream;
+
+/**
+ * Factory for WARC Readers.
+ * Figures whether to give out a compressed file Reader or an uncompressed
+ * Reader.
+ * @author stack
+ * @version $Date: 2006-08-23 17:59:04 -0700 (Wed, 23 Aug 2006) $ $Version$
+ */
+public class WARCReaderFactory extends ArchiveReaderFactory
+implements WARCConstants {
+ private static final WARCReaderFactory factory = new WARCReaderFactory();
+
+ /**
+ * Shutdown any access to default constructor.
+ * This factory is Singleton.
+ * The one instance is the object held in the static {@code factory} field;
+ * this constructor only invokes the superclass constructor.
+ */
+ private WARCReaderFactory() {
+ super();
+ }
+
+    /**
+     * Open a reader on the WARC named by a path or URL string.
+     *
+     * @param arcFileOrUrl Path or URL of the WARC, handed unchanged to the
+     * factory singleton's <code>getArchiveReader</code>.
+     * @return The reader produced by the factory, cast to WARCReader.
+     * @throws MalformedURLException
+     * @throws IOException
+     */
+    public static WARCReader get(String arcFileOrUrl)
+    throws MalformedURLException, IOException {
+        return (WARCReader) factory.getArchiveReader(arcFileOrUrl);
+    }
+
+    /**
+     * Open a reader on the passed WARC file.
+     *
+     * @param f File to read.
+     * @return The reader produced by the factory singleton, cast to
+     * WARCReader.
+     * @throws IOException
+     */
+    public static WARCReader get(final File f) throws IOException {
+        return (WARCReader) factory.getArchiveReader(f);
+    }
+
+    /**
+     * Open a reader on the passed WARC file, positioned at an offset.
+     *
+     * @param f A WARC file to read.
+     * @param offset Have returned Reader set to start reading at this offset.
+     * @return A WARCReader.
+     * @throws IOException
+     */
+    public static WARCReader get(final File f, final long offset)
+    throws IOException {
+        final ArchiveReader reader =
+            WARCReaderFactory.factory.getArchiveReader(f, offset);
+        return (WARCReader) reader;
+    }
+
+    /**
+     * Pick the reader implementation for a file: gzipped files get a
+     * {@link CompressedWARCReader}; anything else must pass the WARC
+     * extension-and-magic check and gets an {@link UncompressedWARCReader}.
+     *
+     * @param f File to read.
+     * @param offset Offset handed to the reader's constructor.
+     * @return A reader over <code>f</code>.
+     * @throws IOException If the file is neither gzipped nor recognised as a
+     * WARC, or on failure opening it.
+     */
+    protected ArchiveReader getArchiveReader(final File f, final long offset)
+    throws IOException {
+        if (testCompressedWARCFile(f)) {
+            return WARCReaderFactory.factory.new CompressedWARCReader(f,
+                offset);
+        }
+        final boolean looksLikeWarc = FileUtils.isReadableWithExtensionAndMagic(
+            f, DOT_WARC_FILE_EXTENSION, WARC_MAGIC);
+        if (!looksLikeWarc) {
+            throw new IOException(f.getAbsolutePath()
+                + " is not a WARC file.");
+        }
+        return WARCReaderFactory.factory.new UncompressedWARCReader(f, offset);
+    }
+
+    /**
+     * Open a reader over an already-open stream.
+     *
+     * @param s Name for the stream, handed to the factory singleton.
+     * @param is Stream to read.
+     * @param atFirstRecord Handed unchanged to the factory singleton.
+     * @return The reader the factory singleton produces.
+     * @throws IOException
+     */
+    public static ArchiveReader get(final String s, final InputStream is,
+        final boolean atFirstRecord)
+    throws IOException {
+        final ArchiveReader reader =
+            WARCReaderFactory.factory.getArchiveReader(s, is, atFirstRecord);
+        return reader;
+    }
+
+ /**
+ * Always wraps the stream in a {@link CompressedWARCReader}; no check is
+ * made that the stream is actually gzipped.
+ *
+ * @param f Name passed to the reader's constructor.
+ * @param is Stream to read.
+ * @param atFirstRecord Passed to the reader's constructor.
+ * @return A new CompressedWARCReader.
+ * @throws IOException
+ */
+ protected ArchiveReader getArchiveReader(final String f,
+ final InputStream is, final boolean atFirstRecord)
+ throws IOException {
+ // For now, assume stream is compressed. Later add test of input
+ // stream or handle exception thrown when figure not compressed stream.
+ return new CompressedWARCReader(f, is, atFirstRecord);
+ }
+
+    /**
+     * Open a reader on the WARC at the passed URL, positioned at an offset.
+     *
+     * @param arcUrl URL of the WARC.
+     * @param offset Offset handed to the factory singleton.
+     * @return The reader produced by the factory singleton, cast to
+     * WARCReader.
+     * @throws IOException
+     */
+    public static WARCReader get(final URL arcUrl, final long offset)
+    throws IOException {
+        return (WARCReader) factory.getArchiveReader(arcUrl, offset);
+    }
+
+    /**
+     * Get a WARCReader for the WARC at the passed URL.
+     *
+     * Delegates to the factory singleton's <code>getArchiveReader(URL)</code>.
+     * NOTE(review): earlier documentation here said the remote file is pulled
+     * into wherever the System Property <code>java.io.tmpdir</code> points,
+     * that the returned reader points at this local copy, and that closing
+     * the reader removes the copy. That behavior is implemented outside this
+     * class -- confirm it there.
+     *
+     * @param arcUrl An URL that points at a WARC.
+     * @return A WARCReader.
+     * @throws IOException
+     */
+    public static WARCReader get(final URL arcUrl)
+    throws IOException {
+        return (WARCReader) factory.getArchiveReader(arcUrl);
+    }
+
+    /**
+     * Check file is compressed WARC.
+     *
+     * @param f File to test.
+     *
+     * @return True if this is compressed WARC (TODO: Just tests if file is
+     * GZIP'd file (It begins w/ GZIP MAGIC)).
+     *
+     * @exception IOException If the readability assertion on the file fails,
+     * or on failure opening, probing or closing it.
+     */
+    public static boolean testCompressedWARCFile(final File f)
+    throws IOException {
+        FileUtils.assertReadable(f);
+        final InputStream probe = new FileInputStream(f);
+        try {
+            return ArchiveUtils.isGzipped(probe);
+        } finally {
+            // Always release the file handle, gzipped or not.
+            probe.close();
+        }
+    }
+
+    /**
+     * Uncompressed WARC file reader.
+     * @author stack
+     */
+    public class UncompressedWARCReader extends WARCReader {
+        /**
+         * Constructor. Same as passing an offset of 0.
+         * @param f Uncompressed WARC file to read.
+         * @throws IOException
+         */
+        public UncompressedWARCReader(final File f)
+        throws IOException {
+            this(f, 0);
+        }
+
+        /**
+         * Constructor.
+         *
+         * @param f Uncompressed file to read.
+         * @param offset Offset at which to position Reader. Values of zero
+         * or less skip nothing.
+         * @throws IOException
+         */
+        public UncompressedWARCReader(final File f, final long offset)
+        throws IOException {
+            // File has been tested for existence by time it has come to here.
+            setIn(new CountingInputStream(getInputStream(f, offset)));
+            // InputStream.skip is allowed to skip fewer bytes than asked, so
+            // keep going until the full offset is consumed. If the stream
+            // ends first, stop quietly: as before, the problem then surfaces
+            // when the first record is read.
+            long remaining = offset;
+            while (remaining > 0) {
+                final long skipped = getIn().skip(remaining);
+                if (skipped > 0) {
+                    remaining -= skipped;
+                } else if (getIn().read() < 0) {
+                    break;
+                } else {
+                    // skip made no progress but a byte could be read.
+                    remaining--;
+                }
+            }
+            initialize(f.getAbsolutePath());
+        }
+
+        /**
+         * Constructor.
+         *
+         * @param f Name used to initialize this reader.
+         * @param is InputStream to read; it is used from its current position.
+         */
+        public UncompressedWARCReader(final String f, final InputStream is) {
+            // Arc file has been tested for existence by time it has come
+            // to here.
+            setIn(new CountingInputStream(is));
+            initialize(f);
+        }
+    }
+
+ /**
+ * Compressed WARC file reader.
+ *
+ * Reads through a {@code GZIPMembersInputStream}; seeking and iteration are
+ * done in terms of that stream's gzip members.
+ *
+ * @author stack
+ */
+ public class CompressedWARCReader extends WARCReader {
+ /**
+ * Constructor.
+ * Same as passing an offset of 0.
+ *
+ * @param f Compressed file to read.
+ * @throws IOException
+ */
+ public CompressedWARCReader(final File f) throws IOException {
+ this(f, 0);
+ }
+
+ /**
+ * Constructor.
+ * Wraps the file's stream in a GZIPMembersInputStream, seeks it to
+ * {@code offset}, and only then initializes the reader. The reader is
+ * flagged as compressed only when {@code offset} is 0 (see the TODO in
+ * the body).
+ *
+ * @param f Compressed arcfile to read.
+ * @param offset Position at where to start reading file.
+ * @throws IOException
+ */
+ public CompressedWARCReader(final File f, final long offset)
+ throws IOException {
+ // File has been tested for existence by time it has come to here.
+ setIn(new GZIPMembersInputStream(getInputStream(f, offset)));
+ ((GZIPMembersInputStream)getIn()).compressedSeek(offset);
+ setCompressed((offset == 0)); // TODO: does this make sense?!?!
+ initialize(f.getAbsolutePath());
+ }
+
+ /**
+ * Constructor.
+ * Wraps the passed stream in a GZIPMembersInputStream and flags the
+ * reader as compressed. No seek is performed.
+ *
+ * @param f Compressed arcfile.
+ * @param is InputStream to use.
+ * @param atFirstRecord Currently ignored.
+ * @throws IOException
+ */
+ public CompressedWARCReader(final String f, final InputStream is,
+ final boolean atFirstRecord)
+ throws IOException {
+ // Arc file has been tested for existence by time it has come
+ // to here.
+ setIn(new GZIPMembersInputStream(is));
+ setCompressed(true);
+ initialize(f);
+ // TODO: Ignore atFirstRecord. Probably doesn't apply in WARC world.
+ }
+
+ /**
+ * Get record at passed offset.
+ * Cleans up the current record first, then seeks the compressed stream
+ * to {@code offset} before building the record.
+ *
+ * @param offset Byte index into file at which a record starts.
+ * @return A WARCRecord reference.
+ * @throws IOException
+ */
+ public WARCRecord get(long offset) throws IOException {
+ cleanupCurrentRecord();
+ ((GZIPMembersInputStream)getIn()).compressedSeek(offset);
+ return (WARCRecord) createArchiveRecord(getIn(), offset);
+ }
+
+ public Iterator iterator() {
+ /**
+ * Override ArchiveRecordIterator so can base returned iterator on
+ * GzippedInputStream iterator.
+ * hasNext is answered by the gzip member iterator; each next() pulls
+ * the next member and wraps it in a record.
+ */
+ return new ArchiveRecordIterator() {
+ private GZIPMembersInputStream gis =
+ (GZIPMembersInputStream)getIn();
+
+ private Iterator gzipIterator = this.gis.memberIterator();
+
+ protected boolean innerHasNext() {
+ return this.gzipIterator.hasNext();
+ }
+
+ protected ArchiveRecord innerNext() throws IOException {
+ // Get the position before gzipIterator.next moves
+ // it on past the gzip header.
+ InputStream is = (InputStream) this.gzipIterator.next();
+ // The record offset is the larger of the stream's current member
+ // start and member end.
+ return createArchiveRecord(is, Math.max(gis.getCurrentMemberStart(), gis.getCurrentMemberEnd()));
+ }
+ };
+ }
+
+ protected void gotoEOR(ArchiveRecord rec) throws IOException {
+ // Drain the stream until read() reports -1, counting the bytes.
+ long skipped = 0;
+ while (getIn().read()>-1) {
+ skipped++;
+ }
+ // More than four drained bytes is only warned about on stderr.
+ if(skipped>4) {
+ System.err.println("unexpected extra data after record "+rec);
+ }
+ return;
+ }
+ }
+
+    /**
+     * Tell whether a name ends in one of the WARC file extensions, compared
+     * case-insensitively.
+     *
+     * @param f Name to test; may be null.
+     * @return True if <code>f</code> is non-null and its lower-cased form ends
+     * with <code>DOT_COMPRESSED_WARC_FILE_EXTENSION</code> or
+     * <code>DOT_WARC_FILE_EXTENSION</code>.
+     */
+    public static boolean isWARCSuffix(final String f) {
+        if (f == null) {
+            return false;
+        }
+        if (f.toLowerCase().endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION)) {
+            return true;
+        }
+        return f.toLowerCase().endsWith(DOT_WARC_FILE_EXTENSION);
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/warc/WARCRecord.java b/src/main/java/org/archive/io/warc/WARCRecord.java
new file mode 100644
index 00000000..635d1c3b
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/WARCRecord.java
@@ -0,0 +1,233 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.warc;
+
+import it.unimi.dsi.fastutil.io.RepositionableStream;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpParser;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.ArchiveRecordHeader;
+import org.archive.util.LaxHttpParser;
+
+
+/**
+ * A WARC file Record.
+ *
+ * The header line and named fields are parsed at construction time and
+ * exposed through an {@link ArchiveRecordHeader}.
+ *
+ * @author stack
+ */
+public class WARCRecord extends ArchiveRecord implements WARCConstants {
+    /** Matches one whitespace character. Compiled once and shared. */
+    private static final Pattern WHITESPACE = Pattern.compile("\\s");
+
+    /**
+     * Constructor. Digesting is on and strict parsing is off.
+     *
+     * @param in Stream cue'd up to be at the start of the record this instance
+     * is to represent.
+     * @param identifier Identifier for the hosting Reader.
+     * @param offset Absolute offset of the record, recorded in its headers.
+     * @throws IOException
+     */
+    public WARCRecord(InputStream in, final String identifier,
+        final long offset)
+    throws IOException {
+        this(in, identifier, offset, true, false);
+    }
+
+    /**
+     * Constructor. No header parsing is done; the passed headers are used.
+     *
+     * @param in Stream cue'd up just past Header Line and Named Fields.
+     * @param headers Header Line and ANVL Named fields.
+     * @throws IOException
+     */
+    public WARCRecord(InputStream in, ArchiveRecordHeader headers)
+    throws IOException {
+        super(in, headers, 0, true, false);
+    }
+
+    /**
+     * Constructor.
+     *
+     * @param in Stream cue'd up to be at the start of the record this instance
+     * is to represent or, if <code>headers</code> is not null, just past the
+     * Header Line and Named Fields.
+     * @param identifier Identifier for this the hosting Reader.
+     * @param offset Current offset into <code>in</code> (Used to keep
+     * <code>position</code> properly aligned).  Usually 0.
+     * @param digest True if we're to calculate digest for this record.  Not
+     * digesting saves about ~15% of cpu during parse.
+     * @param strict Be strict parsing (Parsing stops if file inproperly
+     * formatted).
+     * @throws IOException
+     */
+    public WARCRecord(final InputStream in, final String identifier,
+        final long offset, boolean digest, boolean strict)
+    throws IOException {
+        super(in, null, 0, digest, strict);
+        setHeader(parseHeaders(in, identifier, offset, strict));
+    }
+
+    /**
+     * Parse WARC Header Line and Named Fields.
+     *
+     * @param in Stream to read.
+     * @param identifier Identifier for the hosting Reader.
+     * @param offset Absolute offset into Reader.
+     * @param strict Whether to be loose parsing or not. Not consulted by
+     * this implementation.
+     * @return An ArchiveRecordHeader.
+     * @throws IOException If no first line can be read or it does not start
+     * with <code>WARC_MAGIC</code>.
+     */
+    protected ArchiveRecordHeader parseHeaders(final InputStream in,
+        final String identifier, final long offset, final boolean strict)
+    throws IOException {
+        final Map<String, Object> m = new HashMap<String, Object>();
+        m.put(ABSOLUTE_OFFSET_KEY, Long.valueOf(offset));
+        m.put(READER_IDENTIFIER_FIELD_KEY, identifier);
+
+        long startPosition = -1;
+        if (in instanceof RepositionableStream) {
+            startPosition = ((RepositionableStream)in).position();
+        }
+        // Do not wrap the line in 'new String(...)': a null line (nothing
+        // left to read) would then raise a NullPointerException before the
+        // check below could turn it into an IOException.
+        final String firstLine =
+            LaxHttpParser.readLine(in, WARC_HEADER_ENCODING);
+        if (firstLine == null || firstLine.length() <= 0) {
+            throw new IOException("Failed to read WARC_MAGIC");
+        }
+        if (!firstLine.startsWith(WARC_MAGIC)) {
+            throw new IOException("Failed to find WARC MAGIC: " + firstLine);
+        }
+        // Here we start reading off the inputstream but we're reading the
+        // stream direct rather than going via WARCRecord#read.  The latter will
+        // keep count of bytes read, digest and fail properly if EOR too soon...
+        // We don't want digesting while reading Headers.
+        //
+        Header [] h = LaxHttpParser.parseHeaders(in, WARC_HEADER_ENCODING);
+        for (int i = 0; i < h.length; i++) {
+            m.put(h[i].getName(), h[i].getValue());
+        }
+        // NOTE(review): when the stream is not a RepositionableStream the
+        // header length stays -1, so the position is incremented by -1 and
+        // getContentBegin() reports -1 -- confirm this is intended.
+        int headerLength = -1;
+        if (in instanceof RepositionableStream) {
+            headerLength =
+                (int)(((RepositionableStream)in).position() - startPosition);
+        }
+        final int contentOffset = headerLength;
+        incrementPosition(contentOffset);
+
+        return new ArchiveRecordHeader() {
+            private final Map<String, Object> headers = m;
+            private final int contentBegin = contentOffset;
+
+            public String getDate() {
+                return (String)this.headers.get(HEADER_KEY_DATE);
+            }
+
+            public String getDigest() {
+                return null;
+                // TODO: perhaps return block-digest?
+                // superclass def implies this is calculated ("only after
+                // read in totality"), not pulled from header, so
+                // below prior implementation was misleading
+//                return (String)this.headers.get(HEADER_KEY_CHECKSUM);
+            }
+
+            public String getReaderIdentifier() {
+                return (String)this.headers.get(READER_IDENTIFIER_FIELD_KEY);
+            }
+
+            public Set getHeaderFieldKeys() {
+                return this.headers.keySet();
+            }
+
+            public Map getHeaderFields() {
+                return this.headers;
+            }
+
+            public Object getHeaderValue(String key) {
+                return this.headers.get(key);
+            }
+
+            // Returns just the Content-Length of the warc record,
+            // or -1 when the header is absent.
+            public long getContentLength() {
+                Object o = this.headers.get(CONTENT_LENGTH);
+                if (o == null) {
+                    return -1;
+                }
+                long contentLength = (o instanceof Long)?
+                    ((Long)o).longValue(): Long.parseLong((String)o);
+                return contentLength;
+            }
+
+            // Returns the full record length: content length plus the
+            // measured header length.
+            public long getLength()
+            {
+                return getContentLength() + contentOffset;
+            }
+
+            public String getMimetype() {
+                return (String)this.headers.get(CONTENT_TYPE);
+            }
+
+            public long getOffset() {
+                Object o = this.headers.get(ABSOLUTE_OFFSET_KEY);
+                if (o == null) {
+                    return -1;
+                }
+                return (o instanceof Long)?
+                    ((Long)o).longValue(): Long.parseLong((String)o);
+            }
+
+            public String getRecordIdentifier() {
+                return (String)this.headers.get(RECORD_IDENTIFIER_FIELD_KEY);
+            }
+
+            public String getUrl() {
+                return (String)this.headers.get(HEADER_KEY_URI);
+            }
+
+            public String getVersion() {
+                return (String)this.headers.get(VERSION_FIELD_KEY);
+            }
+
+            public int getContentBegin() {
+                return this.contentBegin;
+            }
+
+            @Override
+            public String toString() {
+                return this.headers.toString();
+            }
+        };
+    }
+
+    /**
+     * @param h Header handed on to the superclass implementation.
+     * @return The superclass's mimetype with every whitespace character
+     * removed.
+     */
+    @Override
+    protected String getMimetype4Cdx(ArchiveRecordHeader h) {
+        final String m = super.getMimetype4Cdx(h);
+        // Mimetypes can have spaces in WARCs.  Emitting for CDX, just
+        // squash them for now.  Later, quote them since squashing spaces won't
+        // work for params that have quoted-string values.
+        Matcher matcher = WHITESPACE.matcher(m);
+        return matcher.replaceAll("");
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/warc/WARCRecordInfo.java b/src/main/java/org/archive/io/warc/WARCRecordInfo.java
new file mode 100644
index 00000000..a6198c44
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/WARCRecordInfo.java
@@ -0,0 +1,139 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io.warc;
+
+import java.io.InputStream;
+import java.net.URI;
+
+import org.archive.format.warc.WARCConstants.WARCRecordType;
+import org.archive.util.anvl.ANVLRecord;
+
+/**
+ * Plain mutable holder for the pieces of information describing one WARC
+ * record: type, URL, creation date, mimetype, record id, extra headers,
+ * content stream and length, plus the WARC file name and offset. Every
+ * accessor simply reads or writes the matching field.
+ */
+public class WARCRecordInfo {
+
+    protected WARCRecordType type;
+    protected String url;
+    protected String create14DigitDate;
+    protected String mimetype;
+    protected URI recordId;
+    protected ANVLRecord extraHeaders;
+    protected InputStream contentStream;
+    protected long contentLength;
+    protected boolean enforceLength;
+    protected String warcFilename;
+    protected Long warcFileOffset;
+
+    // --- record type ---
+
+    public WARCRecordType getType() {
+        return this.type;
+    }
+
+    public void setType(WARCRecordType type) {
+        this.type = type;
+    }
+
+    // --- url ---
+
+    public String getUrl() {
+        return this.url;
+    }
+
+    public void setUrl(String url) {
+        this.url = url;
+    }
+
+    // --- creation date ---
+
+    public String getCreate14DigitDate() {
+        return this.create14DigitDate;
+    }
+
+    public void setCreate14DigitDate(String create14DigitDate) {
+        this.create14DigitDate = create14DigitDate;
+    }
+
+    // --- mimetype ---
+
+    public String getMimetype() {
+        return this.mimetype;
+    }
+
+    public void setMimetype(String mimetype) {
+        this.mimetype = mimetype;
+    }
+
+    // --- record id ---
+
+    public URI getRecordId() {
+        return this.recordId;
+    }
+
+    public void setRecordId(URI recordId) {
+        this.recordId = recordId;
+    }
+
+    // --- extra headers ---
+
+    public ANVLRecord getExtraHeaders() {
+        return this.extraHeaders;
+    }
+
+    public void setExtraHeaders(ANVLRecord extraHeaders) {
+        this.extraHeaders = extraHeaders;
+    }
+
+    /**
+     * Append one label/value pair to the extra headers, creating the
+     * underlying record on first use.
+     */
+    public void addExtraHeader(String label, String value) {
+        ANVLRecord headers = this.extraHeaders;
+        if (headers == null) {
+            headers = new ANVLRecord();
+            this.extraHeaders = headers;
+        }
+        headers.addLabelValue(label, value);
+    }
+
+    // --- content ---
+
+    public InputStream getContentStream() {
+        return this.contentStream;
+    }
+
+    public void setContentStream(InputStream contentStream) {
+        this.contentStream = contentStream;
+    }
+
+    public long getContentLength() {
+        return this.contentLength;
+    }
+
+    public void setContentLength(long contentLength) {
+        this.contentLength = contentLength;
+    }
+
+    // --- enforce-length flag (exposed under both accessor spellings) ---
+
+    public boolean isEnforceLength() {
+        return this.enforceLength;
+    }
+
+    public boolean getEnforceLength() {
+        return this.enforceLength;
+    }
+
+    public void setEnforceLength(boolean enforceLength) {
+        this.enforceLength = enforceLength;
+    }
+
+    // --- location within the WARC file ---
+
+    public String getWARCFilename() {
+        return this.warcFilename;
+    }
+
+    public void setWARCFilename(String warcFilenameWithoutOccupiedSuffix) {
+        this.warcFilename = warcFilenameWithoutOccupiedSuffix;
+    }
+
+    public Long getWARCFileOffset() {
+        return this.warcFileOffset;
+    }
+
+    public void setWARCFileOffset(Long startPosition) {
+        this.warcFileOffset = startPosition;
+    }
+}
diff --git a/src/main/java/org/archive/io/warc/WARCWriter.java b/src/main/java/org/archive/io/warc/WARCWriter.java
new file mode 100644
index 00000000..b9558263
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/WARCWriter.java
@@ -0,0 +1,436 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.warc;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.net.URI;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.commons.lang.StringUtils;
+import org.archive.io.ArchiveFileConstants;
+import org.archive.io.UTF8Bytes;
+import org.archive.io.WriterPoolMember;
+import org.archive.util.ArchiveUtils;
+import org.archive.util.anvl.Element;
+
+
+/**
+ * WARC implementation.
+ *
+ * <p>Assumption is that the caller is managing access to this
+ * WARCWriter ensuring only one thread accessing this WARC instance
+ * at any one time.
+ *
+ * <p>While being written, WARCs have a '.open' suffix appended.
+ *
+ * @contributor stack
+ * @version $Revision: 4604 $ $Date: 2006-09-05 22:38:18 -0700 (Tue, 05 Sep 2006) $
+ */
+public class WARCWriter extends WriterPoolMember
+implements WARCConstants {
+ public static final String TOTALS = "totals";
+ public static final String SIZE_ON_DISK = "sizeOnDisk";
+ public static final String TOTAL_BYTES = "totalBytes";
+ public static final String CONTENT_BYTES = "contentBytes";
+ public static final String NUM_RECORDS = "numRecords";
+
+ private static final Logger logger =
+ Logger.getLogger(WARCWriter.class.getName());
+
+ /**
+ * CRLF line terminator as bytes, encoded with DEFAULT_ENCODING.
+ *
+ * NOTE(review): the field is public and non-final and the array is
+ * mutable, so any caller can alter it -- treat it as read-only.
+ */
+ public static byte [] CRLF_BYTES;
+ static {
+ try {
+ CRLF_BYTES = CRLF.getBytes(DEFAULT_ENCODING);
+ } catch(Exception e) {
+ // Best effort: on failure the stack trace is printed and
+ // CRLF_BYTES is left null.
+ e.printStackTrace();
+ }
+ };
+
+ /**
+ * Temporarily accumulates stats managed externally by
+ * {@link WARCWriterProcessor}. WARCWriterProcessor will call
+ * {@link #resetTmpStats()}, write some records, then add
+ * {@link #getTmpStats()} into its long-term running totals.
+ *
+ * <p>Outer key is the record type name (plus {@link #TOTALS}); the inner
+ * map goes from stat name ({@link #NUM_RECORDS}, {@link #CONTENT_BYTES},
+ * {@link #TOTAL_BYTES}, {@link #SIZE_ON_DISK}) to its accumulated value.
+ * Created lazily on the first tally, so it may be null.
+ */
+ private Map<String, Map<String, Long>> tmpStats;
+
+ /** Temporarily accumulates info on written warc records for use externally. */
+ private LinkedList<WARCRecordInfo> tmpRecordLog = new LinkedList<WARCRecordInfo>();
+
+ /**
+ * Constructor.
+ * Takes a stream. Use with caution. There is no upperbound check on size.
+ * Will just keep writing. Only pass Streams that are bounded.
+ * @param serialNo used to generate unique file name sequences
+ * @param out Where to write.
+ * @param f File the out is connected to.
+ * @param settings Writer pool settings, passed through to the superclass.
+ * @throws IOException
+ */
+ public WARCWriter(final AtomicInteger serialNo,
+ final OutputStream out, final File f,
+ final WARCWriterPoolSettings settings)
+ throws IOException {
+ super(serialNo, out, f, settings);
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param serialNo used to generate unique file name sequences
+ * @param settings Writer pool settings, passed through to the superclass
+ * together with WARC_FILE_EXTENSION.
+ */
+ public WARCWriter(final AtomicInteger serialNo,
+ final WARCWriterPoolSettings settings) {
+ super(serialNo, settings, WARC_FILE_EXTENSION);
+ }
+
+ /**
+ * Creates the file via the superclass, then immediately writes a
+ * warcinfo record naming it.
+ *
+ * @param file passed through to the superclass implementation
+ * @return the filename returned by the superclass
+ * @throws IOException if file creation or the warcinfo write fails
+ */
+ @Override
+ protected String createFile(File file) throws IOException {
+ String filename = super.createFile(file);
+ writeWarcinfoRecord(filename);
+ return filename;
+ }
+
+ /**
+ * Rejects ISO control characters.
+ *
+ * @param c character to test
+ * @param parameter the full string being checked; only used in the
+ * error message
+ * @throws IllegalArgumentException if c is an ISO control character
+ */
+ protected void baseCharacterCheck(final char c, final String parameter)
+         throws IllegalArgumentException {
+     // TODO: Too strict? UNICODE control characters?
+     // The former "!Character.isValidCodePoint(c)" test was dropped: every
+     // char value is a valid code point, so it could never be true.
+     if (Character.isISOControl(c)) {
+         throw new IllegalArgumentException("Contains illegal character 0x" +
+             Integer.toHexString(c) + ": " + parameter);
+     }
+ }
+
+ /**
+ * Validates a header value: no character may fail
+ * {@link #baseCharacterCheck(char, String)} and no character may be
+ * white space.
+ *
+ * @param value header value to validate
+ * @return the same value, unchanged, when valid
+ * @throws IllegalArgumentException on the first offending character
+ */
+ protected String checkHeaderValue(final String value)
+         throws IllegalArgumentException {
+     for (final char current : value.toCharArray()) {
+         baseCharacterCheck(current, value);
+         if (!Character.isWhitespace(current)) {
+             continue;
+         }
+         throw new IllegalArgumentException("Contains disallowed white space 0x" +
+             Integer.toHexString(current) + ": " + value);
+     }
+     return value;
+ }
+
+ protected String checkHeaderLineMimetypeParameter(final String parameter)
+ throws IllegalArgumentException {
+ StringBuilder sb = new StringBuilder(parameter.length());
+ boolean wasWhitespace = false;
+ for (int i = 0; i < parameter.length(); i++) {
+ char c = parameter.charAt(i);
+ if (Character.isWhitespace(c)) {
+ // Map all to ' ' and collapse multiples into one.
+ // TODO: Make sure white space occurs in legal location --
+ // before parameter or inside quoted-string.
+ if (wasWhitespace) {
+ continue;
+ }
+ wasWhitespace = true;
+ c = ' ';
+ } else {
+ wasWhitespace = false;
+ baseCharacterCheck(c, parameter);
+ }
+ sb.append(c);
+ }
+
+ return sb.toString();
+ }
+
+ /**
+ * Builds the textual header block of a WARC record, one CRLF-terminated
+ * line per field: version line, type, optional subject URI, date, any
+ * extra headers, record id, content type (only when content length is
+ * positive) and content length.
+ *
+ * @param metaRecord source of all header values
+ * @return the header block, without the blank separator line
+ * @throws IllegalArgumentException if the URL or mimetype contains a
+ * disallowed character
+ */
+ protected String createRecordHeader(WARCRecordInfo metaRecord)
+         throws IllegalArgumentException {
+     final StringBuilder header =
+         new StringBuilder(2048/*A SWAG: TODO: Do analysis.*/);
+     header.append(WARC_ID).append(CRLF);
+     header.append(HEADER_KEY_TYPE).append(COLON_SPACE)
+         .append(metaRecord.getType()).append(CRLF);
+
+     // Do not write a subject-uri if not one present.
+     final String subjectUri = metaRecord.getUrl();
+     if (!StringUtils.isEmpty(subjectUri)) {
+         header.append(HEADER_KEY_URI).append(COLON_SPACE)
+             .append(checkHeaderValue(subjectUri)).append(CRLF);
+     }
+
+     header.append(HEADER_KEY_DATE).append(COLON_SPACE)
+         .append(metaRecord.getCreate14DigitDate()).append(CRLF);
+
+     if (metaRecord.getExtraHeaders() != null) {
+         final Iterator<?> extras = metaRecord.getExtraHeaders().iterator();
+         while (extras.hasNext()) {
+             header.append(extras.next()).append(CRLF);
+         }
+     }
+
+     header.append(HEADER_KEY_ID).append(COLON_SPACE).append('<')
+         .append(metaRecord.getRecordId().toString()).append('>').append(CRLF);
+
+     final long length = metaRecord.getContentLength();
+     if (length > 0) {
+         header.append(CONTENT_TYPE).append(COLON_SPACE)
+             .append(checkHeaderLineMimetypeParameter(metaRecord.getMimetype()))
+             .append(CRLF);
+     }
+     header.append(CONTENT_LENGTH).append(COLON_SPACE)
+         .append(Long.toString(length)).append(CRLF);
+
+     return header.toString();
+ }
+
+ /**
+ * Writes one WARC record: header block, optional body, then the two
+ * trailing CRLFs. Afterwards tallies stats and appends the record to
+ * the temporary record log.
+ *
+ * <p>If the header cannot be built because a value contains an illegal
+ * character, the failure is logged at SEVERE and nothing is written.
+ *
+ * @param recordInfo description of the record; on success its WARC
+ * filename and file offset are filled in
+ * @throws IllegalArgumentException if content length is zero and there
+ * are no extra headers
+ * @throws IOException if writing fails
+ */
+ public void writeRecord(WARCRecordInfo recordInfo)
+         throws IOException {
+
+     if (recordInfo.getContentLength() == 0 &&
+             (recordInfo.getExtraHeaders() == null || recordInfo.getExtraHeaders().size() <= 0)) {
+         throw new IllegalArgumentException("Cannot write record " +
+             "of content-length zero and base headers only.");
+     }
+
+     String header;
+     try {
+         header = createRecordHeader(recordInfo);
+     } catch (IllegalArgumentException e) {
+         // Fix: a space was missing before "for URL:", which glued the
+         // record type and the label together in the log line.
+         logger.log(Level.SEVERE, "could not write record type: " + recordInfo.getType()
+             + " for URL: " + recordInfo.getUrl(), e);
+         return;
+     }
+
+     long contentBytes = 0;
+     long totalBytes = 0;
+     long startPosition;
+
+     try {
+         startPosition = getPosition();
+         preWriteRecordTasks();
+
+         // TODO: Revisit encoding of header.
+         byte[] bytes = header.getBytes(WARC_HEADER_ENCODING);
+         write(bytes);
+         totalBytes += bytes.length;
+
+         if (recordInfo.getContentStream() != null && recordInfo.getContentLength() > 0) {
+             // Write out the header/body separator.
+             write(CRLF_BYTES); // TODO: should this be written even for zero-length?
+             totalBytes += CRLF_BYTES.length;
+             contentBytes += copyFrom(recordInfo.getContentStream(),
+                 recordInfo.getContentLength(),
+                 recordInfo.getEnforceLength());
+             totalBytes += contentBytes;
+         }
+
+         // Write out the two blank lines at end of all records.
+         write(CRLF_BYTES);
+         write(CRLF_BYTES);
+         totalBytes += 2 * CRLF_BYTES.length;
+
+         tally(recordInfo.getType(), contentBytes, totalBytes, getPosition() - startPosition);
+
+         recordInfo.setWARCFilename(getFilenameWithoutOccupiedSuffix());
+         recordInfo.setWARCFileOffset(startPosition);
+         tmpRecordLog.add(recordInfo);
+     } finally {
+         postWriteRecordTasks();
+     }
+ }
+
+ /**
+ * @return the current file's name with a trailing
+ * ArchiveFileConstants.OCCUPIED_SUFFIX removed if present
+ */
+ public String getFilenameWithoutOccupiedSuffix() {
+     final String name = getFile().getName();
+     final String suffix = ArchiveFileConstants.OCCUPIED_SUFFIX;
+     return name.endsWith(suffix)
+         ? name.substring(0, name.length() - suffix.length())
+         : name;
+ }
+
+ /**
+ * Adds one record's figures to the temporary stats, both under the
+ * record type's own entry and under {@link #TOTALS}. Creates the stats
+ * map and the sub-maps on first use.
+ *
+ * @param warcRecordType type of the record just written
+ * @param contentBytes bytes of record content written
+ * @param totalBytes bytes written including header and separators
+ * @param sizeOnDisk change in file position caused by the record
+ */
+ // if compression is enabled, sizeOnDisk means compressed bytes; if not, it
+ // should be the same as totalBytes (right?)
+ protected void tally(WARCRecordType warcRecordType, long contentBytes, long totalBytes, long sizeOnDisk) {
+     if (tmpStats == null) {
+         tmpStats = new HashMap<String, Map<String, Long>>();
+     }
+
+     // add to stats for this record type
+     Map<String, Long> substats = tmpStats.get(warcRecordType.toString());
+     if (substats == null) {
+         substats = new HashMap<String, Long>();
+         tmpStats.put(warcRecordType.toString(), substats);
+     }
+     subtally(substats, contentBytes, totalBytes, sizeOnDisk);
+
+     // add to totals
+     substats = tmpStats.get(TOTALS);
+     if (substats == null) {
+         substats = new HashMap<String, Long>();
+         tmpStats.put(TOTALS, substats);
+     }
+     subtally(substats, contentBytes, totalBytes, sizeOnDisk);
+ }
+
+ /**
+ * Adds one record's figures to a single stats map: increments
+ * {@link #NUM_RECORDS} by one and adds the byte counts to
+ * {@link #CONTENT_BYTES}, {@link #TOTAL_BYTES} and {@link #SIZE_ON_DISK}.
+ * A missing entry is treated as zero.
+ *
+ * @param substats stats map to update in place
+ * @param contentBytes amount to add to CONTENT_BYTES
+ * @param totalBytes amount to add to TOTAL_BYTES
+ * @param sizeOnDisk amount to add to SIZE_ON_DISK
+ */
+ protected void subtally(Map<String, Long> substats, long contentBytes,
+         long totalBytes, long sizeOnDisk) {
+
+     Long prior = substats.get(NUM_RECORDS);
+     substats.put(NUM_RECORDS, prior == null ? 1L : prior + 1);
+
+     prior = substats.get(CONTENT_BYTES);
+     substats.put(CONTENT_BYTES, prior == null ? contentBytes : prior + contentBytes);
+
+     prior = substats.get(TOTAL_BYTES);
+     substats.put(TOTAL_BYTES, prior == null ? totalBytes : prior + totalBytes);
+
+     prior = substats.get(SIZE_ON_DISK);
+     substats.put(SIZE_ON_DISK, prior == null ? sizeOnDisk : prior + sizeOnDisk);
+ }
+
+ /**
+ * Asks the settings' record id generator for an id qualified by the
+ * given map.
+ *
+ * @param qualifiers passed through to getQualifiedRecordID
+ * @return the generated record id
+ * @throws IOException declared for callers; NOTE(review): not evident
+ * from this method which call raises it
+ */
+ protected URI generateRecordId(final Map qualifiers)
+ throws IOException {
+ // settings is assumed to be a WARCWriterPoolSettings; the cast fails
+ // with ClassCastException otherwise.
+ return ((WARCWriterPoolSettings)settings).getRecordIDGenerator().getQualifiedRecordID(qualifiers);
+ }
+
+ /**
+ * Asks the settings' record id generator for an id qualified by a
+ * single key/value pair.
+ *
+ * @param key qualifier name, passed through to getQualifiedRecordID
+ * @param value qualifier value, passed through to getQualifiedRecordID
+ * @return the generated record id
+ * @throws IOException declared for callers; NOTE(review): not evident
+ * from this method which call raises it
+ */
+ protected URI generateRecordId(final String key, final String value)
+ throws IOException {
+ // settings is assumed to be a WARCWriterPoolSettings; the cast fails
+ // with ClassCastException otherwise.
+ return ((WARCWriterPoolSettings)settings).getRecordIDGenerator().getQualifiedRecordID(key, value);
+ }
+
+ /**
+ * Writes a warcinfo record with no description.
+ *
+ * @param filename name of the WARC file the record describes
+ * @return id of the record written
+ * @throws IOException if the write fails
+ */
+ public URI writeWarcinfoRecord(String filename)
+ throws IOException {
+ return writeWarcinfoRecord(filename, null);
+ }
+
+ /**
+ * Writes a warcinfo record describing the given file. The body is the
+ * concatenation of the settings' metadata entries encoded as UTF-8, or
+ * a placeholder string when the settings carry no metadata.
+ *
+ * @param filename name of the WARC file; a trailing occupied ('.open')
+ * suffix is stripped before it is written as a header
+ * @param description optional content description header; skipped when
+ * null or empty
+ * @return id of the record written
+ * @throws IOException if encoding or writing fails
+ */
+ public URI writeWarcinfoRecord(String filename, final String description)
+         throws IOException {
+     WARCRecordInfo recordInfo = new WARCRecordInfo();
+     recordInfo.setType(WARCRecordType.warcinfo);
+     recordInfo.setCreate14DigitDate(ArchiveUtils.getLog14Date());
+     recordInfo.setMimetype("application/warc-fields");
+
+     // Strip .open suffix if present.
+     if (filename.endsWith(WriterPoolMember.OCCUPIED_SUFFIX)) {
+         filename = filename.substring(0,
+             filename.length() - WriterPoolMember.OCCUPIED_SUFFIX.length());
+     }
+     recordInfo.addExtraHeader(HEADER_KEY_FILENAME, filename);
+     if (description != null && description.length() > 0) {
+         recordInfo.addExtraHeader(CONTENT_DESCRIPTION, description);
+     }
+
+     // Add warcinfo body.
+     byte [] warcinfoBody = null;
+     if (settings.getMetadata() == null) {
+         // TODO: What to write into a warcinfo? What to associate?
+         // Encode explicitly, as the metadata branch below does, rather
+         // than relying on the platform default charset.
+         warcinfoBody = "TODO: Unimplemented".getBytes(UTF8Bytes.UTF8);
+     } else {
+         ByteArrayOutputStream baos = new ByteArrayOutputStream();
+         for (final Iterator<?> i = settings.getMetadata().iterator();
+                 i.hasNext();) {
+             baos.write(i.next().toString().getBytes(UTF8Bytes.UTF8));
+         }
+         warcinfoBody = baos.toByteArray();
+     }
+     recordInfo.setContentStream(new ByteArrayInputStream(warcinfoBody));
+     recordInfo.setContentLength((long) warcinfoBody.length);
+     recordInfo.setEnforceLength(true);
+
+     recordInfo.setRecordId(generateRecordId(TYPE, WARCRecordType.warcinfo.toString()));
+
+     writeRecord(recordInfo);
+
+     // TODO: If at start of file, and we're writing compressed,
+     // write out our distinctive GZIP extensions.
+     return recordInfo.getRecordId();
+ }
+
+ /**
+ * Sets every accumulated stat value back to zero, keeping the existing
+ * keys. Does nothing if no stats have been tallied yet.
+ *
+ * @see WARCWriter#tmpStats for usage model
+ */
+ public void resetTmpStats() {
+     if (tmpStats == null) {
+         return;
+     }
+     for (Map<String, Long> substats : tmpStats.values()) {
+         for (Entry<String, Long> entry : substats.entrySet()) {
+             entry.setValue(0L);
+         }
+     }
+ }
+
+ public Map> getTmpStats() {
+ return tmpStats;
+ }
+
+ public static long getStat(Map> map, String key,
+ String subkey) {
+ if (map != null && map.get(key) != null
+ && map.get(key).get(subkey) != null) {
+ return map.get(key).get(subkey);
+ } else {
+ return 0l;
+ }
+ }
+
+ /**
+ * Null-safe lookup of one counter in a two-level concurrent stats map.
+ * Each level is fetched once, so an entry removed by another thread
+ * between the null check and the read cannot cause a
+ * NullPointerException.
+ *
+ * @param map stats map, may be null
+ * @param key outer key
+ * @param subkey inner key
+ * @return the counter's current value, or 0 if the map, the sub-map or
+ * the counter is missing
+ */
+ public static long getStat(
+         ConcurrentMap<String, ConcurrentMap<String, AtomicLong>> map,
+         String key, String subkey) {
+     if (map == null) {
+         return 0L;
+     }
+     final ConcurrentMap<String, AtomicLong> substats = map.get(key);
+     if (substats == null) {
+         return 0L;
+     }
+     final AtomicLong counter = substats.get(subkey);
+     return counter == null ? 0L : counter.get();
+ }
+
+ /**
+ * Empties the temporary log of written records.
+ */
+ public void resetTmpRecordLog() {
+ tmpRecordLog.clear();
+ }
+
+ /**
+ * @return the live temporary log of written records (not a copy), in
+ * the order they were written
+ */
+ public Iterable<WARCRecordInfo> getTmpRecordLog() {
+     return tmpRecordLog;
+ }
+}
diff --git a/src/main/java/org/archive/io/warc/WARCWriterPool.java b/src/main/java/org/archive/io/warc/WARCWriterPool.java
new file mode 100644
index 00000000..fdc97162
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/WARCWriterPool.java
@@ -0,0 +1,64 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io.warc;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.archive.io.WriterPool;
+import org.archive.io.WriterPoolMember;
+
+
+/**
+ * A pool of WARCWriters.
+ * @contributor stack
+ * @contributor gojomo
+ * @version $Revision: 4566 $ $Date: 2006-08-31 09:51:41 -0700 (Thu, 31 Aug 2006) $
+ */
+public class WARCWriterPool extends WriterPool {
+ /**
+ * Constructor. Starts file serial numbering from a fresh
+ * AtomicInteger.
+ * @param settings Settings for this pool.
+ * @param poolMaximumActive passed through to the superclass; presumably
+ * the maximum number of active writers (TODO confirm)
+ * @param poolMaximumWait passed through to the superclass; presumably
+ * the maximum wait for a free writer (TODO confirm units)
+ */
+ public WARCWriterPool(final WARCWriterPoolSettings settings,
+ final int poolMaximumActive, final int poolMaximumWait) {
+ this(new AtomicInteger(), settings, poolMaximumActive, poolMaximumWait);
+ }
+
+ /**
+ * Constructor
+ * @param serial Used to generate unique filename sequences
+ * @param settings Settings for this pool.
+ * @param poolMaximumActive passed through to the superclass; presumably
+ * the maximum number of active writers (TODO confirm)
+ * @param poolMaximumWait passed through to the superclass; presumably
+ * the maximum wait for a free writer (TODO confirm units)
+ */
+ public WARCWriterPool(final AtomicInteger serial,
+ final WARCWriterPoolSettings settings,
+ final int poolMaximumActive, final int poolMaximumWait) {
+ super(serial, settings, poolMaximumActive, poolMaximumWait);
+ }
+
+ /* (non-Javadoc)
+ * Creates a new WARCWriter sharing this pool's serial number source and
+ * settings.
+ * @see org.archive.io.WriterPool#makeWriter()
+ */
+ protected WriterPoolMember makeWriter() {
+ return new WARCWriter(serialNo, (WARCWriterPoolSettings)settings);
+ }
+}
diff --git a/src/main/java/org/archive/io/warc/WARCWriterPoolSettings.java b/src/main/java/org/archive/io/warc/WARCWriterPoolSettings.java
new file mode 100644
index 00000000..b028a8b7
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/WARCWriterPoolSettings.java
@@ -0,0 +1,32 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io.warc;
+
+import org.archive.io.WriterPoolSettings;
+import org.archive.uid.RecordIDGenerator;
+
+/**
+ * Settings object for a {@link WARCWriterPool}.
+ * Used creating {@link WARCWriter}s.
+ *
+ * @version $Date: 2010-08-19 17:21:43 -0700 (Thu, 19 Aug 2010) $, $Revision: 6927 $
+ */
+public interface WARCWriterPoolSettings extends WriterPoolSettings {
+ /**
+ * @return the generator writers use to create record ids
+ */
+ public RecordIDGenerator getRecordIDGenerator();
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/warc/WARCWriterPoolSettingsData.java b/src/main/java/org/archive/io/warc/WARCWriterPoolSettingsData.java
new file mode 100644
index 00000000..d56c9971
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/WARCWriterPoolSettingsData.java
@@ -0,0 +1,40 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io.warc;
+
+import java.io.File;
+import java.util.List;
+
+import org.archive.io.arc.WriterPoolSettingsData;
+import org.archive.uid.RecordIDGenerator;
+
+/**
+ * Plain data holder implementing {@link WARCWriterPoolSettings}: adds a
+ * record id generator to the values held by WriterPoolSettingsData.
+ */
+public class WARCWriterPoolSettingsData extends WriterPoolSettingsData implements WARCWriterPoolSettings {
+ /** Generator handed out by {@link #getRecordIDGenerator()}. */
+ RecordIDGenerator generator;
+
+ /**
+ * All arguments except generator are passed straight through to the
+ * superclass constructor.
+ *
+ * @param prefix passed to the superclass
+ * @param template passed to the superclass
+ * @param maxFileSizeBytes passed to the superclass
+ * @param compress passed to the superclass
+ * @param outputDirs passed to the superclass
+ * @param metadata passed to the superclass
+ * @param generator record id generator to expose
+ */
+ public WARCWriterPoolSettingsData(String prefix, String template,
+ long maxFileSizeBytes, boolean compress, List outputDirs,
+ List metadata, RecordIDGenerator generator) {
+ super(prefix,template,maxFileSizeBytes,compress,outputDirs,metadata);
+ this.generator = generator;
+ }
+ @Override
+ public RecordIDGenerator getRecordIDGenerator() {
+ return generator;
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/warc/package.html b/src/main/java/org/archive/io/warc/package.html
new file mode 100644
index 00000000..f52aa95b
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/package.html
@@ -0,0 +1,38 @@
+
+
+
+org.archive.io.warc package
+
+
+Experimental WARC Writer and Readers. Code and specification subject to change
+with no guarantees of backward compatibility: i.e. newer readers
+may not be able to parse WARCs written with older writers. This package
+contains prototyping code for revision 0.12 of the WARC specification.
+See latest revision
+for current state (Version 0.10 code and its documentation has been moved into the
+v10 subpackage).
+
+
+<h2>Implementation Notes</h2>
+<h3>Tools</h3>
+<p>Initial implementations of <code>Arc2Warc</code> and <code>Warc2Arc</code>
+tools can be found in the package above this one, at
+{@link org.archive.io.Arc2Warc} and {@link org.archive.io.Warc2Arc}
+respectively. Pass --help to learn how to use each tool.
+
+
+<h3>TODO</h3>
+<ul>
+<li>Is MIME-Version header needed? MIME Parsers seem fine without (python email
+lib and java mail).
+<li>Should we write out a <code>Content-Transfer-Encoding</code>
+header (Currently we do not). Need section in spec. explicit about our
+interpretation of MIME and deviations (e.g. content-transfer-encoding should
+be assumed binary in case of WARCs, multipart is not disallowed but not
+encouraged, etc.)
+<li>Minor: Do WARC-Version: 0.12 like MIME-Version: 1.0 rather than
+WARC/0.12 for lead in to an ARCRecord?
+</ul>
+
+
+
diff --git a/src/main/java/org/archive/net/DownloadURLConnection.java b/src/main/java/org/archive/net/DownloadURLConnection.java
new file mode 100644
index 00000000..fbcee421
--- /dev/null
+++ b/src/main/java/org/archive/net/DownloadURLConnection.java
@@ -0,0 +1,131 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.net;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.net.URLConnection;
+import java.util.Arrays;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.archive.util.ProcessUtils;
+import org.archive.util.ProcessUtils.ProcessResult;
+
+/**
+ * An URL Connection that pre-downloads URL reference before passing back a
+ * Stream reference. When closed, it removes the local download file.
+ * @author stack
+ * @version $Date$, $Revision$
+ */
+public abstract class DownloadURLConnection extends URLConnection {
+ private final String CLASSNAME = DownloadURLConnection.class.getName();
+ private final Logger LOGGER = Logger.getLogger(CLASSNAME);
+ private static final File TMPDIR =
+ new File(System.getProperty("java.io.tmpdir", "/tmp"));
+ private File downloadFile = null;
+
+ /**
+ * @param u URL this connection refers to; passed to URLConnection
+ */
+ protected DownloadURLConnection(URL u) {
+ super(u);
+ }
+
+ /**
+ * Looks up the download script in the system property named
+ * "&lt;runtime class name&gt;.path".
+ *
+ * @return the property value, or the literal "UNDEFINED" when the
+ * property is not set
+ */
+ protected String getScript() {
+ return System.getProperty(this.getClass().getName() + ".path",
+ "UNDEFINED");
+ }
+
+ /**
+ * Builds the command line to run: script, the URL's path component,
+ * and the absolute path of the local download file, in that order.
+ *
+ * @param thisUrl URL to fetch; only its path part is passed on
+ * @param downloadFile local file the script should write to
+ * @return the command as an argument array
+ */
+ protected String [] getCommand(final URL thisUrl,
+ final File downloadFile) {
+ return new String[] {getScript(), thisUrl.getPath(),
+ downloadFile.getAbsolutePath()};
+ }
+
+ /**
+ * Do script copy to local file.
+ * File is available via {@link #getFile()}.
+ *
+ * <p>Does nothing when already connected. A non-zero exit status from
+ * the script is logged but not treated as a failure. If running the
+ * script throws, the temporary file is deleted again.
+ *
+ * @throws IOException if the temporary file cannot be created or the
+ * script cannot be run
+ */
+ public void connect() throws IOException {
+     if (this.connected) {
+         return;
+     }
+
+     this.downloadFile = File.createTempFile(CLASSNAME, null, TMPDIR);
+     boolean succeeded = false;
+     try {
+         String [] cmd = getCommand(this.url, this.downloadFile);
+         if (LOGGER.isLoggable(Level.FINE)) {
+             StringBuilder buffer = new StringBuilder();
+             for (int i = 0; i < cmd.length; i++) {
+                 if (i > 0) {
+                     buffer.append(" ");
+                 }
+                 buffer.append(cmd[i]);
+             }
+             LOGGER.fine("Command: " + buffer.toString());
+         }
+         ProcessResult pr = ProcessUtils.exec(cmd);
+         if (pr.getResult() != 0) {
+             // Message used to say "non-null"; the value is an int status.
+             LOGGER.info(Arrays.toString(cmd) + " returned non-zero " + pr.getResult());
+         }
+         // Assume download went smoothly.
+         this.connected = true;
+         succeeded = true;
+     } finally {
+         if (!succeeded) {
+             // Clean up my tmp file on any failure, not only IOException;
+             // the exception itself keeps propagating.
+             this.downloadFile.delete();
+             this.downloadFile = null;
+         }
+     }
+ }
+
+ /**
+ * @return the local download file; null before connect() has created
+ * it and after it has been cleaned up
+ */
+ public File getFile() {
+ return this.downloadFile;
+ }
+
+ /**
+ * Replaces the reference to the local download file.
+ *
+ * @param f new file reference; null clears it
+ */
+ protected void setFile(final File f) {
+ this.downloadFile = f;
+ }
+
+ /**
+ * Opens a buffered stream on the downloaded local file, connecting
+ * first if necessary. Closing the returned stream deletes the local
+ * file (if it still exists) and clears the file reference.
+ *
+ * @return stream over the local copy
+ * @throws IOException if the download fails, the file cannot be opened,
+ * or the local file has already been cleaned up by an earlier stream's
+ * close()
+ */
+ public InputStream getInputStream() throws IOException {
+     if (!this.connected) {
+         connect();
+     }
+
+     // After a previous stream was closed the file reference is null while
+     // 'connected' stays true; fail with a clear IOException instead of the
+     // NullPointerException that new FileInputStream(null) would raise.
+     final File source = this.downloadFile;
+     if (source == null) {
+         throw new IOException("No local download file for " + this.url +
+             "; it was already removed");
+     }
+
+     // Return BufferedInputStream so 'delegation' is done for me, so
+     // I don't have to implement all IS methods and pass to my
+     // 'delegate' instance.
+     final DownloadURLConnection connection = this;
+     return new BufferedInputStream(new FileInputStream(source)) {
+         private DownloadURLConnection ruc = connection;
+
+         public void close() throws IOException {
+             super.close();
+             if (this.ruc != null && this.ruc.getFile()!= null &&
+                     this.ruc.getFile().exists()) {
+                 this.ruc.getFile().delete();
+                 this.ruc.setFile(null);
+             }
+         }
+     };
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/net/FTPException.java b/src/main/java/org/archive/net/FTPException.java
new file mode 100644
index 00000000..2d104390
--- /dev/null
+++ b/src/main/java/org/archive/net/FTPException.java
@@ -0,0 +1,56 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.net;
+
+import java.io.IOException;
+
+/**
+ * Indicates that a FTP operation failed due to a protocol violation.
+ * For instance, if authentication fails.
+ *
+ * @author pjack
+ */
+public class FTPException extends IOException {
+     private static final long serialVersionUID = 1L;
+
+     /**
+      * The reply code from the FTP server. Set once in the constructor.
+      */
+     private final int code;
+
+     /**
+      * Constructs a new FTPException whose message is
+      * "FTP error code: " followed by the code.
+      *
+      * @param code the error code from the FTP server
+      */
+     public FTPException(int code) {
+         super("FTP error code: " + code);
+         this.code = code;
+     }
+
+
+     /**
+      * Returns the error code from the FTP server.
+      *
+      * @return the error code from the FTP server
+      */
+     public int getReplyCode() {
+         return code;
+     }
+}
diff --git a/src/main/java/org/archive/url/PublicSuffixes.java b/src/main/java/org/archive/net/PublicSuffixes.java
similarity index 99%
rename from src/main/java/org/archive/url/PublicSuffixes.java
rename to src/main/java/org/archive/net/PublicSuffixes.java
index 7c3df6b8..eab8081a 100644
--- a/src/main/java/org/archive/url/PublicSuffixes.java
+++ b/src/main/java/org/archive/net/PublicSuffixes.java
@@ -17,7 +17,7 @@
* limitations under the License.
*/
-package org.archive.url;
+package org.archive.net;
import java.io.BufferedReader;
import java.io.BufferedWriter;
diff --git a/src/main/java/org/archive/net/md5/Handler.java b/src/main/java/org/archive/net/md5/Handler.java
new file mode 100644
index 00000000..8afcdebb
--- /dev/null
+++ b/src/main/java/org/archive/net/md5/Handler.java
@@ -0,0 +1,87 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.net.md5;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLStreamHandler;
+
+/**
+ * A protocol handler for an 'md5' URI scheme.
+ * Md5 URLs look like this: md5:deadbeefdeadbeefdeadbeefdeadbeef
+ * When this handler is invoked against an md5 URL, it passes the raw md5 to
+ * the configured script as an argument. The configured script then does the
+ * work to bring the item pointed to by the md5 local so we can open a Stream
+ * on the local copy. Local file is deleted when we finish. Do
+ * {@link org.archive.net.DownloadURLConnection#getFile()} to get name of
+ * temporary file.
+ *
+ * <p>You need to define the system property
+ * -Djava.protocol.handler.pkgs=org.archive.net to add this handler
+ * to the java.net.URL set. Also define system properties
+ * -Dorg.archive.net.md5.Md5URLConnection.path=PATH_TO_SCRIPT to
+ * pass path of script to run as well as
+ * -Dorg.archive.net.md5.Md5URLConnection.options=OPTIONS for
+ * any options you'd like to include. The pointed-to PATH_TO_SCRIPT
+ * will be invoked as follows: PATH_TO_SCRIPT OPTIONS MD5
+ * LOCAL_TMP_FILE. The LOCAL_TMP_FILE file is made in
+ * java.io.tmpdir using java tmp name code.
+ * @author stack
+ */
+public class Handler extends URLStreamHandler {
+ /**
+ * @param u md5 URL to open
+ * @return a new, not yet connected Md5URLConnection for the URL
+ */
+ protected URLConnection openConnection(URL u) {
+ return new Md5URLConnection(u);
+ }
+
+ /**
+ * Main dumps the file referenced by an md5 URL to STDOUT.
+ *
+ * <p>Sets the script path property to "/tmp/manifest" and registers the
+ * org.archive.net handler package before opening the URL.
+ *
+ * @param args exactly one argument: the md5 URL
+ * @throws IOException if the download or the copy to STDOUT fails
+ */
+ public static void main(String[] args)
+         throws IOException {
+     if (args.length != 1) {
+         System.out.println("Usage: java " +
+             "-Djava.protocol.handler.pkgs=org.archive.net " +
+             "org.archive.net.md5.Handler " +
+             "md5:deadbeefdeadbeefdeadbeefdeadbeef");
+         System.exit(1);
+     }
+     System.setProperty("org.archive.net.md5.Md5URLConnection.path",
+         "/tmp/manifest");
+     System.setProperty("java.protocol.handler.pkgs", "org.archive.net");
+     URL u = new URL(args[0]);
+     URLConnection connect = u.openConnection();
+     // Write download to stdout.
+     final int bufferlength = 4096;
+     byte [] buffer = new byte [bufferlength];
+     InputStream is = connect.getInputStream();
+     try {
+         // Read only in the loop condition: the old for-initializer did an
+         // extra read whose bytes were overwritten before being written,
+         // silently dropping the start of the file.
+         int count;
+         while ((count = is.read(buffer, 0, bufferlength)) != -1) {
+             System.out.write(buffer, 0, count);
+         }
+         System.out.flush();
+     } finally {
+         is.close();
+     }
+ }
+}
diff --git a/src/main/java/org/archive/net/md5/Md5URLConnection.java b/src/main/java/org/archive/net/md5/Md5URLConnection.java
new file mode 100644
index 00000000..e4fe98e3
--- /dev/null
+++ b/src/main/java/org/archive/net/md5/Md5URLConnection.java
@@ -0,0 +1,34 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.net.md5;
+
+import java.net.URL;
+
+import org.archive.net.DownloadURLConnection;
+
+/**
+ * Md5 URL connection.
+ * @author stack
+ * @version $Date$, $Revision$
+ */
+public class Md5URLConnection extends DownloadURLConnection {
+ /**
+ * @param u md5 URL this connection refers to; passed to the superclass
+ */
+ protected Md5URLConnection(URL u) {
+ super(u);
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/net/rsync/Handler.java b/src/main/java/org/archive/net/rsync/Handler.java
new file mode 100644
index 00000000..9eb35f5d
--- /dev/null
+++ b/src/main/java/org/archive/net/rsync/Handler.java
@@ -0,0 +1,71 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.net.rsync;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLStreamHandler;
+
+/**
+ * A protocol handler that uses native rsync client to do copy.
+ * You need to define the system property
+ * -Djava.protocol.handler.pkgs=org.archive.net to add this handler
+ * to the java.net.URL set. Assumes rsync is in path. Define
+ * system property
+ * -Dorg.archive.net.rsync.RsyncURLConnection.path=PATH_TO_RSYNC to
+ * pass path to rsync. Downloads to java.io.tmpdir.
+ * @author stack
+ */
+public class Handler extends URLStreamHandler {
+    /**
+     * Hands back a new {@link RsyncURLConnection} for the passed URL.
+     *
+     * @param u URL to open.
+     * @return A new, unconnected RsyncURLConnection.
+     */
+    @Override
+    protected URLConnection openConnection(URL u) {
+        return new RsyncURLConnection(u);
+    }
+
+    /**
+     * Main dumps rsync file to STDOUT.
+     *
+     * Prints usage and exits with status 1 unless exactly one argument
+     * (the URL) is supplied.
+     *
+     * @param args Single element: the URL to fetch.
+     * @throws IOException If the URL is malformed or the transfer fails.
+     */
+    public static void main(String[] args)
+    throws IOException {
+        if (args.length != 1) {
+            // Usage used to read "java java ..."; the duplicated word made
+            // the suggested command line wrong.
+            System.out.println("Usage: java " +
+                "-Djava.protocol.handler.pkgs=org.archive.net " +
+                "org.archive.net.rsync.Handler RSYNC_URL");
+            System.exit(1);
+        }
+        URL u = new URL(args[0]);
+        URLConnection connect = u.openConnection();
+        // Write download to stdout.
+        final int bufferlength = 4096;
+        byte [] buffer = new byte [bufferlength];
+        InputStream is = connect.getInputStream();
+        try {
+            // Read only in the loop condition. A priming read in a for-loop
+            // initializer would be overwritten by the condition's read before
+            // ever being written, silently dropping the first buffer-full.
+            int count;
+            while ((count = is.read(buffer, 0, bufferlength)) != -1) {
+                System.out.write(buffer, 0, count);
+            }
+            System.out.flush();
+        } finally {
+            is.close();
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/net/rsync/RsyncURLConnection.java b/src/main/java/org/archive/net/rsync/RsyncURLConnection.java
new file mode 100644
index 00000000..c6097e96
--- /dev/null
+++ b/src/main/java/org/archive/net/rsync/RsyncURLConnection.java
@@ -0,0 +1,51 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.net.rsync;
+
+import java.io.File;
+import java.net.URL;
+
+import org.archive.net.DownloadURLConnection;
+
+/**
+ * Rsync URL connection.
+ * @author stack
+ * @version $Date$, $Revision$
+ */
+public class RsyncURLConnection extends DownloadURLConnection {
+    /**
+     * Value appended to the <code>--timeout=</code> argument. Taken from the
+     * system property named after this class plus ".timeout"; "300" when the
+     * property is not set. Read once per instance.
+     */
+    private final String RSYNC_TIMEOUT =
+        System.getProperty(RsyncURLConnection.class.getName() + ".timeout",
+            "300");
+
+    /**
+     * @param u URL handed to the superclass constructor.
+     */
+    protected RsyncURLConnection(URL u) {
+        super(u);
+    }
+
+    /**
+     * @return Executable to run: value of the system property named after
+     * the runtime class plus ".path", or "rsync" when that is not set.
+     */
+    protected String getScript() {
+        final String propertyName = this.getClass().getName() + ".path";
+        return System.getProperty(propertyName, "rsync");
+    }
+
+    /**
+     * Assemble the command line: executable, timeout option, the path part
+     * of this connection's URL, and the absolute path of the download file.
+     *
+     * @param thisUrl Not consulted; the connection's own url field is used.
+     * @param downloadFile File the download is written to.
+     * @return Command and arguments, in execution order.
+     */
+    @Override
+    protected String[] getCommand(final URL thisUrl,
+            final File downloadFile) {
+        final String executable = getScript();
+        final String timeoutOption = "--timeout=" + RSYNC_TIMEOUT;
+        final String source = this.url.getPath();
+        final String target = downloadFile.getAbsolutePath();
+        return new String[] {executable, timeoutOption, source, target};
+    }
+}
diff --git a/src/main/java/org/archive/uid/RecordIDGenerator.java b/src/main/java/org/archive/uid/RecordIDGenerator.java
new file mode 100644
index 00000000..97f1a022
--- /dev/null
+++ b/src/main/java/org/archive/uid/RecordIDGenerator.java
@@ -0,0 +1,72 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.uid;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.Map;
+
+/**
+ * A record-id generator.
+ *
+ * @contributor stack
+ * @contributor gojomo
+ * @version $Revision$ $Date$
+ */
+public interface RecordIDGenerator {
+    /**
+     * Make a fresh identifier.
+     *
+     * @return A URI that can serve as a record-id.
+     */
+    URI getRecordID();
+
+    /**
+     * Make an identifier that carries the passed qualifiers.
+     *
+     * @param qualifiers Qualifiers to add.
+     * @return A URI qualified with passed qualifiers that can serve as a
+     * record-id, or a new, unique record-id without qualifiers (if
+     * qualifiers are not easily implemented using the URI scheme in use).
+     */
+    URI getQualifiedRecordID(Map qualifiers);
+
+    /**
+     * Single-qualifier variant of {@link #getQualifiedRecordID(Map)}.
+     *
+     * @param key Name of qualifier.
+     * @param value Value of qualifier.
+     * @return A URI qualified with the passed qualifier that can serve as a
+     * record-id, or a new, unique record-id without qualifiers (if
+     * qualifiers are not easily implemented using the URI scheme in use).
+     */
+    URI getQualifiedRecordID(String key, String value);
+
+    /**
+     * Append (or, if already present, update) qualifiers on the passed
+     * <code>recordId</code>. Use with caution: guard against turning up a
+     * result that already exists. Intended for writing a group of records
+     * inside a single transaction.
+     *
+     * How qualifiers are appended/updated varies with URI scheme. It is
+     * allowed that an invocation of this method does nothing but call
+     * {@link #getRecordID()}, returning a new URI unrelated to the passed
+     * <code>recordId</code> and qualifiers.
+     *
+     * @param recordId URI to append qualifiers to.
+     * @param qualifiers Map of qualifier values keyed by qualifier name.
+     * @return New URI based off the passed URI and qualifiers.
+     */
+    URI qualifyRecordID(URI recordId, Map qualifiers);
+}
diff --git a/src/main/java/org/archive/uid/UUIDGenerator.java b/src/main/java/org/archive/uid/UUIDGenerator.java
new file mode 100644
index 00000000..26d29e60
--- /dev/null
+++ b/src/main/java/org/archive/uid/UUIDGenerator.java
@@ -0,0 +1,72 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.uid;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.Map;
+import java.util.UUID;
+
+/**
+ * Generates UUIDs, using
+ * {@link java.util.UUID java.util.UUID}, formatted as URNs from the UUID
+ * namespace [See RFC4122].
+ * Here is an example of the type of ID it makes:
+ * urn:uuid:0161811f-5da6-4c6e-9808-a2fab97114cf. Always makes a
+ * new identifier even when passed qualifiers.
+ *
+ * @author stack
+ * @version $Revision$ $Date$
+ * @see RFC4122
+ */
+public class UUIDGenerator implements RecordIDGenerator {
+    private static final String SCHEME = "urn:uuid";
+    private static final String SCHEME_COLON = SCHEME + ":";
+
+    public UUIDGenerator() {
+        super();
+    }
+
+    /**
+     * @return A new URI made of the "urn:uuid:" prefix followed by a
+     * random UUID.
+     */
+    public URI getRecordID() {
+        final String candidate = SCHEME_COLON + UUID.randomUUID().toString();
+        try {
+            return new URI(candidate);
+        } catch (URISyntaxException e) {
+            // should be impossible
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Qualifiers are ignored; a brand new id is returned.
+     */
+    public URI getQualifiedRecordID(Map qualifiers){
+        return getRecordID();
+    }
+
+    /**
+     * Key and value are ignored; a brand new id is returned.
+     */
+    public URI getQualifiedRecordID(
+            final String key, final String value){
+        return getRecordID();
+    }
+
+    /**
+     * Neither the passed id nor the qualifiers are consulted; a brand new
+     * id is returned.
+     */
+    public URI qualifyRecordID(URI recordId,
+            final Map qualifiers) {
+        return getRecordID();
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/uid/package.html b/src/main/java/org/archive/uid/package.html
new file mode 100644
index 00000000..dc49f07b
--- /dev/null
+++ b/src/main/java/org/archive/uid/package.html
@@ -0,0 +1,28 @@
+
+
+
+org.archive.uid package
+
+
+A unique ID generator.
+Default is {@link org.archive.uid.UUIDGenerator}.
+To use another ID Generator, set the System Property
+org.archive.uid.GeneratorFactory.generator to point
+at an alternate implementation of {@link org.archive.uid.RecordIDGenerator}.
+
+
TODO
+
+
MIME boundaries have upper-bound of 70 characters total including
+ 'blank line' (CRLFCRLF) and two leading hyphens. Add to
+ {@link org.archive.uid.RecordIDGenerator}
+ interface an upper-bound on generated ID length.
+
Add example of an actionable uid generator:
+e.g. http://archive.org/UID-SCHEME/ID
+where scheme might be UUID and an ID might be
+f9472055-fbb6-4810-90e8-68fd39e145a6;type=metadata or,
+using ARK:
+http://archive.org/ark:/13030/f9472055-fbb6-4810-90e8-68fd39e145a6;type=metadata.
+
+
+
+
diff --git a/src/main/java/org/archive/url/ExtractRule.java b/src/main/java/org/archive/url/ExtractRule.java
new file mode 100644
index 00000000..bcfb3b2f
--- /dev/null
+++ b/src/main/java/org/archive/url/ExtractRule.java
@@ -0,0 +1,45 @@
+package org.archive.url;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Rule that searches a URL for a regular expression, optionally only when
+ * the URL begins with a given prefix.
+ */
+public class ExtractRule {
+    /** Optional prefix; when null or empty no prefix test is made. */
+    protected String startsWith;
+    /** Source text of the expression last passed to setRegex. */
+    protected String regex;
+
+    /** Compiled form of regex; null until setRegex is called. */
+    protected Pattern regexPattern;
+
+    public String getStartsWith() {
+        return startsWith;
+    }
+
+    public void setStartsWith(String startsWith) {
+        this.startsWith = startsWith;
+    }
+
+    public String getRegex() {
+        return regex;
+    }
+
+    /**
+     * Compiles and stores the expression. Compilation happens first, so a
+     * bad expression leaves both fields as they were.
+     */
+    public void setRegex(String regex) {
+        regexPattern = Pattern.compile(regex);
+        this.regex = regex;
+    }
+
+    /**
+     * @param url Text to search.
+     * @return Matcher positioned on the first occurrence of the pattern, or
+     * null when the prefix test fails, no pattern is set, or nothing is
+     * found.
+     */
+    public Matcher extract(String url) {
+        final boolean prefixRequired =
+            (startsWith != null) && !startsWith.isEmpty();
+        if (prefixRequired && !url.startsWith(startsWith)) {
+            return null;
+        }
+        if (regexPattern == null) {
+            return null;
+        }
+        final Matcher candidate = regexPattern.matcher(url);
+        return candidate.find() ? candidate : null;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/url/RewriteRule.java b/src/main/java/org/archive/url/RewriteRule.java
new file mode 100644
index 00000000..47292686
--- /dev/null
+++ b/src/main/java/org/archive/url/RewriteRule.java
@@ -0,0 +1,55 @@
+package org.archive.url;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Rule that rewrites a URL key in place when the whole key matches a
+ * regular expression, optionally only for keys with a given prefix.
+ */
+public class RewriteRule {
+    /** Optional prefix; when null no prefix test is made. */
+    protected String startsWith;
+    /** Source text of the expression last passed to setRegex. */
+    protected String regex;
+    /** Replacement text handed to Matcher.replaceAll. */
+    protected String replace;
+
+    /** Compiled form of regex; null until setRegex is called. */
+    protected Pattern regexPattern;
+
+    public String getStartsWith() {
+        return startsWith;
+    }
+
+    public void setStartsWith(String startsWith) {
+        this.startsWith = startsWith;
+    }
+
+    public String getRegex() {
+        return regex;
+    }
+
+    /**
+     * Compiles and stores the expression. Compilation happens first, so a
+     * bad expression leaves both fields as they were.
+     */
+    public void setRegex(String regex) {
+        regexPattern = Pattern.compile(regex);
+        this.regex = regex;
+    }
+
+    public String getReplace() {
+        return replace;
+    }
+
+    public void setReplace(String replace) {
+        this.replace = replace;
+    }
+
+    /**
+     * Apply this rule to the key held in the passed builder.
+     *
+     * @param sb Holds the key; its content is replaced when the rule fires
+     * and left alone otherwise.
+     * @return True if the key was rewritten; false when the prefix test
+     * fails, the pattern or replacement is unset, or the pattern does not
+     * match the entire key.
+     */
+    public boolean rewrite(StringBuilder sb) {
+        final String urlkey = sb.toString();
+
+        final boolean prefixOk =
+            (startsWith == null) || urlkey.startsWith(startsWith);
+        if (!prefixOk) {
+            return false;
+        }
+        if (regexPattern == null || replace == null) {
+            return false;
+        }
+
+        final Matcher whole = regexPattern.matcher(urlkey);
+        if (!whole.matches()) {
+            return false;
+        }
+        // Compute the new text before touching the builder.
+        final String rewritten = whole.replaceAll(replace);
+        sb.setLength(0);
+        sb.append(rewritten);
+        return true;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/url/UrlSurtRangeComputer.java b/src/main/java/org/archive/url/UrlSurtRangeComputer.java
index 74057117..2b960e16 100644
--- a/src/main/java/org/archive/url/UrlSurtRangeComputer.java
+++ b/src/main/java/org/archive/url/UrlSurtRangeComputer.java
@@ -112,7 +112,7 @@ public String[] determineRange(String url, MatchType match, String from, String
return new String[]{startKey, endKey, host};
}
- protected String incLastChar(String input)
+ public static String incLastChar(String input)
{
StringBuilder sb = new StringBuilder(input);
sb.setCharAt(sb.length() - 1, (char)(sb.charAt(sb.length() - 1) + 1));
diff --git a/src/main/java/org/archive/url/WaybackURLKeyMaker.java b/src/main/java/org/archive/url/WaybackURLKeyMaker.java
index 23c67d06..99fb92e9 100644
--- a/src/main/java/org/archive/url/WaybackURLKeyMaker.java
+++ b/src/main/java/org/archive/url/WaybackURLKeyMaker.java
@@ -2,8 +2,6 @@
import java.net.URISyntaxException;
import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
public class WaybackURLKeyMaker implements URLKeyMaker {
// URLCanonicalizer canonicalizer = new NonMassagingIAURLCanonicalizer();
@@ -21,34 +19,6 @@ public void setCanonicalizer(URLCanonicalizer canonicalizer) {
protected List customRules;
- public static class RewriteRule
- {
- String startsWith;
- String regex;
- String replace;
- Pattern regexPattern;
-
- public String getStartsWith() {
- return startsWith;
- }
- public void setStartsWith(String startsWith) {
- this.startsWith = startsWith;
- }
- public String getRegex() {
- return regex;
- }
- public void setRegex(String regex) {
- regexPattern = Pattern.compile(regex);
- this.regex = regex;
- }
- public String getReplace() {
- return replace;
- }
- public void setReplace(String replace) {
- this.replace = replace;
- }
- }
-
public WaybackURLKeyMaker()
{
@@ -117,22 +87,12 @@ public void setCustomRules(List customRules) {
protected String applyCustomRules(String urlkey)
{
+ StringBuilder sb = new StringBuilder(urlkey);
+
for (RewriteRule rule : customRules) {
- if ((rule.startsWith != null) && !urlkey.startsWith(rule.startsWith)) {
- continue;
- }
-
- if (rule.regexPattern == null || rule.replace == null) {
- continue;
- }
-
- Matcher match = rule.regexPattern.matcher(urlkey);
-
- if (match.matches()) {
- urlkey = match.replaceAll(rule.replace);
- }
+ rule.rewrite(sb);
}
- return urlkey;
+ return sb.toString();
}
}
diff --git a/src/main/java/org/archive/util/DevUtils.java b/src/main/java/org/archive/util/DevUtils.java
new file mode 100644
index 00000000..d630a0b1
--- /dev/null
+++ b/src/main/java/org/archive/util/DevUtils.java
@@ -0,0 +1,116 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.util;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.logging.Logger;
+
+
+/**
+ * Write a message and stack trace to the 'org.archive.util.DevUtils' logger.
+ *
+ * @author gojomo
+ * @version $Revision$ $Date$
+ */
+public class DevUtils {
+    public static Logger logger =
+        Logger.getLogger(DevUtils.class.getName());
+
+    /**
+     * Log a warning message to the logger 'org.archive.util.DevUtils' made of
+     * the passed 'note' and a stack trace based off passed exception.
+     *
+     * @param ex Exception we print a stacktrace on.
+     * @param note Message to print ahead of the stacktrace.
+     */
+    public static void warnHandle(Throwable ex, String note) {
+        logger.warning(TextUtils.exceptionToString(note, ex));
+    }
+
+    /**
+     * @return Extra information gotten from current Thread. May not
+     * always be available in which case we return empty string.
+     */
+    public static String extraInfo() {
+        StringWriter sw = new StringWriter();
+        PrintWriter pw = new PrintWriter(sw);
+        final Thread current = Thread.currentThread();
+        // Only threads that implement the reporting interfaces contribute.
+        if (current instanceof Reporter) {
+            Reporter tt = (Reporter)current;
+            try {
+                tt.reportTo(pw);
+            } catch (IOException e) {
+                // Not really possible w/ a StringWriter
+                e.printStackTrace();
+            }
+        }
+        if (current instanceof ProgressStatisticsReporter) {
+            ProgressStatisticsReporter tt = (ProgressStatisticsReporter)current;
+            try {
+                tt.progressStatisticsLegend(pw);
+                tt.progressStatisticsLine(pw);
+            } catch (IOException e) {
+                // Not really possible w/ a StringWriter
+                e.printStackTrace();
+            }
+        }
+        pw.flush();
+        return sw.toString();
+    }
+
+    /**
+     * Nothing to see here, move along.
+     * @deprecated This method was never used.
+     */
+    @Deprecated
+    public static void betterPrintStack(RuntimeException re) {
+        re.printStackTrace(System.err);
+    }
+
+    /**
+     * Send this JVM process a SIGQUIT; giving a thread dump and possibly
+     * a heap histogram (if using -XX:+PrintClassHistogram).
+     *
+     * Used to automatically dump info, for example when a serious error
+     * is encountered. Would use 'jmap'/'jstack', but have seen JVM
+     * lockups -- perhaps due to lost thread wake signals -- when using
+     * those against Sun 1.5.0+03 64bit JVM.
+     *
+     * Relies on 'perl' and 'sh' being runnable: a perl child prints its
+     * parent's pid (this JVM), which is then handed to 'kill -3'. Failures
+     * are reported but never thrown.
+     */
+    public static void sigquitSelf() {
+        try {
+            Process p = Runtime.getRuntime().exec(
+                new String[] {"perl", "-e", "print getppid(). \"\n\";"});
+            String ppid;
+            BufferedReader br =
+                new BufferedReader(new InputStreamReader(p.getInputStream()));
+            try {
+                ppid = br.readLine();
+            } finally {
+                // Release the pipe to the child whether or not a line came.
+                br.close();
+            }
+            if (ppid == null || ppid.trim().length() == 0) {
+                // Without a pid the command would be "kill -3 null".
+                logger.warning("Unable to determine own pid; SIGQUIT not sent.");
+                return;
+            }
+            Runtime.getRuntime().exec(
+                new String[] {"sh", "-c", "kill -3 "+ppid}).waitFor();
+        } catch (IOException e) {
+            // Best effort only: report and carry on.
+            e.printStackTrace();
+        } catch (InterruptedException e) {
+            // Keep the interrupt visible to callers further up the stack.
+            Thread.currentThread().interrupt();
+            e.printStackTrace();
+        }
+    }
+}
diff --git a/src/main/java/org/archive/util/FileUtils.java b/src/main/java/org/archive/util/FileUtils.java
new file mode 100644
index 00000000..3de276a9
--- /dev/null
+++ b/src/main/java/org/archive/util/FileUtils.java
@@ -0,0 +1,712 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.util;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.channels.ClosedByInterruptException;
+import java.nio.channels.FileChannel;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Properties;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.filefilter.IOFileFilter;
+import org.apache.commons.lang.math.LongRange;
+
+
+/** Utility methods for manipulating files and directories.
+ *
+ * @contributor John Erik Halse
+ * @contributor gojomo
+ */
+public class FileUtils {
+ // Logger named after this class; the copy and move-aside helpers log to it.
+ private static final Logger LOGGER =
+ Logger.getLogger(FileUtils.class.getName());
+
+ /**
+ * Constructor made private because all methods of this class are static.
+ */
+ private FileUtils() {
+     // Static-only utility class: nothing to initialize.
+ }
+
+ /**
+ * Copy the src file to the destination. Deletes any preexisting
+ * file at destination.
+ *
+ * @param src
+ * @param dest
+ * @return True if the extent was greater than actual bytes copied.
+ * @throws FileNotFoundException
+ * @throws IOException
+ */
+ public static boolean copyFile(final File src, final File dest)
+ throws FileNotFoundException, IOException {
+     // A negative extent makes the four-argument overload use the
+     // source file's size.
+     final long wholeFile = -1;
+     final boolean replaceExisting = true;
+     return copyFile(src, dest, wholeFile, replaceExisting);
+ }
+
+ /**
+ * Copy up to extent bytes of the source file to the destination.
+ * Deletes any preexisting file at destination.
+ *
+ * @param src
+ * @param dest
+ * @param extent Maximum number of bytes to copy
+ * @return True if the extent was greater than actual bytes copied.
+ * @throws FileNotFoundException
+ * @throws IOException
+ */
+ public static boolean copyFile(final File src, final File dest,
+     long extent)
+ throws FileNotFoundException, IOException {
+     // Always replace whatever is already at the destination.
+     final boolean replaceExisting = true;
+     return copyFile(src, dest, extent, replaceExisting);
+ }
+
+ /**
+  * Copy up to extent bytes of the source file to the destination.
+  *
+  * @param src File to copy from.
+  * @param dest File to copy to.
+  * @param extent Maximum number of bytes to copy; a negative value means
+  * the current size of the source file.
+  * @param overwrite If target file already exists, and this parameter is
+  * true, overwrite target file (We do this by first deleting the target
+  * file before we begin the copy).
+  * @return True when the channel transfer finished without an
+  * IOException. False when the destination already existed and
+  * <code>overwrite</code> was false, and also when the copy had to fall
+  * back on {@link #workaroundCopyFile(File, File)}.
+  * NOTE(review): earlier documentation said "true if the extent was
+  * greater than actual bytes copied", which the code never implemented;
+  * the return values callers have actually been seeing are kept.
+  * @throws FileNotFoundException
+  * @throws IOException
+  */
+ public static boolean copyFile(final File src, final File dest,
+     long extent, final boolean overwrite)
+ throws FileNotFoundException, IOException {
+     boolean result = false;
+     if (LOGGER.isLoggable(Level.FINE)) {
+         LOGGER.fine("Copying file " + src + " to " + dest + " extent " +
+             extent + " exists " + dest.exists());
+     }
+     if (dest.exists()) {
+         if (overwrite) {
+             dest.delete();
+             LOGGER.finer(dest.getAbsolutePath() + " removed before copy.");
+         } else {
+             // Already in place and we're not to overwrite. Return.
+             return result;
+         }
+     }
+     FileInputStream fis = null;
+     FileOutputStream fos = null;
+     FileChannel fcin = null;
+     FileChannel fcout = null;
+     try {
+         // Get channels
+         fis = new FileInputStream(src);
+         fos = new FileOutputStream(dest);
+         fcin = fis.getChannel();
+         fcout = fos.getChannel();
+         if (extent < 0) {
+             extent = fcin.size();
+         }
+
+         // Do the file copy. transferTo is allowed to move fewer bytes
+         // than asked for, so keep going until the extent is reached or
+         // the source has nothing more to give.
+         long copied = 0;
+         while (copied < extent) {
+             long trans = fcin.transferTo(copied, extent - copied, fcout);
+             if (trans <= 0) {
+                 break;
+             }
+             copied += trans;
+         }
+         result = true;
+     } catch (IOException e) {
+         // Add more info to the exception. Preserve old stacktrace.
+         // We get 'Invalid argument' on some file copies. See
+         // http://intellij.net/forums/thread.jsp?forum=13&thread=63027&message=853123
+         // for related issue.
+         String message = "Copying " + src.getAbsolutePath() + " to " +
+             dest.getAbsolutePath() + " with extent " + extent +
+             " got IOE: " + e.getMessage();
+         if ((e instanceof ClosedByInterruptException) ||
+                 ((e.getMessage()!=null)
+                     &&e.getMessage().equals("Invalid argument"))) {
+             LOGGER.severe("Failed copy, trying workaround: " + message);
+             workaroundCopyFile(src, dest);
+         } else {
+             IOException newE = new IOException(message);
+             newE.initCause(e);
+             throw newE;
+         }
+     } finally {
+         // finish up; nest the closes so that one failing close() cannot
+         // prevent the remaining resources from being released.
+         try {
+             if (fcin != null) {
+                 fcin.close();
+             }
+         } finally {
+             try {
+                 if (fcout != null) {
+                     fcout.close();
+                 }
+             } finally {
+                 try {
+                     if (fis != null) {
+                         fis.close();
+                     }
+                 } finally {
+                     if (fos != null) {
+                         fos.close();
+                     }
+                 }
+             }
+         }
+     }
+     return result;
+ }
+
+ protected static void workaroundCopyFile(final File src,
+     final File dest)
+ throws IOException {
+     // Plain stream copy of the whole source file, 4096 bytes at a time.
+     FileInputStream in = null;
+     FileOutputStream out = null;
+     try {
+         in = new FileInputStream(src);
+         out = new FileOutputStream(dest);
+         final byte[] chunk = new byte[4096];
+         for (int n = in.read(chunk); n != -1; n = in.read(chunk)) {
+             out.write(chunk, 0, n);
+         }
+     } finally {
+         // Failures while closing are printed, not thrown.
+         if (in != null) {
+             try {
+                 in.close();
+             } catch (IOException e) {
+                 e.printStackTrace();
+             }
+         }
+         if (out != null) {
+             try {
+                 out.close();
+             } catch (IOException e) {
+                 e.printStackTrace();
+             }
+         }
+     }
+ }
+
+ /**
+ * Get a list of all files in directory that have passed prefix.
+ *
+ * @param dir Dir to look in.
+ * @param prefix Basename of files to look for. Compare is case insensitive.
+ *
+ * @return List of files in dir that start w/ passed basename.
+ */
+ public static File [] getFilesWithPrefix(File dir, final String prefix) {
+     // Both sides are lower-cased for every candidate so the comparison
+     // ignores case.
+     return dir.listFiles(new FileFilter() {
+         public boolean accept(File pathname) {
+             final String candidate = pathname.getName().toLowerCase();
+             return candidate.startsWith(prefix.toLowerCase());
+         }
+     });
+ }
+
+ /** Get a @link java.io.FileFilter that filters files based on a regular
+ * expression.
+ *
+ * @param regex the regular expression the files must match.
+ * @return the newly created filter.
+ */
+ public static IOFileFilter getRegexFileFilter(String regex) {
+     // Local filter type; the expression is compiled once, at construction,
+     // and must match the entire file name (no directory part).
+     class RegexFileFilter implements IOFileFilter {
+         private final Pattern compiled;
+
+         protected RegexFileFilter(String expression) {
+             this.compiled = Pattern.compile(expression);
+         }
+
+         public boolean accept(File pathname) {
+             final String baseName = pathname.getName();
+             return compiled.matcher(baseName).matches();
+         }
+
+         public boolean accept(File dir, String name) {
+             final File resolved = new File(dir, name);
+             return accept(resolved);
+         }
+     }
+
+     final IOFileFilter filter = new RegexFileFilter(regex);
+     return filter;
+ }
+
+ /**
+  * Check that a file exists and can be read.
+  *
+  * @param f File to test.
+  * @return The passed file, when both checks succeed.
+  * @exception FileNotFoundException If file does not exist or is not
+  * readable.
+  */
+ public static File assertReadable(final File f) throws FileNotFoundException {
+     final String problem;
+     if (!f.exists()) {
+         problem = " does not exist.";
+     } else if (!f.canRead()) {
+         problem = " is not readable.";
+     } else {
+         return f;
+     }
+     throw new FileNotFoundException(f.getAbsolutePath() + problem);
+ }
+
+ /**
+ * @param f File to test.
+ * @return True if file is readable, has uncompressed extension,
+ * and magic string at file start.
+ * @exception IOException If file not readable or other problem.
+ */
+ public static boolean isReadableWithExtensionAndMagic(final File f,
+     final String uncompressedExtension, final String magic)
+ throws IOException {
+     FileUtils.assertReadable(f);
+     // The file name is lower-cased before the test; the extension is
+     // compared as passed.
+     if (!f.getName().toLowerCase().endsWith(uncompressedExtension)) {
+         return false;
+     }
+
+     final int wanted = magic.length();
+     final byte [] b = new byte[wanted];
+     int filled = 0;
+     FileInputStream fis = new FileInputStream(f);
+     try {
+         // A single read() may hand back fewer bytes than requested even
+         // when more are available, so keep reading until the buffer is
+         // full or the file ends.
+         while (filled < wanted) {
+             int read = fis.read(b, filled, wanted - filled);
+             if (read < 0) {
+                 break;
+             }
+             filled += read;
+         }
+     } finally {
+         // Sole close; the stream used to be closed twice.
+         fis.close();
+     }
+     if (filled != wanted) {
+         // File is shorter than the magic string.
+         return false;
+     }
+
+     // Widen each byte to a char and compare ignoring case.
+     StringBuilder beginStr = new StringBuilder(wanted);
+     for (int i = 0; i < wanted; i++) {
+         beginStr.append((char)b[i]);
+     }
+     return beginStr.toString().equalsIgnoreCase(magic);
+ }
+
+ /**
+ * Turn path into a File, relative to context (which may be ignored
+ * if path is absolute).
+ *
+ * @param context File context if path is relative
+ * @param path String path to make into a File
+ * @return File created
+ */
+ public static File maybeRelative(File context, String path) {
+     // An absolute path stands on its own; anything else hangs off context.
+     final File asGiven = new File(path);
+     return asGiven.isAbsolute() ? asGiven : new File(context, path);
+ }
+
+ /**
+ * Load Properties instance from a File
+ *
+ * @param file
+ * @return Properties
+ * @throws IOException
+ */
+ public static Properties loadProperties(File file) throws IOException {
+     final FileInputStream in = new FileInputStream(file);
+     final Properties loaded = new Properties();
+     try {
+         loaded.load(in);
+     } finally {
+         // Stream is closed whether or not the load succeeded.
+         ArchiveUtils.closeQuietly(in);
+     }
+     return loaded;
+ }
+
+ /**
+ * Store Properties instance to a File
+ * @param p
+ * @param file destination File
+ * @throws IOException
+ */
+ public static void storeProperties(Properties p, File file) throws IOException {
+     final FileOutputStream out = new FileOutputStream(file);
+     try {
+         // Empty string is passed as the comments argument.
+         final String comments = "";
+         p.store(out, comments);
+     } finally {
+         // Stream is closed whether or not the store succeeded.
+         ArchiveUtils.closeQuietly(out);
+     }
+ }
+
+ /**
+  * If the passed file exists, rename it to its canonical path followed by
+  * "." and the result of ArchiveUtils.get14DigitDate on its last-modified
+  * time. A failed rename is logged as a warning.
+  *
+  * @param file File to move out of the way.
+  * @return True if the file did not exist or was renamed; false if the
+  * rename failed.
+  * @throws IOException If the canonical path cannot be determined.
+  */
+ public static boolean moveAsideIfExists(File file) throws IOException {
+     if (!file.exists()) {
+         // Nothing in the way.
+         return true;
+     }
+     final String canonical = file.getCanonicalPath();
+     final String stamp = ArchiveUtils.get14DigitDate(file.lastModified());
+     final String newName = canonical + "." + stamp;
+     final File target = new File(newName);
+     if (file.renameTo(target)) {
+         return true;
+     }
+     LOGGER.warning("unable to move aside: "+file+" to "+newName);
+     return false;
+ }
+
+ /**
+ * Retrieve a number of lines from the file around the given
+ * position, as when paging forward or backward through a file.
+ *
+ * @param file File to retrieve lines
+ * @param position offset to anchor lines
+ * @param signedDesiredLineCount lines requested; if negative,
+ * want this number of lines ending with a line containing
+ * the position; if positive, want this number of lines,
+ * all starting at or after position.
+ * @param lines List to insert found lines
+ * @param lineEstimate int estimate of line size, 0 means use default
+ * of 128
+ * @return LongRange indicating the file offsets corresponding to
+ * the beginning of the first line returned, and the point
+ * after the end of the last line returned
+ * @throws IOException
+ */
+ @SuppressWarnings("unchecked")
+ public static LongRange pagedLines(File file, long position,
+ int signedDesiredLineCount, List lines, int lineEstimate)
+ throws IOException {
+ // consider negative positions as from end of file; -1 = last byte
+ if (position < 0) {
+ position = file.length() + position;
+ }
+
+ // calculate a reasonably sized chunk likely to have all desired lines
+ // (a lineEstimate of 0 means no estimate was supplied; assume 128 bytes/line)
+ if(lineEstimate == 0) {
+ lineEstimate = 128;
+ }
+ // sign of signedDesiredLineCount selects direction: >0 forward, otherwise backward
+ int desiredLineCount = Math.abs(signedDesiredLineCount);
+ long startPosition;
+ long fileEnd = file.length();
+ int bufferSize = (desiredLineCount + 5) * lineEstimate;
+ if(signedDesiredLineCount>0) {
+ // reading forward; include previous char in case line-end
+ startPosition = position - 1;
+ } else {
+ // reading backward
+ startPosition = position - bufferSize + (2 * lineEstimate);
+ }
+ // clamp the chunk to the bounds of the file
+ if(startPosition<0) {
+ startPosition = 0;
+ }
+ if(startPosition+bufferSize > fileEnd) {
+ bufferSize = (int)(fileEnd - startPosition);
+ }
+
+ // read that reasonable chunk
+ // NOTE(review): fis is only closed on the success path; an exception from
+ // position() or readFully() leaves it open
+ FileInputStream fis = new FileInputStream(file);
+ fis.getChannel().position(startPosition);
+ byte[] buf = new byte[bufferSize];
+ ArchiveUtils.readFully(fis, buf);
+ IOUtils.closeQuietly(fis);
+
+ // find all line starts fully in buffer
+ // (positions after a line-end, per line-end definition in
+ // BufferedReader.readLine)
+ LinkedList lineStarts = new LinkedList();
+ if(startPosition==0) {
+ // buffer begins at start of file, so offset 0 is itself a line start
+ lineStarts.add(0);
+ }
+ boolean atLineEnd = false;
+ boolean eatLF = false;
+ int i;
+ for(i = 0; i < bufferSize; i++) {
+ // an LF seen while eatLF is set is treated as the tail of a CR LF pair
+ // NOTE(review): eatLF is only cleared here, so after a lone CR the next
+ // LF is swallowed even if other bytes came in between -- confirm intent
+ if ((char) buf[i] == '\n' && eatLF) {
+ eatLF = false;
+ continue;
+ }
+ if(atLineEnd) {
+ // first byte after a line-end: record it as a line start
+ atLineEnd = false;
+ lineStarts.add(i);
+ if(signedDesiredLineCount<0 && startPosition+i > position) {
+ // reached next line past position, read no more
+ break;
+ }
+ }
+ if ((char) buf[i] == '\r') {
+ atLineEnd = true;
+ eatLF = true;
+ continue;
+ }
+ if ((char) buf[i] == '\n') {
+ atLineEnd = true;
+ }
+ }
+ if(startPosition+i == fileEnd) {
+ // add phantom lineStart after end
+ lineStarts.add(bufferSize);
+ }
+ // N line starts delimit N-1 complete lines
+ int foundFullLines = lineStarts.size()-1;
+
+ // if found no lines
+ if(foundFullLines<1) {
+ if(signedDesiredLineCount>0) {
+ if(startPosition+bufferSize == fileEnd) {
+ // nothing more to read: return nothing
+ return new LongRange(fileEnd,fileEnd);
+ } else {
+ // retry with larger lineEstimate
+ return pagedLines(file, position, signedDesiredLineCount, lines, Math.max(bufferSize,lineEstimate));
+ }
+
+ } else {
+ // try again with much larger line estimate
+ // TODO: fail gracefully before growing to multi-MB buffers
+ return pagedLines(file, position, signedDesiredLineCount, lines, bufferSize);
+ }
+ }
+
+ // trim unneeded lines
+ // NOTE(review): the loop header below looks truncated (text between a '<'
+ // and a '>' appears to be missing) and does not parse as written;
+ // restore it from the upstream source before relying on this block
+ while(signedDesiredLineCount>0 && startPosition+lineStarts.getFirst()desiredLineCount+1) {
+ if (signedDesiredLineCount < 0 && (startPosition+lineStarts.get(1) <= position) ) {
+ // discard from front until reach line containing target position
+ lineStarts.removeFirst();
+ } else {
+ lineStarts.removeLast();
+ }
+ }
+ // remaining first/last line starts bound the bytes handed back as lines
+ int firstLine = lineStarts.getFirst();
+ int partialLine = lineStarts.getLast();
+ LongRange range = new LongRange(startPosition + firstLine, startPosition + partialLine);
+ List foundLines =
+ IOUtils.readLines(new ByteArrayInputStream(buf,firstLine,partialLine-firstLine));
+
+ // NOTE(review): foundFullLines is at least 1 here (the <1 case returned
+ // above), so "foundFullLines< 0" can never hold as written -- part of
+ // this condition may have been lost; confirm against upstream
+ if(foundFullLines< 0 && startPosition > 0) {
+ // if needed and reading backward, read more lines from earlier
+ range = expandRange(
+ range,
+ pagedLines(file,
+ range.getMinimumLong()-1,
+ signedDesiredLineCount+foundFullLines,
+ lines,
+ bufferSize/foundFullLines));
+
+ }
+
+ lines.addAll(foundLines);
+
+ if(signedDesiredLineCount < 0 && range.getMaximumLong() < position) {
+ // did not get line containing start position
+ // NOTE(review): partialLine is an offset within buf, while the other
+ // recursive calls pass file offsets -- confirm this is intended
+ range = expandRange(
+ range,
+ pagedLines(file,
+ partialLine,
+ 1,
+ lines,
+ bufferSize/foundFullLines));
+ }
+
+ if(signedDesiredLineCount > 0 && foundFullLines < desiredLineCount && range.getMaximumLong() < fileEnd) {
+ // need more forward lines
+ range = expandRange(
+ range,
+ pagedLines(file,
+ range.getMaximumLong(),
+ desiredLineCount - foundFullLines,
+ lines,
+ bufferSize/foundFullLines));
+ }
+
+ return range;
+ }
+
+ /**
+  * Returns the smallest range that covers both of the given ranges.
+  *
+  * @param range1 first range
+  * @param range2 second range
+  * @return a new range running from the lower of the two minimums to the
+  * higher of the two maximums
+  */
+ public static LongRange expandRange(LongRange range1, LongRange range2) {
+     final long lowest =
+         Math.min(range1.getMinimumLong(), range2.getMinimumLong());
+     final long highest =
+         Math.max(range1.getMaximumLong(), range2.getMaximumLong());
+     return new LongRange(lowest, highest);
+ }
+
+ /**
+ * Convenience form of the five-argument pagedLines that passes a
+ * lineEstimate of 0, which that method replaces with its built-in
+ * default of 128.
+ *
+ * @param file file to read lines from
+ * @param position offset to read around; negative values count from the
+ * end of the file
+ * @param signedDesiredLongCount number of lines wanted; passed through as
+ * the signed desired line count (positive reads forward, otherwise backward)
+ * @param lines list that the lines read are added to
+ * @return range of file offsets covered by the lines returned
+ * @throws IOException
+ */
+ public static LongRange pagedLines(File file, long position, int signedDesiredLongCount, List lines) throws IOException {
+ return pagedLines(file, position, signedDesiredLongCount, lines, 0);
+ }
+
+ /**
+ * Delete the file now -- but in the event of failure, keep trying
+ * in the future.
+ *
+ * VERY IMPORTANT: Do not use with any file whose name/path may be
+ * reused, because the lagged delete could then wind up deleting the
+ * newer file. Essentially, only to be used with uniquely-named temp
+ * files.
+ *
+ * Necessary because some platforms (looking at you,
+ * JVM-on-Windows) will have deletes fail because of things like
+ * file-mapped buffers remaining, and there's no explicit way to
+ * unmap a buffer. (See 6-year-old Sun-stumping Java bug
+ * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4724038 )
+ * We just have to wait and retry.
+ *
+ * (Why not just File.deleteOnExit? There could be an arbitrary,
+ * unbounded number of files in such a situation, that are only
+ * deletable a few seconds or minutes after our first attempt.
+ * Waiting for JVM exit could mean disk exhaustion. It's also
+ * unclear if the native FS class implementations of deleteOnExit
+ * use RAM per pending file.)
+ *
+ * @param fileToDelete
+ */
+ public static synchronized void deleteSoonerOrLater(File fileToDelete) {
+     pendingDeletes.add(fileToDelete);
+     // if things are getting out of hand, force gc/finalization
+     // (lingering mapped buffers are what usually block the delete)
+     if (pendingDeletes.size() > 50) {
+         LOGGER.warning(">50 pending Files to delete; forcing gc/finalization");
+         System.gc();
+         System.runFinalization();
+     }
+     // try all pendingDeletes, including the one just added
+     Iterator<File> iter = pendingDeletes.iterator();
+     while (iter.hasNext()) {
+         File pending = iter.next();
+         // delete() also returns false for a file that is already gone;
+         // such a file must not stay queued forever
+         if (pending.delete() || !pending.exists()) {
+             iter.remove();
+         }
+     }
+     // if things are still out of hand, complain loudly
+     if (pendingDeletes.size() > 50) {
+         LOGGER.severe(">50 pending Files to delete even after gc/finalization");
+     }
+ }
+ /**
+  * Files whose deletion has been requested but has not succeeded yet;
+  * retried on every call to deleteSoonerOrLater, which mutates this list
+  * while holding the class lock.
+  */
+ protected static LinkedList<File> pendingDeletes = new LinkedList<File>();
+
+ /**
+  * Read the entire stream to EOF into the passed file.
+  * Closes the stream when done or if an exception occurs, including a
+  * failure to open the target file.
+  *
+  * @param is Stream to read.
+  * @param toFile File to write to.
+  * @return number of bytes copied
+  * @throws IOException if the file cannot be opened or the copy fails
+  */
+ public static long readFullyToFile(InputStream is, File toFile)
+ throws IOException {
+     OutputStream os = null;
+     try {
+         // opened inside the try so that 'is' is still closed when the
+         // target file cannot be opened
+         os = org.apache.commons.io.FileUtils.openOutputStream(toFile);
+         return IOUtils.copyLarge(is, os);
+     } finally {
+         IOUtils.closeQuietly(os);
+         IOUtils.closeQuietly(is);
+     }
+ }
+
+ /**
+  * Ensure the named directory exists and is writeable, attempting to
+  * create it when it does not exist.
+  *
+  * @param dir Path of the directory to test for existence and
+  * writeability.
+  *
+  * @return A File for the passed dir.
+  *
+  * @exception IOException If passed directory does not exist and is not
+  * createable, or directory is not writeable or is not a directory.
+  */
+ public static File ensureWriteableDirectory(String dir)
+ throws IOException {
+     final File asFile = new File(dir);
+     return FileUtils.ensureWriteableDirectory(asFile);
+ }
+
+ /**
+  * Ensure writeable directories.
+  *
+  * Each directory that doesn't exist is created if possible.
+  *
+  * @param dirs List of Files to test.
+  *
+  * @return The passed dirs.
+  *
+  * @exception IOException If a passed directory does not exist and is not
+  * createable, or directory is not writeable or is not a directory.
+  * Directories after the failing one are not examined.
+  */
+ public static List<File> ensureWriteableDirectory(List<File> dirs)
+ throws IOException {
+     // element type must be File so the call binds to the File overload
+     for (File dir : dirs) {
+         FileUtils.ensureWriteableDirectory(dir);
+     }
+     return dirs;
+ }
+
+ /**
+  * Ensure writeable directory.
+  *
+  * If it doesn't exist, we attempt creation (including missing parents).
+  *
+  * @param dir Directory to test for existence and writeability.
+  *
+  * @return The passed dir.
+  *
+  * @exception IOException If passed directory does not exist and is not
+  * createable, or directory is not writeable or is not a directory.
+  */
+ public static File ensureWriteableDirectory(File dir)
+ throws IOException {
+     if (!dir.exists()) {
+         // mkdirs() also reports false when someone else created the
+         // directory between exists() and here; only fail when the
+         // directory really is not there afterwards
+         if (!dir.mkdirs() && !dir.isDirectory()) {
+             throw new IOException("Failed to create directory: " + dir);
+         }
+     } else if (!dir.isDirectory()) {
+         // checked before writeability so a read-only plain file is
+         // reported as what it is
+         throw new IOException("Dir " + dir.getAbsolutePath() +
+             " is not a directory.");
+     } else if (!dir.canWrite()) {
+         throw new IOException("Dir " + dir.getAbsolutePath() +
+             " not writeable.");
+     }
+
+     return dir;
+ }
+
+ /**
+  * Best-effort canonicalization of a file.
+  *
+  * @param file file to canonicalize
+  * @return the canonical form of the file, or the file as passed when
+  * canonicalization fails with an IOException
+  */
+ public static File tryToCanonicalize(File file) {
+     File canonical = file;
+     try {
+         canonical = file.getCanonicalFile();
+     } catch (IOException ignored) {
+         // deliberate: fall back to the file exactly as given
+     }
+     return canonical;
+ }
+
+ /**
+  * Append the full contents of one file to the end of another.
+  *
+  * Both streams are closed whether or not the copy succeeds. The source
+  * is opened first, so a missing or unreadable source fails before the
+  * target is created or opened.
+  *
+  * @param fileToAppendTo file that receives the bytes; created if absent
+  * @param fileToAppendFrom file whose bytes are appended
+  * @throws IOException if either file cannot be opened or the copy fails
+  */
+ public static void appendTo(File fileToAppendTo, File fileToAppendFrom) throws IOException {
+     // optimal io block size according to http://lingrok.org/xref/coreutils/src/ioblksize.h
+     byte[] buf = new byte[65536];
+     FileInputStream in = new FileInputStream(fileToAppendFrom);
+     try {
+         FileOutputStream out = new FileOutputStream(fileToAppendTo, true);
+         try {
+             for (int n = in.read(buf); n > 0; n = in.read(buf)) {
+                 out.write(buf, 0, n);
+             }
+             out.flush();
+         } finally {
+             // not closeQuietly: a failed close of the output must surface
+             out.close();
+         }
+     } finally {
+         in.close();
+     }
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/InetAddressUtil.java b/src/main/java/org/archive/util/InetAddressUtil.java
new file mode 100644
index 00000000..585ba772
--- /dev/null
+++ b/src/main/java/org/archive/util/InetAddressUtil.java
@@ -0,0 +1,116 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.util;
+
+import java.net.InetAddress;
+import java.net.NetworkInterface;
+import java.net.SocketException;
+import java.net.UnknownHostException;
+import java.util.ArrayList;
+import java.util.Enumeration;
+import java.util.List;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * InetAddress utility.
+ * @author stack
+ * @version $Date$, $Revision$
+ */
+public class InetAddressUtil {
+    private static Logger logger =
+        Logger.getLogger(InetAddressUtil.class.getName());
+
+    /**
+     * ipv4 address in dotted-quad form; each quad is a capturing group.
+     * The pattern only limits a quad to one to three digits, so the
+     * numeric range is checked separately by
+     * {@link #getIPHostAddress(String)}.
+     */
+    public static Pattern IPV4_QUADS = Pattern.compile(
+        "([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})");
+
+    private InetAddressUtil () {
+        super();
+    }
+
+    /**
+     * Returns InetAddress for passed host IF its in
+     * IPV4 quads format (e.g. 128.128.128.128).
+     *
+     * <p>No name lookup is performed. A string that has the quads shape
+     * but a quad above 255 (e.g. 999.1.1.1) is not an address and yields
+     * null.
+     *
+     * <p>TODO: Move to an AddressParsingUtil class.
+     * @param host Host name to examine; may be null.
+     * @return InetAddress IF the passed name was an IP address, else null.
+     */
+    public static InetAddress getIPHostAddress(String host) {
+        if (host == null) {
+            return null;
+        }
+        Matcher matcher = IPV4_QUADS.matcher(host);
+        if (!matcher.matches()) {
+            return null;
+        }
+        byte[] quads = new byte[4];
+        for (int i = 0; i < quads.length; i++) {
+            // the pattern guarantees one to three digits, so parseInt
+            // cannot fail or overflow here
+            int quad = Integer.parseInt(matcher.group(i + 1));
+            if (quad > 255) {
+                // casting would silently wrap to some other address
+                return null;
+            }
+            quads[i] = (byte) quad;
+        }
+        try {
+            // Doing an Inet.getByAddress() avoids a lookup.
+            return InetAddress.getByAddress(host, quads);
+        } catch (UnknownHostException e) {
+            logger.warning(e.getMessage());
+            return null;
+        }
+    }
+
+    /**
+     * @return All known local names and addresses for this host. Never
+     * null: "localhost" and "localhost.localdomain" are always included.
+     * @throws RuntimeException wrapping the SocketException if the network
+     * interfaces cannot be enumerated
+     */
+    public static List<String> getAllLocalHostNames() {
+        List<String> localNames = new ArrayList<String>();
+        Enumeration<NetworkInterface> interfaces = null;
+        try {
+            interfaces = NetworkInterface.getNetworkInterfaces();
+        } catch(SocketException exception) {
+            throw new RuntimeException(exception);
+        }
+        // getNetworkInterfaces() may hand back null when it finds none
+        while (interfaces != null && interfaces.hasMoreElements()) {
+            Enumeration<InetAddress> addresses =
+                interfaces.nextElement().getInetAddresses();
+            while (addresses.hasMoreElements()) {
+                InetAddress ia = addresses.nextElement();
+                if (ia == null) {
+                    continue;
+                }
+                if (ia.getHostName() != null) {
+                    localNames.add(ia.getCanonicalHostName());
+                }
+                if (ia.getHostAddress() != null) {
+                    localNames.add(ia.getHostAddress());
+                }
+            }
+        }
+        final String localhost = "localhost";
+        if (!localNames.contains(localhost)) {
+            localNames.add(localhost);
+        }
+        final String localhostLocaldomain = "localhost.localdomain";
+        if (!localNames.contains(localhostLocaldomain)) {
+            localNames.add(localhostLocaldomain);
+        }
+        return localNames;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/IterableLineIterator.java b/src/main/java/org/archive/util/IterableLineIterator.java
new file mode 100644
index 00000000..6e0d9dc8
--- /dev/null
+++ b/src/main/java/org/archive/util/IterableLineIterator.java
@@ -0,0 +1,26 @@
+package org.archive.util;
+
+import java.io.Reader;
+import java.util.Iterator;
+
+import org.apache.commons.io.LineIterator;
+
+/**
+ * A LineIterator that also implements Iterable, so that it can be used with
+ * the java enhanced for-each loop syntax.
+ *
+ * @contributor nlevitt
+ */
+public class IterableLineIterator extends LineIterator
+        implements Iterable<String> {
+
+    /**
+     * @param reader source of the lines to iterate over; handed to the
+     * LineIterator constructor
+     * @throws IllegalArgumentException propagated from the LineIterator
+     * constructor
+     */
+    public IterableLineIterator(final Reader reader)
+            throws IllegalArgumentException {
+        super(reader);
+    }
+
+    /**
+     * Returns this same object, so the lines can only be traversed once:
+     * a second for-each loop continues wherever the first one stopped.
+     *
+     * @return this iterator
+     */
+    @SuppressWarnings("unchecked")
+    public Iterator<String> iterator() {
+        // typed as String so for-each loops get String elements
+        return this;
+    }
+}
diff --git a/src/main/java/org/archive/util/LaxHttpParser.java b/src/main/java/org/archive/util/LaxHttpParser.java
new file mode 100644
index 00000000..c1f768f0
--- /dev/null
+++ b/src/main/java/org/archive/util/LaxHttpParser.java
@@ -0,0 +1,242 @@
+/*
+ * $Header: /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/java/org/apache/commons/httpclient/LaxHttpParser.java,v 1.13 2005/01/11 13:57:06 oglueck Exp $
+ * $Revision$
+ * $Date$
+ *
+ * ====================================================================
+ *
+ * Copyright 1999-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * .
+ *
+ */
+/*
+ *
+ */
+
+package org.archive.util;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpException;
+import org.apache.commons.httpclient.util.EncodingUtil;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * A Modified version of HttpParser which doesn't throw exceptions on bad header lines
+ *
+ * A utility class for parsing http header values according to
+ * RFC-2616 Section 4 and 19.3.
+ *
+ * @author Michael Becke
+ * @author Oleg Kalnichevski
+ *
+ * @since 2.0beta1
+ */
+public class LaxHttpParser {
+
+    /** Log object for this class. */
+    private static final Log LOG = LogFactory.getLog(LaxHttpParser.class);
+
+    /**
+     * Constructor for LaxHttpParser.
+     */
+    protected LaxHttpParser() { }
+
+    /**
+     * Reads raw bytes from an (unchunked) input stream up to and including
+     * the first "\n". If the stream ends before a terminator is seen,
+     * whatever was read is still returned.
+     *
+     * @param inputStream the stream to read from
+     *
+     * @throws IOException if an I/O problem occurs
+     * @return the bytes read, terminator included, or null if no input
+     * data was available
+     */
+    public static byte[] readRawLine(InputStream inputStream) throws IOException {
+        LOG.trace("enter LaxHttpParser.readRawLine()");
+
+        ByteArrayOutputStream line = new ByteArrayOutputStream();
+        for (int b = inputStream.read(); b >= 0; b = inputStream.read()) {
+            line.write(b);
+            if (b == '\n') { // be tolerant (RFC-2616 Section 19.3)
+                break;
+            }
+        }
+        return (line.size() == 0) ? null : line.toByteArray();
+    }
+
+    /**
+     * Reads one line from an (unchunked) input stream and decodes it,
+     * without its trailing "\n" or "\r\n". If the stream ends before a
+     * terminator is seen, whatever was read is still returned.
+     *
+     * @param inputStream the stream to read from
+     * @param charset charset of HTTP protocol elements
+     *
+     * @throws IOException if an I/O problem occurs
+     * @return a line from the stream, or null if no input data was
+     * available
+     *
+     * @since 3.0
+     */
+    public static String readLine(InputStream inputStream, String charset) throws IOException {
+        LOG.trace("enter LaxHttpParser.readLine(InputStream, String)");
+        byte[] rawdata = readRawLine(inputStream);
+        if (rawdata == null) {
+            return null;
+        }
+        // strip CR and LF from the end
+        int end = rawdata.length;
+        if (end > 0 && rawdata[end - 1] == '\n') {
+            end--;
+            if (end > 0 && rawdata[end - 1] == '\r') {
+                end--;
+            }
+        }
+        return EncodingUtil.getString(rawdata, 0, end, charset);
+    }
+
+    /**
+     * Reads one line as US-ASCII; see
+     * {@link #readLine(InputStream, String)}.
+     *
+     * @param inputStream the stream to read from
+     *
+     * @throws IOException if an I/O problem occurs
+     * @return a line from the stream, or null if no input data was
+     * available
+     *
+     * @deprecated use #readLine(InputStream, String)
+     */
+
+    public static String readLine(InputStream inputStream) throws IOException {
+        LOG.trace("enter LaxHttpParser.readLine(InputStream)");
+        return readLine(inputStream, "US-ASCII");
+    }
+
+    /**
+     * Parses headers from the given stream, stopping at the first blank
+     * line or at end of input. Headers with the same name are not
+     * combined. A line without a colon does not abort parsing; it is
+     * recorded under a pseudo-header name with the whole line as value.
+     *
+     * @param is the stream to read headers from
+     * @param charset the charset to use for reading the data
+     *
+     * @return an array of headers in the order in which they were parsed
+     *
+     * @throws IOException if an IO error occurs while reading from the stream
+     * @throws HttpException if there is an error parsing a header value
+     *
+     * @since 3.0
+     */
+    public static Header[] parseHeaders(InputStream is, String charset) throws IOException, HttpException {
+        LOG.trace("enter HeaderParser.parseHeaders(InputStream, String)");
+
+        ArrayList headers = new ArrayList();
+        // the header being assembled; kept pending so that folded
+        // continuation lines can still be appended to its value
+        String name = null;
+        StringBuffer value = null;
+        while (true) {
+            String line = LaxHttpParser.readLine(is, charset);
+            if ((line == null) || (line.trim().length() < 1)) {
+                break;
+            }
+
+            // Check for folded headers first
+            // Detect LWS-char see HTTP/1.0 or HTTP/1.1 Section 2.2
+            // discussion on folded headers
+            char first = line.charAt(0);
+            if ((first == ' ') || (first == '\t')) {
+                // continuation of a folded header: append to pending value
+                // (ignored when no header has been seen yet)
+                if (value != null) {
+                    value.append(' ');
+                    value.append(line.trim());
+                }
+                continue;
+            }
+
+            // a new header starts: save the previous name,value pair
+            if (name != null) {
+                headers.add(new Header(name, value.toString()));
+            }
+
+            // Otherwise we should have normal HTTP header line
+            int colon = line.indexOf(":");
+
+            // START IA/HERITRIX change
+            // Don't throw an exception if can't parse. We want to keep
+            // going even though header is bad. Rather, create
+            // pseudo-header.
+            if (colon < 0) {
+                name = "HttpClient-Bad-Header-Line-Failed-Parse";
+                value = new StringBuffer(line);
+            } else {
+                name = line.substring(0, colon).trim();
+                value = new StringBuffer(line.substring(colon + 1).trim());
+            }
+            // END IA/HERITRIX change
+        }
+
+        // make sure we save the last name,value pair if present
+        if (name != null) {
+            headers.add(new Header(name, value.toString()));
+        }
+
+        return (Header[]) headers.toArray(new Header[headers.size()]);
+    }
+
+    /**
+     * Parses headers as US-ASCII; see
+     * {@link #parseHeaders(InputStream, String)}.
+     *
+     * @param is the stream to read headers from
+     *
+     * @return an array of headers in the order in which they were parsed
+     *
+     * @throws IOException if an IO error occurs while reading from the stream
+     * @throws HttpException if there is an error parsing a header value
+     *
+     * @deprecated use #parseHeaders(InputStream, String)
+     */
+    public static Header[] parseHeaders(InputStream is) throws IOException, HttpException {
+        LOG.trace("enter HeaderParser.parseHeaders(InputStream, String)");
+        return parseHeaders(is, "US-ASCII");
+    }
+}
diff --git a/src/main/java/org/archive/util/MimetypeUtils.java b/src/main/java/org/archive/util/MimetypeUtils.java
new file mode 100644
index 00000000..adfa1a0f
--- /dev/null
+++ b/src/main/java/org/archive/util/MimetypeUtils.java
@@ -0,0 +1,75 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.util;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Class of mimetype utilities.
+ * @author stack
+ */
+public class MimetypeUtils {
+    /**
+     * The 'no-type' content-type.
+     *
+     * Defined in the ARC file spec at
+     * http://www.archive.org/web/researcher/ArcFileFormat.php.
+     */
+    public static final String NO_TYPE_MIMETYPE = "no-type";
+
+    /**
+     * Truncation regex: captures the leading run of characters that are
+     * neither whitespace nor ';' nor ','.
+     */
+    protected final static Pattern TRUNCATION_REGEX = Pattern.compile("^([^\\s;,]+).*");
+
+
+    /**
+     * Truncate passed mimetype.
+     *
+     * Ensure no spaces. Strip encoding. Truncation required by
+     * ARC files.
+     *
+     * <p>Truncate at delimiters [;, ].
+     * Truncate multi-part content type header at ';'.
+     * Apache httpclient collapses values of multiple instances of the
+     * header into one comma-separated value, therefore truncated at ','.
+     * Current ia_tools that work with arc files expect 5-column
+     * space-separated meta-lines, therefore truncate at ' '.
+     *
+     * @param contentType Raw content-type; may be null.
+     *
+     * @return The part of the passed content-type before the first
+     * delimiter, or {@link #NO_TYPE_MIMETYPE} when the value is null or
+     * does not match the truncation regex (e.g. it is empty or starts with
+     * a delimiter).
+     */
+    public static String truncate(String contentType) {
+        if (contentType == null) {
+            return NO_TYPE_MIMETYPE;
+        }
+        Matcher matcher = TRUNCATION_REGEX.matcher(contentType);
+        return matcher.matches() ? matcher.group(1) : NO_TYPE_MIMETYPE;
+    }
+}
diff --git a/src/main/java/org/archive/util/ProcessUtils.java b/src/main/java/org/archive/util/ProcessUtils.java
new file mode 100644
index 00000000..af792981
--- /dev/null
+++ b/src/main/java/org/archive/util/ProcessUtils.java
@@ -0,0 +1,151 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.util;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.Arrays;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+/**
+ * Class to run an external process.
+ * @author stack
+ * @version $Date$ $Revision$
+ */
+public class ProcessUtils {
+    private static final Logger LOGGER =
+        Logger.getLogger(ProcessUtils.class.getName());
+
+    /**
+     * Longest time, in milliseconds, that exec waits for each output
+     * gobbler to finish after the process has exited. Bounded so that a
+     * lingering descendant holding the pipe open cannot block exec
+     * forever.
+     */
+    private static final long GOBBLER_JOIN_MILLIS = 5000L;
+
+    protected ProcessUtils() {
+        super();
+    }
+
+    /**
+     * Thread to gobble up an output stream.
+     * See http://www.javaworld.com/javaworld/jw-12-2000/jw-1229-traps.html
+     */
+    protected class StreamGobbler extends Thread {
+        private final InputStream is;
+        private final StringBuffer sink = new StringBuffer();
+
+        protected StreamGobbler(InputStream is, String name) {
+            this.is = is;
+            setName(name);
+        }
+
+        /**
+         * Reads the stream line by line until it ends, appending each
+         * line to the sink. Line terminators are not kept, so consecutive
+         * lines run together in the collected text.
+         */
+        public void run() {
+            try {
+                BufferedReader br =
+                    new BufferedReader(new InputStreamReader(this.is));
+                for (String line = null; (line = br.readLine()) != null;) {
+                    this.sink.append(line);
+                }
+            } catch (IOException ioe) {
+                LOGGER.log(Level.WARNING,
+                    "Failed reading process " + getName(), ioe);
+            }
+        }
+
+        /**
+         * @return everything collected so far
+         */
+        public String getSink() {
+            return this.sink.toString();
+        }
+    }
+
+    /**
+     * Data structure to hold result of a process exec.
+     * @author stack
+     * @version $Date$ $Revision$
+     */
+    public class ProcessResult {
+        private final String [] args;
+        private final int result;
+        private final String stdout;
+        private final String stderr;
+
+        protected ProcessResult(String [] args, int result, String stdout,
+                String stderr) {
+            this.args = args;
+            this.result = result;
+            this.stderr = stderr;
+            this.stdout = stdout;
+        }
+
+        /**
+         * @return exit code of the process
+         */
+        public int getResult() {
+            return this.result;
+        }
+
+        /**
+         * @return text collected from the standard output of the process
+         */
+        public String getStdout() {
+            return this.stdout;
+        }
+
+        /**
+         * @return text collected from the standard error of the process
+         */
+        public String getStderr() {
+            return this.stderr;
+        }
+
+        /**
+         * @return the args, the exit code and any non-empty stderr and
+         * stdout text
+         */
+        public String toString() {
+            StringBuffer sb = new StringBuffer();
+            for (int i = 0; i < this.args.length; i++) {
+                sb.append(this.args[i]);
+                sb.append(", ");
+            }
+            return sb.toString() + " exit code: " + this.result +
+                ((this.stderr != null && this.stderr.length() > 0)?
+                    "\nSTDERR: " + this.stderr: "") +
+                ((this.stdout != null && this.stdout.length() > 0)?
+                    "\nSTDOUT: " + this.stdout: "");
+        }
+    }
+
+    /**
+     * Runs process and waits for it to exit.
+     * @param args List of process args.
+     * @return A ProcessResult data structure.
+     * @throws IOException If interrupted, we throw an IOException (with the
+     * InterruptedException as cause and the interrupt status of the thread
+     * restored). If non-zero exit code, we throw an IOException (This may
+     * need to change).
+     */
+    public static ProcessUtils.ProcessResult exec(String [] args)
+    throws IOException {
+        Process p = Runtime.getRuntime().exec(args);
+        ProcessUtils pu = new ProcessUtils();
+        // Gobble up any output.
+        StreamGobbler err = pu.new StreamGobbler(p.getErrorStream(), "stderr");
+        err.setDaemon(true);
+        err.start();
+        StreamGobbler out = pu.new StreamGobbler(p.getInputStream(), "stdout");
+        out.setDaemon(true);
+        out.start();
+        int exitVal;
+        try {
+            exitVal = p.waitFor();
+            // the process exiting does not mean its output has been read;
+            // let the gobblers drain the pipes before sampling the sinks
+            out.join(GOBBLER_JOIN_MILLIS);
+            err.join(GOBBLER_JOIN_MILLIS);
+        } catch (InterruptedException e) {
+            // keep the interrupt visible to callers further up the stack
+            Thread.currentThread().interrupt();
+            IOException ioe = new IOException("Wait on process "
+                + Arrays.toString(args) + " interrupted: " + e.getMessage());
+            ioe.initCause(e);
+            throw ioe;
+        }
+        ProcessUtils.ProcessResult result =
+            pu.new ProcessResult(args, exitVal, out.getSink(), err.getSink());
+        if (exitVal != 0) {
+            throw new IOException(result.toString());
+        } else if (LOGGER.isLoggable(Level.INFO)) {
+            LOGGER.info(result.toString());
+        }
+        return result;
+    }
+}
diff --git a/src/main/java/org/archive/util/ProgressStatisticsReporter.java b/src/main/java/org/archive/util/ProgressStatisticsReporter.java
new file mode 100644
index 00000000..dc1e51f7
--- /dev/null
+++ b/src/main/java/org/archive/util/ProgressStatisticsReporter.java
@@ -0,0 +1,36 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.util;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+
+/**
+ * Implemented by anything that can write a line of progress statistics,
+ * and a legend for that line, to a PrintWriter.
+ */
+public interface ProgressStatisticsReporter {
+
+    /**
+     * Write the statistics.
+     *
+     * @param writer Where to write statistics.
+     * @throws IOException
+     */
+    void progressStatisticsLine(PrintWriter writer) throws IOException;
+
+    /**
+     * Write the legend for the statistics.
+     *
+     * @param writer Where to write statistics legend.
+     * @throws IOException
+     */
+    void progressStatisticsLegend(PrintWriter writer) throws IOException;
+}
diff --git a/src/main/java/org/archive/util/PropertyUtils.java b/src/main/java/org/archive/util/PropertyUtils.java
new file mode 100644
index 00000000..083615f6
--- /dev/null
+++ b/src/main/java/org/archive/util/PropertyUtils.java
@@ -0,0 +1,114 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.util;
+
+import java.util.Properties;
+import java.util.regex.Matcher;
+
+import org.apache.commons.lang.StringUtils;
+
+/**
+ * Utilities for dealing with Java Properties (incl. System Properties)
+ *
+ * @contributor stack
+ * @contributor gojomo
+ * @version $Date$ $Revision$
+ */
+public class PropertyUtils {
+    /**
+     * Looks up a system property, treating an empty value like a missing
+     * one.
+     *
+     * @param key Property key.
+     * @return Named property or null if the property is null or empty.
+     */
+    public static String getPropertyOrNull(final String key) {
+        final String value = System.getProperty(key);
+        if (value == null || value.length() <= 0) {
+            return null;
+        }
+        return value;
+    }
+
+    /**
+     * @param key Property key.
+     * @return Boolean value of the system property, or false if it is
+     * null, empty or not readable as true.
+     */
+    public static boolean getBooleanProperty(final String key) {
+        final String value = getPropertyOrNull(key);
+        return value != null && Boolean.valueOf(value).booleanValue();
+    }
+
+    /**
+     * @param key Key to use looking up system property.
+     * @param fallback If no value found for passed key, return
+     * fallback.
+     * @return Value of property or fallback.
+     * @throws NumberFormatException if a value is present but is not
+     * parseable as an int
+     */
+    public static int getIntProperty(final String key, final int fallback) {
+        final String value = getPropertyOrNull(key);
+        if (value == null) {
+            return fallback;
+        }
+        return Integer.parseInt(value);
+    }
+
+    /**
+     * Given a string which may contain expressions of the form
+     * ${key}, replace each expression with the value corresponding to the
+     * given key in System Properties. If no value is present,
+     * the expression is replaced with the empty-string.
+     *
+     * @param original String
+     * @return modified String
+     */
+    public static String interpolateWithProperties(String original) {
+        return interpolateWithProperties(original,System.getProperties());
+    }
+
+    /** Regex for a ${key} expression; group 1 is the key. */
+    protected static String propRefPattern = "\\$\\{([^{}]+)\\}";
+
+    /**
+     * Given a string which may contain expressions of the form
+     * ${key}, replace each expression with the value corresponding to the
+     * given key in the supplied Properties instances. If no value is
+     * present, the expression is replaced with the empty-string.
+     *
+     * One expression is replaced per pass and the result is scanned again
+     * from the start, so expressions introduced by a replacement value are
+     * interpolated too.
+     *
+     * @param original String
+     * @param props Properties to try in order; first non-empty value
+     * found (if any) is used
+     * @return modified String
+     */
+    public static String interpolateWithProperties(String original,
+            Properties... props) {
+        String result = original;
+        // cap number of interpolations as guard against unending loop
+        final int maxPasses = original.length() * 2;
+        for (int pass = 0; pass < maxPasses; pass++) {
+            Matcher m = TextUtils.getMatcher(propRefPattern, result);
+            if (!m.find()) {
+                // nothing left to interpolate
+                break;
+            }
+            String key = m.group(1);
+            String value = "";
+            for (Properties candidate : props) {
+                value = candidate.getProperty(key, "");
+                if (StringUtils.isNotEmpty(value)) {
+                    break;
+                }
+            }
+            result = result.substring(0, m.start())
+                + value
+                + result.substring(m.end());
+        }
+        return result;
+    }
+}
diff --git a/src/main/java/org/archive/util/Recorder.java b/src/main/java/org/archive/util/Recorder.java
new file mode 100644
index 00000000..425344bb
--- /dev/null
+++ b/src/main/java/org/archive/util/Recorder.java
@@ -0,0 +1,593 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.util;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.nio.charset.Charset;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.zip.DeflaterInputStream;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.commons.httpclient.ChunkedInputStream;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
+import org.archive.io.GenericReplayCharSequence;
+import org.archive.io.RecordingInputStream;
+import org.archive.io.RecordingOutputStream;
+import org.archive.io.ReplayCharSequence;
+import org.archive.io.ReplayInputStream;
+
+import com.google.common.base.Charsets;
+
+
+/**
+ * Pairs together a RecordingInputStream and RecordingOutputStream
+ * to capture exactly a single HTTP transaction.
+ *
+ * Initially only supports HTTP/1.0 (one request, one response per stream)
+ *
+ * Call {@link #markContentBegin()} to demarc the transition between HTTP
+ * header and body.
+ *
+ * @author gojomo
+ */
+public class Recorder {
+ protected static Logger logger =
+ Logger.getLogger("org.archive.util.HttpRecorder");
+
+ private static final int DEFAULT_OUTPUT_BUFFER_SIZE = 16384;
+ private static final int DEFAULT_INPUT_BUFFER_SIZE = 524288;
+
+ private RecordingInputStream ris = null;
+ private RecordingOutputStream ros = null;
+
+ /**
+ * Backing file basename.
+ *
+ * Keep it around so can clean up backing files left on disk.
+ */
+ private String backingFileBasename = null;
+
+ /**
+ * Backing file output stream suffix.
+ */
+ private static final String RECORDING_OUTPUT_STREAM_SUFFIX = ".ros";
+
+ /**
+ * Backing file input stream suffix.
+ */
+ private static final String RECORDING_INPUT_STREAM_SUFFIX = ".ris";
+
+ /**
+ * recording-input (ris) content character encoding.
+ */
+ protected String characterEncoding = null;
+
+ /**
+ * Charset to use for CharSequence provision. Will be UTF-8 if no
+ * encoding ever requested; a Charset matching above characterEncoding
+ * if possible; ISO_8859 if above characterEncoding is unsatisfiable.
+ * TODO: unify to UTF-8 for unspecified and bad-specified cases?
+ * (current behavior is for consistency with our prior but perhaps not
+ * optimal behavior)
+ */
+ protected Charset charset = Charsets.UTF_8;
+
+ /** whether recording-input (ris) message-body is chunked */
+ protected boolean inputIsChunked = false;
+
+ /** recording-input (ris) entity content-encoding (eg gzip, deflate), if any */
+ protected String contentEncoding = null;
+
+ private ReplayCharSequence replayCharSequence;
+
+
+ /**
+ * Create a Recorder whose backing files live in the given directory.
+ *
+ * The directory is checked with ensure(File) first; a failure of that
+ * check surfaces as an IllegalStateException.
+ *
+ * @param tempDir Directory into which we drop backing files for
+ * recorded input and output.
+ * @param backingFilenameBase Backing filename base to which we'll append
+ * the suffixes ".ris" for the recorded input stream and
+ * ".ros" for the recorded output stream.
+ * @param outBufferSize Size of output buffer to use.
+ * @param inBufferSize Size of input buffer to use.
+ */
+ public Recorder(File tempDir, String backingFilenameBase,
+ int outBufferSize, int inBufferSize) {
+ this(new File(ensure(tempDir), backingFilenameBase),
+ outBufferSize, inBufferSize);
+ }
+
+
+    /**
+     * Run the writeable-directory check on the given directory and hand
+     * the same File back, so the call can be used inline in a constructor
+     * chain.
+     *
+     * @param tempDir directory to check
+     * @return the passed-in directory
+     * @throws IllegalStateException wrapping any IOException raised by the check
+     */
+    private static File ensure(File tempDir) {
+        try {
+            org.archive.util.FileUtils.ensureWriteableDirectory(tempDir);
+            return tempDir;
+        } catch (IOException ioe) {
+            throw new IllegalStateException(ioe);
+        }
+    }
+
+ public Recorder(File file, int outBufferSize, int inBufferSize) {
+ super();
+ this.backingFileBasename = file.getAbsolutePath();
+ this.ris = new RecordingInputStream(inBufferSize,
+ this.backingFileBasename + RECORDING_INPUT_STREAM_SUFFIX);
+ this.ros = new RecordingOutputStream(outBufferSize,
+ this.backingFileBasename + RECORDING_OUTPUT_STREAM_SUFFIX);
+ }
+
+    /**
+     * Create a Recorder using the default buffer sizes.
+     *
+     * @param tempDir
+     *            Directory into which we drop backing files for recorded input
+     *            and output.
+     * @param backingFilenameBase
+     *            Backing filename base to which we'll append the suffixes
+     *            ".ris" for the recorded input stream and
+     *            ".ros" for the recorded output stream.
+     */
+    public Recorder(File tempDir, String backingFilenameBase) {
+        // The four-argument constructor takes (outBufferSize, inBufferSize),
+        // in that order; the defaults were previously passed swapped.
+        this(tempDir, backingFilenameBase, DEFAULT_OUTPUT_BUFFER_SIZE,
+                DEFAULT_INPUT_BUFFER_SIZE);
+    }
+
+
+    /**
+     * Start recording the given stream through the internal
+     * RecordingInputStream.
+     *
+     * Per-input state left over from an earlier recording (character
+     * encoding, chunked flag, content-encoding) is reset first.
+     * open() throws an exception if RecordingInputStream is already open.
+     *
+     * @param is InputStream to wrap.
+     *
+     * @return The input stream wrapper which itself is an input stream.
+     * Pass this in place of the passed stream so input can be recorded.
+     *
+     * @throws IOException
+     */
+    public InputStream inputWrap(InputStream is)
+    throws IOException {
+        String threadName = Thread.currentThread().getName();
+        logger.fine(threadName + " wrapping input");
+
+        // forget everything learned about the previous input
+        this.contentEncoding = null;
+        this.inputIsChunked = false;
+        this.characterEncoding = null;
+
+        RecordingInputStream recording = this.ris;
+        recording.open(is);
+        return recording;
+    }
+
+ /**
+ * Wrap the provided stream with the internal RecordingOutputStream,
+ * so that everything written through the returned stream is recorded.
+ *
+ * open() throws an exception if RecordingOutputStream is already open.
+ *
+ * @param os The output stream to wrap.
+ *
+ * @return The output stream wrapper which is itself an output stream.
+ * Pass this in place of the passed stream so output can be recorded.
+ *
+ * @throws IOException if opening the recording stream fails
+ */
+ public OutputStream outputWrap(OutputStream os)
+ throws IOException {
+ this.ros.open(os);
+ return this.ros;
+ }
+
+ /**
+ * Close all streams.
+ *
+ * The recording input stream is closed first, then the recording output
+ * stream. Each close is attempted independently: an IOException from
+ * either one is logged at SEVERE level through DevUtils and is not
+ * propagated to the caller.
+ */
+ public void close() {
+ logger.fine(Thread.currentThread().getName() + " closing");
+ try {
+ this.ris.close();
+ } catch (IOException e) {
+ // TODO: Can we not let the exception out of here and report it
+ // higher up in the caller?
+ DevUtils.logger.log(Level.SEVERE, "close() ris" +
+ DevUtils.extraInfo(), e);
+ }
+ try {
+ this.ros.close();
+ } catch (IOException e) {
+ DevUtils.logger.log(Level.SEVERE, "close() ros" +
+ DevUtils.extraInfo(), e);
+ }
+ }
+
+ /**
+ * Return the internal RecordingInputStream.
+ *
+ * This is the same instance handed out by inputWrap(InputStream).
+ *
+ * @return The RecordingInputStream.
+ */
+ public RecordingInputStream getRecordedInput() {
+ return this.ris;
+ }
+
+ /**
+ * Return the internal RecordingOutputStream.
+ *
+ * This is the same instance handed out by outputWrap(OutputStream).
+ *
+ * @return The RecordingOutputStream.
+ */
+ public RecordingOutputStream getRecordedOutput() {
+ return this.ros;
+ }
+
+ /**
+ * Mark current position as the point where the HTTP headers end.
+ *
+ * Delegates to the recording input stream.
+ */
+ public void markContentBegin() {
+ this.ris.markContentBegin();
+ }
+
+ public long getResponseContentLength() {
+ return this.ris.getResponseContentLength();
+ }
+
+    /**
+     * Close both input and output recorders.
+     *
+     * Recorders are the output streams to which we are recording.
+     * {@link #close()} closes the stream that is being recorded and the
+     * recorder. This method explicitly closes the recorder only.
+     *
+     * The two recorders are closed independently, so a failure closing the
+     * input recorder no longer prevents the output recorder from being
+     * closed. Each IOException is handed to DevUtils.warnHandle.
+     */
+    public void closeRecorders() {
+        try {
+            this.ris.closeRecorder();
+        } catch (IOException e) {
+            DevUtils.warnHandle(e, "Convert to runtime exception?");
+        }
+        try {
+            this.ros.closeRecorder();
+        } catch (IOException e) {
+            DevUtils.warnHandle(e, "Convert to runtime exception?");
+        }
+    }
+
+    /**
+     * Release everything this recorder holds.
+     *
+     * Call when completely done w/ recorder. Closes the streams, then
+     * removes the output and input backing files if they were dropped
+     * on disk.
+     */
+    public void cleanup() {
+        close();
+        final String basename = this.backingFileBasename;
+        delete(basename + RECORDING_OUTPUT_STREAM_SUFFIX);
+        delete(basename + RECORDING_INPUT_STREAM_SUFFIX);
+    }
+
+    /**
+     * Delete file if exists.
+     *
+     * A file that exists but cannot be deleted is reported with a warning
+     * instead of being silently left behind.
+     *
+     * @param name Filename to delete.
+     */
+    private void delete(String name) {
+        File f = new File(name);
+        if (f.exists() && !f.delete()) {
+            logger.warning("unable to delete backing file " + name);
+        }
+    }
+
+
+    /**
+     * Per-thread slot holding the Recorder currently in use by that thread.
+     * Typed so that {@link #getHttpRecorder()} can return its value without
+     * a cast.
+     */
+    protected static ThreadLocal<Recorder> currentRecorder =
+        new ThreadLocal<Recorder>();
+
+    /**
+     * Associate the given Recorder with the current thread.
+     *
+     * @param httpRecorder Recorder to remember for the calling thread.
+     */
+    public static void setHttpRecorder(Recorder httpRecorder) {
+        currentRecorder.set(httpRecorder);
+    }
+
+    /**
+     * Get the current thread's Recorder.
+     *
+     * @return This thread's Recorder. Returns null if none has been set
+     * for the current thread.
+     */
+    public static Recorder getHttpRecorder() {
+        return currentRecorder.get();
+    }
+
+ /**
+ * Set the Charset used when the recorded input is replayed as characters.
+ *
+ * @param cs Charset to use for subsequent character replays.
+ */
+ public void setCharset(Charset cs) {
+ this.charset = cs;
+ }
+
+ /**
+ * @return effective Charset of input recording, as last set through
+ * setCharset(Charset) or the initial default
+ */
+ public Charset getCharset() {
+ return this.charset;
+ }
+
+ /**
+ * @param chunked whether the recorded input's message-body uses chunked
+ * transfer-encoding; when true, entity replays are un-chunked on read.
+ */
+ public void setInputIsChunked(boolean chunked) {
+ this.inputIsChunked = chunked;
+ }
+
+    /** Lower-case names of the content-encodings this class accepts. */
+    protected static Set<String> SUPPORTED_ENCODINGS = new HashSet<String>();
+    static {
+        SUPPORTED_ENCODINGS.add("gzip");
+        SUPPORTED_ENCODINGS.add("x-gzip");
+        SUPPORTED_ENCODINGS.add("deflate");
+        SUPPORTED_ENCODINGS.add("identity");
+        SUPPORTED_ENCODINGS.add("none"); // unofficial but common
+    }
+    /**
+     * Declare the content-encoding of the recorded input. The value is
+     * matched case-insensitively and stored in lower case.
+     *
+     * @param contentEncoding declared content-encoding of input recording.
+     * @throws IllegalArgumentException if the encoding is not one of
+     * {@link #SUPPORTED_ENCODINGS}
+     */
+    public void setContentEncoding(String contentEncoding) {
+        // Locale.ROOT keeps the folding independent of the default locale
+        // (e.g. Turkish would turn "GZIP" into a string with a dotless i).
+        String lowerCoding = contentEncoding.toLowerCase(java.util.Locale.ROOT);
+        if(!SUPPORTED_ENCODINGS.contains(lowerCoding)) {
+            throw new IllegalArgumentException("contentEncoding unsupported: "+contentEncoding);
+        }
+        this.contentEncoding = lowerCoding;
+    }
+
+ /**
+ * @return Returns the declared content-encoding of the input recording
+ * (lower-cased), or null if none has been set.
+ */
+ public String getContentEncoding() {
+ return this.contentEncoding;
+ }
+
+
+    /**
+     * Legacy name for {@link #getContentReplayCharSequence()}.
+     *
+     * @return the ReplayCharSequence returned by
+     * {@link #getContentReplayCharSequence()}
+     * @throws IOException
+     * @deprecated use getContentReplayCharSequence
+     */
+    @Deprecated
+    public ReplayCharSequence getReplayCharSequence() throws IOException {
+        return getContentReplayCharSequence();
+    }
+
+ /**
+ * @return A ReplayCharSequence. Caller may call
+ * {@link ReplayCharSequence#close()} when finished. However, in
+ * heritrix, the ReplayCharSequence is closed automatically when url
+ * processing has finished; in that context it's preferable not
+ * to close, so that processors can reuse the same instance.
+ * @throws IOException
+ * @see {@link #endReplays()}
+ */
+ public ReplayCharSequence getContentReplayCharSequence() throws IOException {
+ if (replayCharSequence == null || !replayCharSequence.isOpen()
+ || !replayCharSequence.getCharset().equals(charset)) {
+ if(replayCharSequence!=null && replayCharSequence.isOpen()) {
+ // existing sequence must not have matched now-configured Charset; close
+ replayCharSequence.close();
+ }
+ replayCharSequence = getContentReplayCharSequence(this.charset);
+ }
+ return replayCharSequence;
+ }
+
+
+ /**
+ * @param characterEncoding Encoding of recorded stream.
+ * @return A ReplayCharSequence Will return null if an IOException. Call
+ * close on returned RCS when done.
+ * @throws IOException
+ */
+ public ReplayCharSequence getContentReplayCharSequence(Charset requestedCharset) throws IOException {
+ // raw data overflows to disk; use temp file
+ InputStream ris = getContentReplayInputStream();
+ ReplayCharSequence rcs = new GenericReplayCharSequence(
+ ris,
+ calcRecommendedCharBufferSize(this.getRecordedInput()),
+ this.backingFileBasename + RECORDING_OUTPUT_STREAM_SUFFIX,
+ requestedCharset);
+ ris.close();
+ return rcs;
+ }
+
+ /**
+ * Calculate a recommended size for an in-memory decoded-character buffer
+ * of this content. We seek a size that is itself no larger (in 2-byte chars)
+ * than the memory already used by the RecordingInputStream's internal raw
+ * byte buffer, and also no larger than likely necessary. So, we take the
+ * minimum of the actual recorded byte size and half of the
+ * RecordingInputStream's recorded buffer length (half, because each
+ * char takes two bytes).
+ *
+ * @param inStream RecordingInputStream whose buffer length and size are consulted
+ * @return int length for in-memory decoded-character buffer
+ */
+ static protected int calcRecommendedCharBufferSize(RecordingInputStream inStream) {
+ return (int) Math.min(inStream.getRecordedBufferLength()/2, inStream.getSize());
+ }
+
+ /**
+ * Get a raw replay of all recorded data (including, for example, HTTP
+ * protocol headers). Delegates to the recording input stream.
+ *
+ * @return A replay input stream.
+ * @throws IOException
+ */
+ public ReplayInputStream getReplayInputStream() throws IOException {
+ return getRecordedInput().getReplayInputStream();
+ }
+
+ /**
+ * Get a raw replay of the 'message-body'. For the common case of
+ * HTTP, this is the raw, possibly chunked-transfer-encoded message
+ * contents not including the leading headers. Delegates to the
+ * recording input stream.
+ *
+ * @return A replay input stream.
+ * @throws IOException
+ */
+ public ReplayInputStream getMessageBodyReplayInputStream() throws IOException {
+ return getRecordedInput().getMessageBodyReplayInputStream();
+ }
+
+ /**
+ * Get a raw replay of the 'entity'. For the common case of
+ * HTTP, this is the message-body after any (usually-unnecessary)
+ * transfer-decoding but before any content-encoding (eg gzip) decoding
+ *
+ * @return A replay input stream.
+ * @throws IOException
+ */
+ public InputStream getEntityReplayInputStream() throws IOException {
+ if(inputIsChunked) {
+ return new ChunkedInputStream(getRecordedInput().getMessageBodyReplayInputStream());
+ } else {
+ return getRecordedInput().getMessageBodyReplayInputStream();
+ }
+ }
+
+    /**
+     * Get a replay cued up for the 'content' (after all leading headers),
+     * with the declared content-encoding removed.
+     *
+     * gzip/x-gzip content is gunzipped and deflate content is inflated;
+     * identity, none, or no declared encoding returns the entity as-is.
+     * If the gzip header cannot be read, a warning is logged and the raw
+     * entity is returned instead.
+     *
+     * @return A replay input stream.
+     * @throws IOException
+     */
+    public InputStream getContentReplayInputStream() throws IOException {
+        InputStream entityStream = getEntityReplayInputStream();
+        if(StringUtils.isEmpty(contentEncoding)) {
+            return entityStream;
+        } else if ("gzip".equalsIgnoreCase(contentEncoding) || "x-gzip".equalsIgnoreCase(contentEncoding)) {
+            try {
+                return new GZIPInputStream(entityStream);
+            } catch (IOException ioe) {
+                logger.log(Level.WARNING,"gzip problem; using raw entity instead",ioe);
+                IOUtils.closeQuietly(entityStream); // close partially-read stream
+                return getEntityReplayInputStream();
+            }
+        } else if ("deflate".equalsIgnoreCase(contentEncoding)) {
+            // Decoding needs an *inflater*; DeflaterInputStream would
+            // compress the already-compressed entity a second time.
+            return new java.util.zip.InflaterInputStream(entityStream);
+        } else if ("identity".equalsIgnoreCase(contentEncoding) || "none".equalsIgnoreCase(contentEncoding)) {
+            return entityStream;
+        } else {
+            // shouldn't be reached given check on setContentEncoding
+            logger.log(Level.INFO,"Unknown content-encoding '"+contentEncoding+"' declared; using raw entity instead");
+            return entityStream;
+        }
+    }
+
+ /**
+ * Return a short prefix of the presumed-textual content as a String,
+ * decoded with this recorder's current Charset.
+ *
+ * @param size max length of String to return
+ * @return String prefix, or empty String (with logged exception) on any error
+ */
+ public String getContentReplayPrefixString(int size) {
+ return getContentReplayPrefixString(size, this.charset);
+ }
+
+    /**
+     * Return a short prefix of the presumed-textual content as a String.
+     *
+     * The prefix is whatever a single read of up to <code>size</code>
+     * characters yields. The reader is always closed, and a failure while
+     * closing does not discard a prefix that was read successfully.
+     *
+     * @param size max length of String to return
+     * @param cs Charset to decode the content with
+     * @return String prefix, or empty String (with logged exception) on any error
+     */
+    public String getContentReplayPrefixString(int size, Charset cs) {
+        InputStreamReader isr = null;
+        try {
+            isr = new InputStreamReader(getContentReplayInputStream(), cs);
+            char[] chars = new char[size];
+            int count = isr.read(chars);
+            if (count > 0) {
+                return new String(chars,0,count);
+            } else {
+                return "";
+            }
+        } catch (IOException e) {
+            logger.log(Level.SEVERE,"unable to get replay prefix string", e);
+            return "";
+        } finally {
+            // null-safe; also covers the path where read() threw
+            IOUtils.closeQuietly(isr);
+        }
+    }
+
+ /**
+ * Copy the recorded content (as returned by
+ * getContentReplayInputStream()) into the given file. Both streams are
+ * closed quietly when done, whether or not the copy succeeded.
+ *
+ * @param tempFile File to write the content to
+ * @throws IOException if the content cannot be read or the file written
+ */
+ public void copyContentBodyTo(File tempFile) throws IOException {
+ InputStream inStream = null;
+ OutputStream outStream = null;
+ try {
+ inStream = getContentReplayInputStream();
+ outStream = FileUtils.openOutputStream(tempFile);
+ IOUtils.copy(inStream, outStream);
+ } finally {
+ IOUtils.closeQuietly(inStream);
+ IOUtils.closeQuietly(outStream);
+ }
+ }
+
+    /**
+     * Record the input stream for later playback by an extractor, etc.
+     * This is convenience method used to setup an artificial Recorder
+     * scenario used in unit tests, etc. The stream is read to its end and
+     * the recording stream is closed before returning; it is also closed
+     * when reading fails, so a failed call does not leave it open.
+     *
+     * @param dir Directory to write backing file to.
+     * @param basename of what we're recording.
+     * @param in Stream to read.
+     * @param encoding Stream encoding; ignored when null or empty.
+     * @throws IOException
+     * @return An {@link org.archive.util.Recorder}.
+     */
+    public static Recorder wrapInputStreamWithHttpRecord(File dir,
+        String basename, InputStream in, String encoding)
+    throws IOException {
+        Recorder rec = new Recorder(dir, basename);
+        if (encoding != null && encoding.length() > 0) {
+            rec.setCharset(Charset.forName(encoding));
+        }
+        // Do not use FastBufferedInputStream here. It does not
+        // support mark.
+        InputStream is = rec.inputWrap(new BufferedInputStream(in));
+        final int BUFFER_SIZE = 1024 * 4;
+        byte [] buffer = new byte[BUFFER_SIZE];
+        boolean drained = false;
+        try {
+            // Just read it all down.
+            while (is.read(buffer) != -1) {
+                // nothing to do with the bytes; reading records them
+            }
+            drained = true;
+        } finally {
+            if (!drained) {
+                // keep the read failure as the reported exception
+                IOUtils.closeQuietly(is);
+            }
+        }
+        is.close();
+        return rec;
+    }
+
+    /**
+     * Finish with all replays: quietly close and drop the cached
+     * ReplayCharSequence, then clear both recording streams for reuse.
+     * IOExceptions from clearing are deliberately ignored.
+     */
+    public void endReplays() {
+        ArchiveUtils.closeQuietly(replayCharSequence);
+        replayCharSequence = null;
+
+        try {
+            ris.clearForReuse();
+        } catch (IOException ignored) {
+            // like closeQuietly
+        }
+
+        try {
+            ros.clearForReuse();
+        } catch (IOException ignored) {
+            // like closeQuietly
+        }
+    }
+}
diff --git a/src/main/java/org/archive/util/Reporter.java b/src/main/java/org/archive/util/Reporter.java
new file mode 100644
index 00000000..2fcb8cd8
--- /dev/null
+++ b/src/main/java/org/archive/util/Reporter.java
@@ -0,0 +1,56 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.util;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.Map;
+
+public interface Reporter {
+ /**
+ * Make a default report to the passed-in Writer. Should
+ * be equivalent to reportTo(null, writer)
+ *
+ * @param writer to receive report
+ * @throws IOException if writing the report fails
+ */
+ public void reportTo(PrintWriter writer) throws IOException;
+
+ /**
+ * Write a short single-line summary report
+ *
+ * @param pw writer to receive report
+ * @throws IOException if writing the report fails
+ */
+ @Deprecated
+ public void shortReportLineTo(PrintWriter pw) throws IOException;
+
+
+ /**
+ * @return Same data that's in the single line report, as key-value pairs
+ */
+ public Map shortReportMap();
+
+
+ /**
+ * Return a legend for the single-line summary report as a String.
+ *
+ * @return String single-line summary legend
+ */
+ public String shortReportLegend();
+}
diff --git a/src/main/java/org/archive/util/anvl/ANVLRecord.java b/src/main/java/org/archive/util/anvl/ANVLRecord.java
new file mode 100644
index 00000000..de2d3101
--- /dev/null
+++ b/src/main/java/org/archive/util/anvl/ANVLRecord.java
@@ -0,0 +1,336 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.util.anvl;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.archive.io.UTF8Bytes;
+
+/**
+ * An ordered {@link List} with 'data' {@link Element} values.
+ * ANVLRecords end with a blank line.
+ *
+ * @see A Name-Value
+ * Language (ANVL)
+ * @author stack
+ */
+public class ANVLRecord extends LinkedList implements UTF8Bytes {
+ private static final Logger logger =
+ Logger.getLogger(ANVLRecord.class.getName());
+
+ public static final String MIMETYPE = "application/warc-fields";
+
+ public static final ANVLRecord EMPTY_ANVL_RECORD = new ANVLRecord();
+
+ /**
+ * Arbitrary upper bound on maximum size of ANVL Record.
+ * Will throw an IOException if exceed this size.
+ */
+ public static final long MAXIMUM_SIZE = 1024 * 10;
+
+ /**
+ * An ANVL 'newline'.
+ * @see http://en.wikipedia.org/wiki/CRLF
+ */
+ protected static final String CRLF = "\r\n";
+
+ protected static final String FOLD_PREFIX = CRLF + ' ';
+
+    /** Create an empty record. */
+    public ANVLRecord() {
+        super();
+    }
+
+    /**
+     * Create a record holding the given elements, in the collection's
+     * iteration order.
+     *
+     * @param c elements to copy into the new record
+     */
+    public ANVLRecord(Collection<? extends Element> c) {
+        super(c);
+    }
+
+    /**
+     * Create an empty record.
+     *
+     * @param initialCapacity ignored; the backing list has no capacity
+     * @deprecated use {@link #ANVLRecord()}
+     */
+    @Deprecated
+    public ANVLRecord(int initialCapacity) {
+        super();
+    }
+
+ public boolean addLabel(final String l) {
+ return super.add(new Element(new Label(l)));
+ }
+
+    /**
+     * Append a label-plus-value data element.
+     *
+     * If the label or value is rejected with an IllegalArgumentException,
+     * a warning is logged and nothing is added.
+     *
+     * @param l label text
+     * @param v value text
+     * @return result of the underlying list add, or false if the label or
+     * value was rejected
+     */
+    public boolean addLabelValue(final String l, final String v) {
+        final Element labelled;
+        try {
+            labelled = new Element(new Label(l), new Value(v));
+            return super.add(labelled);
+        } catch (IllegalArgumentException rejected) {
+            logger.log(Level.WARNING, "bad label " + l + " or value " + v, rejected);
+            return false;
+        }
+    }
+
+    /**
+     * Serialize the record: each element followed by CRLF, then one more
+     * CRLF as the terminating blank line. An empty record is a lone CRLF.
+     */
+    @Override
+    public String toString() {
+        // TODO: What to emit for empty ANVLRecord?
+        StringBuilder out = new StringBuilder();
+        for (Object element : this) {
+            out.append(element).append(CRLF);
+        }
+        // 'ANVL Records end in a blank line'.
+        return out.append(CRLF).toString();
+    }
+
+    /**
+     * View this record as a label-to-value map.
+     *
+     * Label-only elements map to null. If a label occurs more than once,
+     * the last occurrence wins. The map is a new HashMap, so element
+     * order is not preserved.
+     *
+     * @return new map of label strings to value strings
+     */
+    public Map<String, String> asMap() {
+        Map<String, String> m = new HashMap<String, String>(size());
+        for (final Iterator<Element> i = iterator(); i.hasNext();) {
+            Element e = i.next();
+            m.put(e.getLabel().toString(),
+                e.isValue()? e.getValue().toString(): (String)null);
+        }
+        return m;
+    }
+
+ @Override
+ public ANVLRecord clone() {
+ return (ANVLRecord) super.clone();
+ }
+
+ /**
+ * @return This ANVLRecord as UTF8 bytes: the bytes of {@link #toString()},
+ * including the terminating blank line.
+ * @throws UnsupportedEncodingException if the UTF8 encoding name is not supported
+ */
+ public byte [] getUTF8Bytes()
+ throws UnsupportedEncodingException {
+ return toString().getBytes(UTF8);
+ }
+
+ /**
+ * Parses a single ANVLRecord from passed InputStream.
+ * Read as a single-byte stream until we get to a CRLFCRLF which
+ * signifies End-of-ANVLRecord. Then parse all read as a UTF-8 Stream.
+ * Doing it this way, while requiring a double-scan, it makes it so do not
+ * need to be passed a RepositionableStream or a Stream that supports
+ * marking. Also no danger of over-reading which can happen when we
+ * wrap passed Stream with an InputStreamReader for doing UTF-8
+ * character conversion (See the ISR class comment).
+ * @param is InputStream
+ * @return An ANVLRecord instance.
+ * @throws IOException if the stream ends, or more than
+ * {@link #MAXIMUM_SIZE} bytes are read, before the terminating CRLFCRLF
+ * is found.
+ */
+ public static ANVLRecord load(final InputStream is)
+ throws IOException {
+ // It doesn't look like a CRLF sequence is possible in UTF-8 without
+ // it signifying CRLF: The top bits are set in multibyte characters.
+ // Was thinking of recording CRLF as I was running through this first
+ // parse but the offsets would then be incorrect if any multibyte
+ // characters in the intervening gaps between CRLF.
+ boolean isCRLF = false;
+ boolean recordStart = false;
+ ByteArrayOutputStream baos = new ByteArrayOutputStream(1024);
+ boolean done = false;
+ int read = 0;
+ for (int c = -1, previousCharacter; !done;) {
+ if (read++ >= MAXIMUM_SIZE) {
+ throw new IOException("Read " + MAXIMUM_SIZE +
+ " bytes without finding \\r\\n\\r\\n " +
+ "End-Of-ANVLRecord");
+ }
+ previousCharacter = c;
+ c = is.read();
+ if (c == -1) {
+ throw new IOException("End-Of-Stream before \\r\\n\\r\\n " +
+ "End-Of-ANVLRecord:\n" +
+ new String(baos.toByteArray(), UTF8));
+ }
+ if (isLF((char)c) && isCR((char)previousCharacter)) {
+ if (isCRLF) {
+ // If we just had a CRLF, then its two CRLFs and its end of
+ // record. We're done.
+ done = true;
+ } else {
+ isCRLF = true;
+ }
+ } else if (!recordStart && Character.isWhitespace(c)) {
+ // Skip any whitespace at start of ANVLRecord.
+ // (Skipped bytes are not buffered but do count toward MAXIMUM_SIZE.)
+ continue;
+ } else {
+ // Clear isCRLF flag if this character is NOT a '\r'.
+ if (isCRLF && !isCR((char)c)) {
+ isCRLF = false;
+ }
+ // Not whitespace so start record if we haven't already.
+ if (!recordStart) {
+ recordStart = true;
+ }
+ }
+ baos.write(c);
+ }
+ // Second scan: decode the buffered bytes and parse them as a String.
+ return load(new String(baos.toByteArray(), UTF8));
+ }
+
+    /**
+     * Parse passed String for an ANVL Record.
+     * Looked at writing javacc grammer but preprocessing is required to
+     * handle folding: See
+     * https://javacc.dev.java.net/servlets/BrowseList?list=users&by=thread&from=56173.
+     * Looked at Terence Parr's ANTLR. More capable. Can set lookahead count.
+     * A value of 3 would help with folding. But its a pain defining UNICODE
+     * grammers -- needed by ANVL -- and support seems incomplete
+     * anyways: http://www.doc.ic.ac.uk/lab/secondyear/Antlr/lexer.html#unicode.
+     * For now, go with the below hand-rolled parser.
+     *
+     * Parsing stops at the first blank line (CRLFCRLF). Input that is
+     * cut off where the parser needs to look ahead is reported with an
+     * IOException rather than a StringIndexOutOfBoundsException.
+     *
+     * @param s String with an ANVLRecord.
+     * @return ANVLRecord parsed from passed String.
+     * @throws IOException if the record is truncated
+     */
+    public static ANVLRecord load(final String s)
+    throws IOException {
+        ANVLRecord record = new ANVLRecord();
+        boolean inValue = false, inLabel = false, inComment = false,
+            inNewLine = false;
+        String label = null;
+        StringBuilder sb = new StringBuilder(s.length());
+        for (int i = 0; i < s.length(); i++) {
+            char c = s.charAt(i);
+
+            // Assert I can do look-ahead. Every '\r' is followed by a
+            // one-character look-ahead below, so a '\r' in last position
+            // means the record was cut short.
+            if (isCR(c) && (i + 1) >= s.length()) {
+                throw new IOException("Premature End-of-ANVLRecord:\n" +
+                    s.substring(i));
+            }
+
+            // If at LF of a CRLF, just go around again. Eat up the LF.
+            if (inNewLine && isLF(c)) {
+                continue;
+            }
+
+            // If we're at a CRLF and we were just on one, exit. Found Record.
+            if (inNewLine && isCR(c) && isLF(s.charAt(i + 1))) {
+                break;
+            }
+
+            // Check if we're on a fold inside a Value. Skip multiple white
+            // space after CRLF.
+            if (inNewLine && inValue && Character.isWhitespace(c)) {
+                continue;
+            }
+
+            // Else set flag if we're at a CRLF.
+            inNewLine = isCR(c) && isLF(s.charAt(i + 1));
+
+            if (inNewLine) {
+                if (inComment) {
+                    inComment = false;
+                } else if (label != null && !inValue) {
+                    // Label only 'data element'.
+                    record.addLabel(label);
+                    label = null;
+                    sb.setLength(0);
+                } else if (inValue) {
+                    // Assert I can do look-ahead past current CRLF: the
+                    // characters at i + 2 and i + 3 are inspected below,
+                    // so both indexes must exist.
+                    if ((i + 3) >= s.length()) {
+                        throw new IOException("Premature End-of-ANVLRecord "
+                            + "(2):\n" + s.substring(i));
+                    }
+                    if (!isCR(s.charAt(i + 2)) && !isLF(s.charAt(i + 3))
+                            && Character.isWhitespace(s.charAt(i + 2))) {
+                        // Its a fold. Let it go around. But add in a CRLF and
+                        // space and do it here. We don't let CRLF fall through
+                        // to the sb.append on the end of this loop.
+                        sb.append(CRLF);
+                        sb.append(' ');
+                    } else {
+                        // Next line is a new SubElement, a new Comment or
+                        // Label.
+                        record.addLabelValue(label, sb.toString());
+                        sb.setLength(0);
+                        label = null;
+                        inValue = false;
+                    }
+                } else {
+                    // We're whitespace between label and value or whitespace
+                    // before we've figured whether label or comment.
+                }
+                // Don't let the '\r' or CRLF through.
+                continue;
+            }
+
+            if (inComment) {
+                continue;
+            } else if (inLabel) {
+                if (c == Label.COLON) {
+                    label = sb.toString();
+                    sb.setLength(0);
+                    inLabel = false;
+                    continue;
+                }
+            } else {
+                if (!inLabel && !inValue && !inComment) {
+                    // We have no state. Figure one.
+                    if (Character.isWhitespace(c)) {
+                        // If no state, and whitespace, skip. Don't record.
+                        continue;
+                    } else if (label == null && c == '#') {
+                        inComment = true;
+                        // Don't record comments.
+                        continue;
+                    } else if (label == null) {
+                        inLabel = true;
+                    } else {
+                        inValue = true;
+                    }
+                }
+            }
+            sb.append(c);
+        }
+        return record;
+    }
+
+    /**
+     * Count the bytes of this record's UTF-8 serialization.
+     *
+     * @return Count of ANVLRecord bytes. Expensive, since it makes a
+     * String of the record so it can count bytes.
+     * @throws RuntimeException wrapping an UnsupportedEncodingException
+     * from the serialization
+     */
+    public synchronized int getLength() {
+        try {
+            return getUTF8Bytes().length;
+        } catch (UnsupportedEncodingException unsupported) {
+            throw new RuntimeException(unsupported);
+        }
+    }
+
+    /** @return true if the character is a carriage return or a line feed */
+    public static boolean isCROrLF(final char c) {
+        if (isCR(c)) {
+            return true;
+        }
+        return isLF(c);
+    }
+
+    /** @return true if the character is the first character of {@link #CRLF} */
+    public static boolean isCR(final char c) {
+        final char carriageReturn = ANVLRecord.CRLF.charAt(0);
+        return carriageReturn == c;
+    }
+
+    /** @return true if the character is the second character of {@link #CRLF} */
+    public static boolean isLF(final char c) {
+        final char lineFeed = ANVLRecord.CRLF.charAt(1);
+        return lineFeed == c;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/anvl/Element.java b/src/main/java/org/archive/util/anvl/Element.java
new file mode 100644
index 00000000..5881fa9b
--- /dev/null
+++ b/src/main/java/org/archive/util/anvl/Element.java
@@ -0,0 +1,73 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.util.anvl;
+
+
+/**
+ * ANVL 'data element'.
+ * Holds either a lone {@link Label}, or a {@link Label} followed by a
+ * {@link Value}. Rendered as <code>label:</code> or
+ * <code>label: value</code>.
+ *
+ * @author stack
+ * @see Label
+ * @see Value
+ */
+public class Element {
+    /** Position 0 is the label; position 1, when present, is the value. */
+    private final SubElement [] subElements;
+
+    /**
+     * Create a label-only element.
+     * @param l the label
+     */
+    public Element(final Label l) {
+        this.subElements = new SubElement [] {l};
+    }
+
+    /**
+     * Create a label-plus-value element.
+     * @param l the label
+     * @param v the value
+     */
+    public Element(final Label l, final Value v) {
+        this.subElements = new SubElement [] {l, v};
+    }
+
+    /**
+     * @return true if this element carries a value as well as a label
+     */
+    public boolean isValue() {
+        return this.subElements.length > 1;
+    }
+
+    /**
+     * @return this element's label
+     */
+    public Label getLabel() {
+        return (Label)this.subElements[0];
+    }
+
+    /**
+     * @return this element's value, or null for a label-only element
+     */
+    public Value getValue() {
+        return isValue() ? (Value)this.subElements[1] : null;
+    }
+
+    /**
+     * @return the label followed by a colon and, when there is a value,
+     * a space and the value
+     */
+    @Override
+    public String toString() {
+        StringBuilder text = new StringBuilder();
+        // Add colon after Label.
+        text.append(subElements[0].toString()).append(':');
+        if (isValue()) {
+            // Add space to intro the value.
+            text.append(' ');
+            text.append(subElements[1].toString());
+        }
+        return text.toString();
+    }
+}
diff --git a/src/main/java/org/archive/util/anvl/Label.java b/src/main/java/org/archive/util/anvl/Label.java
new file mode 100644
index 00000000..fdadb735
--- /dev/null
+++ b/src/main/java/org/archive/util/anvl/Label.java
@@ -0,0 +1,41 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.util.anvl;
+
+class Label extends SubElement {
+ public static final char COLON = ':';
+
+ @SuppressWarnings("unused")
+ private Label() {
+ this(null);
+ }
+
+ public Label(final String s) {
+ super(s);
+ }
+
+ @Override
+ protected void checkCharacter(char c, String srcStr, int index) {
+ super.checkCharacter(c, srcStr, index);
+ if (c == COLON) {
+ throw new IllegalArgumentException("Label cannot contain " + COLON);
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/anvl/SubElement.java b/src/main/java/org/archive/util/anvl/SubElement.java
new file mode 100644
index 00000000..33b9e9bb
--- /dev/null
+++ b/src/main/java/org/archive/util/anvl/SubElement.java
@@ -0,0 +1,78 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.util.anvl;
+
+/**
+ * Abstract ANVL 'data element' sub-part.
+ * Subclass to make a Comment, a Label, or a Value.
+ *
+ * <p>The text is checked once, at construction, by
+ * {@link #baseCheck(String)}; whatever that method returns is what
+ * {@link #toString()} gives back. Because the check is called from this
+ * class's constructor, an overriding <code>baseCheck</code> or
+ * <code>checkCharacter</code> runs before the subclass's own field
+ * initializers and constructor body.
+ *
+ * @author stack
+ */
+abstract class SubElement {
+    /** Text of this sub-element as returned by baseCheck. */
+    private final String e;
+
+    /**
+     * Hands null to {@link #SubElement(String)}; the base
+     * {@link #baseCheck(String)} rejects null.
+     */
+    protected SubElement() {
+        this(null);
+    }
+
+    /**
+     * @param s Text of this sub-element. It is run through
+     * {@link #baseCheck(String)} and the result is kept.
+     */
+    public SubElement(final String s) {
+        this.e = baseCheck(s);
+    }
+
+    /**
+     * Checks the passed text: it must not be null and each of its
+     * characters must pass {@link #checkCharacter(char, String, int)}.
+     *
+     * @param s Text to check.
+     * @return The text to keep. This implementation returns <code>s</code>.
+     * @throws IllegalArgumentException If <code>s</code> is null or
+     * contains a character that fails the check.
+     */
+    protected String baseCheck(final String s) {
+        if (s == null) {
+            throw new IllegalArgumentException("Can't be null");
+        }
+        // Run every per-character check (not only the CR/LF one).
+        final int length = s.length();
+        for (int i = 0; i < length; i++) {
+            checkCharacter(s.charAt(i), s, i);
+        }
+        return s;
+    }
+
+    /**
+     * Checks one character. This implementation runs the control character
+     * check and then the CR/LF check.
+     *
+     * @param c Character to check.
+     * @param srcStr String that <code>c</code> came from.
+     * @param index Offset of <code>c</code> in <code>srcStr</code>.
+     * @throws IllegalArgumentException If <code>c</code> is not allowed.
+     */
+    protected void checkCharacter(final char c, final String srcStr,
+            final int index) {
+        checkControlCharacter(c, srcStr, index);
+        checkCRLF(c, srcStr, index);
+    }
+
+    /**
+     * Rejects ISO control characters that are not also whitespace.
+     * Whitespace control characters (tab, CR, LF and so on) pass this
+     * check.
+     *
+     * @param c Character to check.
+     * @param srcStr String that <code>c</code> came from; used in the
+     * exception message.
+     * @param index Offset of <code>c</code> in <code>srcStr</code>; not
+     * used by this implementation.
+     * @throws IllegalArgumentException If <code>c</code> is a
+     * non-whitespace control character.
+     */
+    protected void checkControlCharacter(final char c, final String srcStr,
+            final int index) {
+        // There is no code point validity test here: every char value is
+        // a valid code point, so such a test could never fail. The message
+        // text is left as it was.
+        if (Character.isISOControl(c) && !Character.isWhitespace(c)) {
+            throw new IllegalArgumentException(srcStr +
+                " contains a control character(s) or invalid code point: 0x" +
+                Integer.toHexString(c));
+        }
+    }
+
+    /**
+     * Rejects characters for which <code>ANVLRecord.isCROrLF</code> is
+     * true.
+     *
+     * @param c Character to check.
+     * @param srcStr String that <code>c</code> came from; used in the
+     * exception message.
+     * @param index Offset of <code>c</code> in <code>srcStr</code>; not
+     * used by this implementation.
+     * @throws IllegalArgumentException If <code>c</code> is a CR or LF.
+     */
+    protected void checkCRLF(final char c, final String srcStr,
+            final int index) {
+        if (ANVLRecord.isCROrLF(c)) {
+            throw new IllegalArgumentException(srcStr +
+                " contains disallowed CRLF control character(s): 0x" +
+                Integer.toHexString(c));
+        }
+    }
+
+    /**
+     * @return Text of this sub-element, as kept at construction.
+     */
+    @Override
+    public String toString() {
+        return e;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/anvl/Value.java b/src/main/java/org/archive/util/anvl/Value.java
new file mode 100644
index 00000000..2a650ba2
--- /dev/null
+++ b/src/main/java/org/archive/util/anvl/Value.java
@@ -0,0 +1,71 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.util.anvl;
+
+/**
+ * TODO: Now values 'fold' but should but perhaps they shouldn't be stored
+ * folded. Only when we serialize should we fold (But how to know where
+ * to fold?).
+ * @author stack
+ * @version $Date$ $Version$
+ */
+class Value extends SubElement {
+
+ private StringBuilder sb;
+ private boolean folding = false;
+
+ @SuppressWarnings("unused")
+ private Value() {
+ this(null);
+ }
+
+ public Value(final String s) {
+ super(s);
+ }
+
+ protected String baseCheck(String s) {
+ this.sb = new StringBuilder(s.length() * 2);
+ super.baseCheck(s);
+ return sb.toString();
+ }
+
+ @Override
+ protected void checkCharacter(char c, String srcStr, int index) {
+ checkControlCharacter(c, srcStr, index);
+ // Now, rewrite the value String with folding (If CR or LF or CRLF
+ // present.
+ if (ANVLRecord.isCR(c)) {
+ this.folding = true;
+ this.sb.append(ANVLRecord.FOLD_PREFIX);
+ } else if (ANVLRecord.isLF(c)) {
+ if (!this.folding) {
+ this.folding = true;
+ this.sb.append(ANVLRecord.FOLD_PREFIX);
+ } else {
+ // Previous character was a CR. Fold prefix has been added.
+ }
+ } else if (this.folding && Character.isWhitespace(c)) {
+ // Only write out one whitespace character. Skip.
+ } else {
+ this.folding = false;
+ this.sb.append(c);
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/anvl/package.html b/src/main/java/org/archive/util/anvl/package.html
new file mode 100644
index 00000000..4a2a8963
--- /dev/null
+++ b/src/main/java/org/archive/util/anvl/package.html
@@ -0,0 +1,42 @@
+
+
+
+org.archive.util.anvl package
+
+
+Parsers and Writers for the (expired) Internet-Draft A Name-Value
+Language (ANVL). Use {@link org.archive.util.anvl.ANVLRecord}
+to create new instances of ANVL Records and for parsing.
+
+
Implementation Details
+
The ANVL Internet-Draft of 14 February, 2005 is not specific as to the
+definition of 'blank line' and 'newline'. This parser implementation
+assumes CRLF.
+
+
The Internet-Draft says "An element consists of a label, a colon, and an
+optional value". Should that be: "An element consists of a label and an
+optional value, or a comment"?
+
+
The specification is unclear regarding CR or NL in a label or
+comment (this implementation disallows CR or NL in labels but lets
+them pass in comments).
+
+
A grammar would help. Here is RFC822:
+
+ field = field-name ":" [ field-body ] CRLF
+
+ field-name = 1*<any CHAR, excluding CTLs, SPACE, and ":">
+
+ field-body = field-body-contents
+ [CRLF LWSP-char field-body]
+
+ field-body-contents =
+ <the ASCII characters making up the field-body, as
+ defined in the following sections, and consisting
+ of combinations of atom, quoted-string, and
+ specials tokens, or else consisting of texts>
+