org.archive.httpclient package

diff --git a/README.md b/README.md index ae865f7e..9bd2e12a 100644 --- a/README.md +++ b/README.md @@ -3,4 +3,6 @@ OpenWayback Web Commons [![Build Status](https://travis-ci.org/iipc/iipc-web-commons.png?branch=master)](https://travis-ci.org/iipc/iipc-web-commons/) -This repository contains common utility code for the OpenWayback project. +This repository contains common utility code for the [OpenWayback][1] project. + +[1]: https://github.com/iipc/openwayback diff --git a/pom-cdh4.xml b/pom-cdh4.xml new file mode 100644 index 00000000..de19d8d0 --- /dev/null +++ b/pom-cdh4.xml @@ -0,0 +1,229 @@ + + 4.0.0 + + org.archive + ia-web-commons + 1.0-SNAPSHOT + jar + + ia-web-commons + http://maven.apache.org + + + UTF-8 + ${maven.build.timestamp} + yyyyMMddhhmmss + + + + + junit + junit + 3.8.1 + test + + + + com.google.guava + guava + 14.0.1 + + + + org.json + json + 20090211 + + + org.htmlparser + htmlparser + 1.6 + + + + org.mozilla + juniversalchardet + 1.0.3 + + + + commons-httpclient + commons-httpclient + 3.1 + + + + org.apache.hadoop + hadoop-core + 2.0.0-mr1-cdh4.2.0 + + + commons-httpclient + commons-httpclient + + + javax.servlet + servlet-api + + + javax.servlet.jsp + jsp-api + + + org.mortbay.jetty + jetty + + + org.mortbay.jetty + jetty-util + + + tomcat + jasper-runtime + + + tomcat + jasper-compiler + + + + + org.apache.hadoop + hadoop-common + 2.0.0-cdh4.2.0 + + + org.apache.hadoop + hadoop-mapreduce-client-common + 2.0.0-cdh4.2.0 + + + org.apache.hadoop + hadoop-mapreduce-client-core + 2.0.0-cdh4.2.0 + + + + org.apache.pig + pig + 0.11.1 + provided + + + + commons-lang + commons-lang + 2.5 + + + + commons-io + commons-io + 2.4 + + + + org.gnu.inet + libidn + 1.15 + + + it.unimi.dsi + mg4j + 1.0.1 + compile + + + org.apache.httpcomponents + httpcore + 4.3 + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 2.3.2 + + 1.6 + 1.6 + + + + maven-assembly-plugin + 2.4 + + + jar-with-dependencies + + ia-web-commons + + + + package + + single + + + + 
+ + + + src/main/resources + true + + + + + + + internetarchive + Internet Archive Maven Repository + http://builds.archive.org:8080/maven2 + default + + + true + daily + warn + + + true + daily + warn + + + + + cloudera + Cloudera Hadoop + https://repository.cloudera.com/artifactory/cloudera-repos/ + default + + + true + daily + warn + + + true + daily + warn + + + + + + + + repository + + ${repository.url} + + + + diff --git a/pom.xml b/pom.xml index f285a382..a1d3de27 100644 --- a/pom.xml +++ b/pom.xml @@ -162,7 +162,23 @@ dsiutils 2.0.12 compile - + + + org.apache.httpcomponents + httpcore + 4.3 + + + joda-time + joda-time + 1.6 + + + fastutil + fastutil + 5.0.7 + compile + diff --git a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java new file mode 100644 index 00000000..0d564a6f --- /dev/null +++ b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java @@ -0,0 +1,150 @@ +package org.archive.extract; + +import java.io.IOException; +import java.io.PrintWriter; +import java.net.MalformedURLException; +import java.net.URISyntaxException; +import java.net.URL; +import java.util.List; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.archive.format.gzip.GZIPFormatException; +import org.archive.format.json.JSONUtils; +import org.archive.format.json.SimpleJSONPathSpec; +import org.archive.resource.MetaData; +import org.archive.resource.Resource; +import org.archive.util.IAUtils; +import org.archive.util.StreamCopy; +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; + +import com.google.common.io.CountingOutputStream; +import com.google.common.io.NullOutputStream; + +public class WARCMetadataRecordExtractorOutput implements ExtractorOutput { + private static final Logger LOG = + Logger.getLogger(WARCMetadataRecordExtractorOutput.class.getName()); + + 
private PrintWriter out; + SimpleJSONPathSpec formatSpec = new SimpleJSONPathSpec("Envelope.Format"); + SimpleJSONPathSpec warcURL = new SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.WARC-Target-URI"); + SimpleJSONPathSpec warcDate = new SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.WARC-Date"); + SimpleJSONPathSpec warcType = new SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.WARC-Type"); + SimpleJSONPathSpec warcMetadataRecord = new SimpleJSONPathSpec("Envelope.Payload-Metadata.WARC-Metadata-Metadata.Metadata-Records"); + + private String outputType = "outlinks"; + + public WARCMetadataRecordExtractorOutput(PrintWriter out, String outputType) { + this.out = out; + this.outputType = outputType; + } + + public WARCMetadataRecordExtractorOutput(PrintWriter out) { + this(out,"outlinks"); + } + + public void output(Resource resource) throws IOException { + NullOutputStream nullo = new NullOutputStream(); + CountingOutputStream co = new CountingOutputStream(nullo); + try { + StreamCopy.copy(resource.getInputStream(), co); + } catch(GZIPFormatException e) { + e.printStackTrace(); + return; + } + long bytes = co.getCount(); + if(bytes > 0) { + LOG.info(bytes + " unconsumed bytes in Resource InputStream."); + } + try { + MetaData m = resource.getMetaData().getTopMetaData(); + // URL DATE OURL MIME HTTP-CODE SHA1 META REDIR OFFSET LENGTH FILE + String format = getEnvelopeFormat(m); + String origUrl = "TBD"; + String date = "TBD"; + String canUrl = "TBD"; + + if(format.equals("WARC")) { + origUrl = getWARCURL(m); + date = getWARCDate(m); + String type = getWARCType(m); + if(type.equals("metadata")) { + String warcMetadataRecord = getWARCMetadataRecord(m); + + JSONArray array = new JSONArray(warcMetadataRecord); + String viaUrl = "-"; + String viaPath = "-"; + String sourceTag = "-"; + for(int i=0;i 2) + //'outlinks': 'origUrl date origOutlinkUrl linktype linktext' + out.format("%s\t%s\t%s\t%s\t\n",origUrl,date,linkParts[0],linkParts[2]); + } + } else 
if(outputType.equals("hopinfo")) { + String key = obj.get("Name").toString(); + String value = obj.get("Value").toString(); + if(key.equals("via")) { + viaUrl = value; + } else if (key.equals("hopsFromSeed")) { + viaPath = value; + } else if (key.equals("sourceTag")) { + sourceTag = value; + } + } + } + if(outputType.equals("hopinfo")) { + //'hopinfo': 'origCrawledUrl date origViaUrl hopPathFromVia sourceTag' + out.format("%s\t%s\t%s\t%s\t%s\n",origUrl,date,viaUrl,viaPath,sourceTag); + } + } + } + + } + catch (Exception e) { + throw new IOException(e); + } + out.flush(); + } + + private String getEnvelopeFormat(MetaData m) { + return unwrapFirst(formatSpec.extract(m),"-"); + } + private String getWARCURL(MetaData m) { + return unwrapFirst(warcURL.extract(m),"-"); + } + private String getWARCDate(MetaData m) { + return unwrapFirst(warcDate.extract(m),"-"); + } + private String getWARCType(MetaData m) { + return unwrapFirst(warcType.extract(m),"-"); + } + private String getWARCMetadataRecord(MetaData m) { + return unwrapFirst(warcMetadataRecord.extract(m),"-"); + } + + private String unwrapFirst(List> l, String defaultValue) { + if(l != null) { + if(l.size() > 0) { + if(l.get(0) != null) { + if(l.get(0).size() > 0) { + String v = l.get(0).get(0); + if(v != null) { + if(v.length() > 0) { + return v; + } + } + } + } + } + } + return defaultValue; + } +} diff --git a/src/main/java/org/archive/format/arc/ARCConstants.java b/src/main/java/org/archive/format/arc/ARCConstants.java index 6bfc5a99..a336ddeb 100755 --- a/src/main/java/org/archive/format/arc/ARCConstants.java +++ b/src/main/java/org/archive/format/arc/ARCConstants.java @@ -1,8 +1,20 @@ package org.archive.format.arc; import java.nio.charset.Charset; +import java.util.Arrays; +import java.util.List; +import java.util.zip.Deflater; +import java.util.zip.GZIPInputStream; -public interface ARCConstants { +import org.archive.format.ArchiveFileConstants; +import org.archive.util.zip.GzipHeader; + +/** + * Constants 
used by ARC files and in ARC file processing. + * + * @author stack + */ +public interface ARCConstants extends ArchiveFileConstants { public final static int MAX_META_LENGTH = 1024 * 32; public final static Charset ARC_META_CHARSET = Charset.forName("utf-8"); public final static int NEW_LINE_ORD = 10; @@ -25,4 +37,201 @@ public interface ARCConstants { public static final String FILEDESC_SCHEME = "filedesc:/"; public static final String DNS_MIME = "text/dns"; public static final String ALEXA_DAT_MIME = "alexa/dat"; + + /** + * Default maximum ARC file size. + */ + public static final long DEFAULT_MAX_ARC_FILE_SIZE = 100000000; + + /** + * Maximum length for a metadata line. + */ + public static final int MAX_METADATA_LINE_LENGTH = (4 * 1024); + + /** + * ARC file extention. + */ + public static final String ARC_FILE_EXTENSION = "arc"; + + /** + * Dot ARC file extension. + */ + public static final String DOT_ARC_FILE_EXTENSION = + "." + ARC_FILE_EXTENSION; + + public static final String DOT_COMPRESSED_FILE_EXTENSION = + ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION; + + /** + * Compressed arc file extension. + */ + public static final String COMPRESSED_ARC_FILE_EXTENSION = + ARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION; + + /** + * Compressed dot arc file extension. + */ + public static final String DOT_COMPRESSED_ARC_FILE_EXTENSION = + DOT_ARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION; + + /** + * Encoding to use getting bytes from strings. + * + * Specify an encoding rather than leave it to chance: i.e whatever the + * JVMs encoding. Use an encoding that gets the stream as bytes, not chars. + */ + public static final String DEFAULT_ENCODING = "ISO-8859-1"; + + /** + * ARC file line seperator character. + * + * This is what the alexa c-code looks for delimiting lines. + */ + public static final char LINE_SEPARATOR = '\n'; + + /** + * ARC header field seperator character. 
+ */ + public static final char HEADER_FIELD_SEPARATOR = ' '; + + /** + * ARC file *MAGIC NUMBER*. + * + * Every ARC file must begin w/ this. + */ + public static final String ARC_MAGIC_NUMBER = "filedesc://"; + + /** + * The FLG.FEXTRA field that is added to ARC files. (See RFC1952 to + * understand FLG.FEXTRA). + */ + public static final byte[] ARC_GZIP_EXTRA_FIELD = { 8, 0, 'L', 'X', 4, 0, + 0, 0, 0, 0 }; + + /** + * Key for the ARC Header IP field. + * + * Lowercased. + */ + public static final String IP_HEADER_FIELD_KEY = "ip-address"; + + /** + * Key for the ARC Header Result Code field. + * + * Lowercased. + */ + public static final String CODE_HEADER_FIELD_KEY = "result-code"; + + /** + * Key for the ARC Header Checksum field. + * + * Lowercased. + */ + public static final String CHECKSUM_HEADER_FIELD_KEY = "checksum"; + + /** + * Key for the ARC Header Location field. + * + * Lowercased. + */ + public static final String LOCATION_HEADER_FIELD_KEY = "location"; + + /** + * Key for the ARC Header Offset field. + * + * Lowercased. + */ + public static final String OFFSET_HEADER_FIELD_KEY = "offset"; + + /** + * Key for the ARC Header filename field. + * + * Lowercased. + */ + public static final String FILENAME_HEADER_FIELD_KEY = "filename"; + + /** + * Key for statuscode field. + */ + public static final String STATUSCODE_FIELD_KEY = "statuscode"; + + /** + * Key for offset field. + */ + public static final String OFFSET_FIELD_KEY = OFFSET_HEADER_FIELD_KEY; + + /** + * Key for filename field. + */ + public static final String FILENAME_FIELD_KEY = FILENAME_HEADER_FIELD_KEY; + + /** + * Key for checksum field. + */ + public static final String CHECKSUM_FIELD_KEY = CHECKSUM_HEADER_FIELD_KEY; + + /** + * Tokenized field prefix. + * + * Use this prefix for tokenized fields when naming fields in + * an index. + */ + public static final String TOKENIZED_PREFIX = "tokenized_"; + + /** + * Assumed maximum size of a record meta header line. 
+ * + * This 100k which seems massive but its the same as the LINE_LENGTH from + * alexa/include/a_arcio.h: + *

+     * #define LINE_LENGTH     (100*1024)
+     *

+ */ + public static final int MAX_HEADER_LINE_LENGTH = 1024 * 100; + + /** + * Version 1 required metadata fields. + */ + public static List REQUIRED_VERSION_1_HEADER_FIELDS = Arrays + .asList(new String[] { URL_FIELD_KEY, IP_HEADER_FIELD_KEY, + DATE_FIELD_KEY, MIMETYPE_FIELD_KEY, + LENGTH_FIELD_KEY, VERSION_FIELD_KEY, + ABSOLUTE_OFFSET_KEY }); + + /** + * Minimum possible record length. + * + * This is a rough calc. When the header is data it will occupy less space. + */ + public static int MINIMUM_RECORD_LENGTH = 1 + "://".length() + 1 + + ARC_FILE_EXTENSION.length() + " ".length() + +1 + " ".length() + + 1 + " ".length() + 1 + "/".length() + 1 + " ".length() + 1; + + /** + * Start of a GZIP header that uses default deflater. + */ + public static final byte[] GZIP_HEADER_BEGIN = { + (byte) GZIPInputStream.GZIP_MAGIC, // Magic number (short) + (byte) (GZIPInputStream.GZIP_MAGIC >> 8), // Magic number (short) + Deflater.DEFLATED // Compression method (CM) + }; + + /** + * Length of minimual 'default GZIP header. + * + * See RFC1952 for explaination of value of 10. 
+ */ + public static final int DEFAULT_GZIP_HEADER_LENGTH = + GzipHeader.MINIMAL_GZIP_HEADER_LENGTH; + + /** + * set of known errors encountered reading ARCs + */ + public enum ArcRecordErrors { + HTTP_HEADER_TRUNCATED, + HTTP_STATUS_LINE_INVALID, + HTTP_STATUS_LINE_EXCEPTION, + } + + } diff --git a/src/main/java/org/archive/format/cdx/CDXFile.java b/src/main/java/org/archive/format/cdx/CDXFile.java index 0c3a777a..7dca0464 100644 --- a/src/main/java/org/archive/format/cdx/CDXFile.java +++ b/src/main/java/org/archive/format/cdx/CDXFile.java @@ -97,4 +97,10 @@ public static BufferedReader createStreamingLineReader(String uri, boolean gzipp BufferedReader reader = new BufferedReader(new InputStreamReader(input)); return reader; } + + @Override + public long getTotalLines() { + //TODO: Implement + return 0; + } } diff --git a/src/main/java/org/archive/format/cdx/CDXInputSource.java b/src/main/java/org/archive/format/cdx/CDXInputSource.java index 0a926ebc..34abde53 100644 --- a/src/main/java/org/archive/format/cdx/CDXInputSource.java +++ b/src/main/java/org/archive/format/cdx/CDXInputSource.java @@ -9,4 +9,6 @@ public interface CDXInputSource { public CloseableIterator getCDXIterator(String key, String prefix, boolean exact, ZipNumParams params) throws IOException; public CloseableIterator getCDXIterator(String key, String start, String startEndUrl, ZipNumParams params) throws IOException; + + public long getTotalLines(); } diff --git a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java index 66367077..cbf70c0e 100644 --- a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java +++ b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java @@ -8,6 +8,7 @@ import org.archive.format.gzip.zipnum.ZipNumIndex; import org.archive.format.gzip.zipnum.ZipNumParams; +import org.archive.util.iterator.CloseableCompositeIterator; import org.archive.util.iterator.CloseableIterator; import 
org.archive.util.iterator.SortedCompositeIterator; @@ -40,18 +41,22 @@ public void setCdxUris(List cdxUris) throws IOException { } - Comparator comparator = new Comparator() { + public final static Comparator defaultComparator = new Comparator() { public int compare(String s1, String s2) { return s1.compareTo(s2); } }; - Comparator reverseComparator = new Comparator() { + public final static Comparator defaultReverseComparator = new Comparator() { public int compare(String s1, String s2) { return -s1.compareTo(s2); } }; + protected Comparator comparator = defaultComparator; + protected Comparator reverseComparator = defaultReverseComparator; + + public CloseableIterator getCDXIterator(String key, String prefix, boolean exact, ZipNumParams params) throws IOException { SortedCompositeIterator scitr = new SortedCompositeIterator(cdx.size(), params.isReverse() ? reverseComparator : comparator); @@ -70,9 +75,112 @@ public CloseableIterator getCDXIterator(String key, String prefix, boole return scitr; } + // A special iterator which initializes on actual first use + protected static class LazyInitIterator implements CloseableIterator + { + CDXInputSource source; + CloseableIterator iter; + boolean failed = false; + + String key, start, end; + ZipNumParams params; + + protected LazyInitIterator(CDXInputSource source, String key, String start, String end, ZipNumParams params) + { + this.key = key; + this.start = start; + this.end = end; + + this.params = params; + + this.source = source; + } + + protected void initIter() + { + if (iter != null) { + return; + } + + try { + iter = source.getCDXIterator(key, start, end, params); + } catch (IOException io) { + LOGGER.warning(io.toString()); + iter = null; + } + } + + @Override + public boolean hasNext() { + initIter(); + + if (iter == null) { + return false; + } + + return iter.hasNext(); + } + + @Override + public String next() { + initIter(); + + if (iter == null) { + return null; + } + + return iter.next(); + } + + 
@Override + public void remove() { + + } + + @Override + public void close() throws IOException { + if (iter != null) { + iter.close(); + } + } + } + + public CloseableIterator createSeqIterator(String key, String start, String end, ZipNumParams params) + { + CloseableCompositeIterator composite = new CloseableCompositeIterator(); + CloseableIterator iter = null; + + for (int i = 0; i < cdx.size(); i++) { + try { + CDXInputSource cdxReader = cdx.get(i); + + if (i == (cdx.size() - 1)) { + iter = cdxReader.getCDXIterator(key, start, end, params); + } else { + iter = new LazyInitIterator(cdxReader, key, start, end, params); + } + + if (!params.isReverse()) { + composite.addLast(iter); + } else { + composite.addFirst(iter); + } + + } catch (IOException io) { + LOGGER.warning(io.toString()); + } + } + + return composite; + } + public CloseableIterator getCDXIterator(String key, String start, String end, ZipNumParams params) throws IOException { + if (params.isSequential()) { + return this.createSeqIterator(key, start, end, params); + } + SortedCompositeIterator scitr = new SortedCompositeIterator(cdx.size(), params.isReverse() ? 
reverseComparator : comparator); CloseableIterator iter = null; @@ -88,4 +196,15 @@ public CloseableIterator getCDXIterator(String key, String start, String return scitr; } + + @Override + public long getTotalLines() { + long sum = 0; + + for (CDXInputSource cdxReader : cdx) { + sum += cdxReader.getTotalLines(); + } + + return sum; + } } diff --git a/src/main/java/org/archive/format/cdx/StandardCDXLineFactory.java b/src/main/java/org/archive/format/cdx/StandardCDXLineFactory.java index d2c299e5..33da41f1 100644 --- a/src/main/java/org/archive/format/cdx/StandardCDXLineFactory.java +++ b/src/main/java/org/archive/format/cdx/StandardCDXLineFactory.java @@ -39,6 +39,17 @@ public FieldSplitFormat getParseFormat() return parseFormat; } + public CDXLine createStandardCDXLine(String input) + { + if (parseFormat == cdx11) { + return new CDX11Line(input, parseFormat); + } else if (parseFormat == cdx09) { + return new CDX09Line(input, parseFormat); + } else { + return new CDXLine(input, parseFormat); + } + } + public CDXLine createStandardCDXLine(String input, FieldSplitFormat exFormat) { if (parseFormat == cdx11) { diff --git a/src/main/java/org/archive/format/gzip/zipnum/SummaryBlockIterator.java b/src/main/java/org/archive/format/gzip/zipnum/SummaryBlockIterator.java index 0046625c..cbf947f6 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/SummaryBlockIterator.java +++ b/src/main/java/org/archive/format/gzip/zipnum/SummaryBlockIterator.java @@ -120,7 +120,7 @@ public CloseableIterator getNextInner() { SeekableLineReader currReader = zipnumIndex.doBlockLoad(currPartId, startOffset, totalLength); if ((currReader == null) && zipnumIndex.isRequired()) { - throw new RuntimeIOException(); + throw new RuntimeIOException("Failed to load shards for: " + currPartId); } if (currReader != null) { diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java index a1682818..2247eda4 
100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java @@ -156,8 +156,20 @@ public SeekableLineReader attemptLoadBlock(String location, long startOffset, in } catch (IOException io) { Level level = (isRequired ? Level.SEVERE : Level.WARNING); + String actualLocation = null; + + if (currReader instanceof HTTPSeekableLineReader) { + actualLocation = ((HTTPSeekableLineReader)currReader).getConnectedUrl(); + } + + if (actualLocation == null) { + actualLocation = location; + } + + String msg = io.toString() + " -- -r " + startOffset + "-" + (startOffset + totalLength - 1) + " " + actualLocation; + if (LOGGER.isLoggable(level)) { - LOGGER.log(level, io.toString() + " -- -r " + startOffset + "-" + (startOffset + totalLength - 1) + " " + location + " req? " + isRequired); + LOGGER.log(level, msg); } if (currReader != null) { @@ -170,7 +182,7 @@ public SeekableLineReader attemptLoadBlock(String location, long startOffset, in } if (isRequired) { - throw new RuntimeIOException(io); + throw new RuntimeIOException(msg); } } diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java index 5e91c507..bc773a58 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java @@ -75,7 +75,7 @@ public void run() { Thread.sleep(checkInterval); if (summary != null) { - summary.reloadFactory(); + summary.reloadFactory(); } } @@ -122,7 +122,15 @@ class BlockSize protected boolean newIsDisabled = false; protected boolean disabled = false; - final static int DEFAULT_LOC_CACHE_EXPIRE_MILLIS = 5000; + //final static int DEFAULT_LOC_CACHE_EXPIRE_MILLIS = 120000; + + protected ConcurrentHashMap locCacheMap; + + protected boolean cacheRemoteLoc = false; + + protected int locCacheExpireMillis = 120000; + + protected int 
locCacheMaxDuration = 1000; class LocCacheEntry { @@ -151,14 +159,7 @@ public boolean equals(Object obj) return false; } - } - - protected ConcurrentHashMap locCacheMap; - - protected boolean cacheRemoteLoc = false; - - protected int locCacheExpireMillis = DEFAULT_LOC_CACHE_EXPIRE_MILLIS; - + } @Override public void init() throws IOException @@ -189,6 +190,7 @@ public void init() throws IOException startDate = newStartDate; endDate = newEndDate; locRoot = newLocRoot; + this.cdxLinesTotalCount = computeTotalLines(); if (!disabled) { this.loadLastBlockSizes(blockSizesFile); @@ -240,6 +242,12 @@ protected void syncLoad(long newModTime) endDate = newEndDate; disabled = newIsDisabled; locRoot = newLocRoot; + + this.cdxLinesTotalCount = computeTotalLines(); + } + + if (this.locCacheMap != null) { + locCacheMap.clear(); } closeExistingFiles(filesToClose); @@ -287,6 +295,14 @@ public void setLocCacheExpireMillis(int locCacheExpireMillis) { this.locCacheExpireMillis = locCacheExpireMillis; } + public int getLocCacheMaxDuration() { + return locCacheMaxDuration; + } + + public void setLocCacheMaxDuration(int locCacheMaxDuration) { + this.locCacheMaxDuration = locCacheMaxDuration; + } + public boolean isCacheRemoteLoc() { return cacheRemoteLoc; } @@ -471,8 +487,9 @@ public long getLastBlockDiff(String startKey, int startPart, int endPart) { return diff; } + // Adjust from shorter blocks, if loaded - public long getTotalLines() + public long computeTotalLines() { long numLines = 0; @@ -525,25 +542,19 @@ SeekableLineReader doBlockLoad(String partId, long startOffset, int totalLength) } // Attempt cached load for http - if (cacheRemoteLoc && (locCacheMap != null)) { - // Non-http requests follow standard load path - if ((locations.length > 0) && GeneralURIStreamFactory.isHttp(locations[0])) { - reader = loadCachedBalancedReader(partId, locations, startOffset, totalLength); - } - } - - if (reader != null) { - return reader; - } - - for (String location : locations) { - reader = 
blockLoader.attemptLoadBlock(location, startOffset, totalLength, true, isRequired()); - if (reader != null) { - return reader; + if (cacheRemoteLoc && (locCacheMap != null) && (locations.length > 0) && GeneralURIStreamFactory.isHttp(locations[0])) { + reader = loadCachedBalancedReader(partId, locations, startOffset, totalLength); + } else { + // Standard block load path + for (String location : locations) { + reader = blockLoader.attemptLoadBlock(location, startOffset, totalLength, true, isRequired()); + if (reader != null) { + return reader; + } } } - return null; + return reader; } protected String locCacheGet(String key) @@ -574,12 +585,18 @@ SeekableLineReader loadCachedBalancedReader(String partId, String[] locations, l String cachedUrl = locCacheGet(partId); if (cachedUrl != null) { - reader = blockLoader.attemptLoadBlock(cachedUrl, offset, length, true, isRequired()); + long start = System.currentTimeMillis(); + + reader = blockLoader.attemptLoadBlock(cachedUrl, offset, length, true, false); + long duration = System.currentTimeMillis() - start; + + if ((reader == null) || (duration > locCacheMaxDuration)) { + locCacheMap.remove(partId, cachedUrl); + } + if (reader != null) { return reader; - } else { - locCacheMap.remove(partId, cachedUrl); } } @@ -592,13 +609,29 @@ SeekableLineReader loadCachedBalancedReader(String partId, String[] locations, l Collections.shuffle(indexs); } - for (int index : indexs) { - reader = blockLoader.attemptLoadBlock(locations[index], offset, length, true, isRequired()); + final int lastIndex = locations.length - 1; + + for (int i = 0; i < indexs.size(); i++) { + + int index = indexs.get(i); + + // Skip failed cached url + if ((cachedUrl != null) && locations[index].equals(cachedUrl)) { + continue; + } + + long start = System.currentTimeMillis(); + + boolean required = (isRequired() && (i == lastIndex)); + + reader = blockLoader.attemptLoadBlock(locations[index], offset, length, true, required); + + long duration = 
System.currentTimeMillis() - start; if (reader != null) { String connectedUrl = ((HTTPSeekableLineReader)reader).getConnectedUrl(); - if (connectedUrl != null) { + if ((duration < locCacheMaxDuration) && (connectedUrl != null)) { locCachePut(partId, connectedUrl); } diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumIndex.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumIndex.java index 7860be36..ad8c9297 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumIndex.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumIndex.java @@ -29,6 +29,9 @@ public class ZipNumIndex implements CDXInputSource { // Used only for reference / user info protected int cdxLinesPerBlock = 3000; + + protected long cdxLinesTotalCount = 0; + //protected HashMap locMap = null; protected final static boolean DEFAULT_USE_NIO = true; @@ -528,4 +531,9 @@ public boolean isRequired() { public void setRequired(boolean required) { this.required = required; } + + @Override + public long getTotalLines() { + return cdxLinesTotalCount; + } } diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumParams.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumParams.java index 15e22e1d..668743ae 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumParams.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumParams.java @@ -6,6 +6,7 @@ public class ZipNumParams protected int timestampDedupLength = 0; protected int maxBlocks = 0; private boolean reverse = false; + private boolean sequential = false; public ZipNumParams() { @@ -56,4 +57,12 @@ public boolean isReverse() { public void setReverse(boolean reverse) { this.reverse = reverse; } + + public boolean isSequential() { + return sequential; + } + + public void setSequential(boolean sequential) { + this.sequential = sequential; + } } \ No newline at end of file diff --git a/src/main/java/org/archive/httpclient/ConfigurableX509TrustManager.java 
b/src/main/java/org/archive/httpclient/ConfigurableX509TrustManager.java new file mode 100644 index 00000000..45a89ba6 --- /dev/null +++ b/src/main/java/org/archive/httpclient/ConfigurableX509TrustManager.java @@ -0,0 +1,188 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.httpclient; + +import java.security.KeyStore; +import java.security.KeyStoreException; +import java.security.NoSuchAlgorithmException; +import java.security.cert.CertificateException; +import java.security.cert.X509Certificate; +import java.util.logging.Logger; + +import javax.net.ssl.TrustManager; +import javax.net.ssl.TrustManagerFactory; +import javax.net.ssl.X509TrustManager; + +/** + * A configurable trust manager built on X509TrustManager. + * + * If set to 'open' trust, the default, will get us into sites for whom we do + * not have the CA or any of intermediary CAs that go to make up the cert chain + * of trust. Will also get us past selfsigned and expired certs. 'loose' + * trust will get us into sites w/ valid certs even if they are just + * selfsigned. 'normal' is any valid cert not including selfsigned. 'strict' + * means cert must be valid and the cert DN must match server name. + * + *

Based on pointers in + * SSL + * Guide, + * and readings done in JSSE + * Guide. + * + *

TODO: Move to an ssl subpackage when we have other classes other than + * just this one. + * + * @author stack + * @version $Id$ + */ +public class ConfigurableX509TrustManager implements X509TrustManager +{ + /** + * Logging instance. + */ + protected static Logger logger = Logger.getLogger( + "org.archive.httpclient.ConfigurableX509TrustManager"); + + public static enum TrustLevel { + /** + * Trust anything given us. + * + * Default setting. + * + *

See + * e502. Disabling Certificate Validation in an HTTPS Connection from + * the java almanac for how to trust all. + */ + OPEN, + + /** + * Trust any valid cert including self-signed certificates. + */ + LOOSE, + + /** + * Normal jsse behavior. + * + * Seemingly any certificate that supplies valid chain of trust. + */ + NORMAL, + + /** + * Strict trust. + * + * Ensure server has same name as cert DN. + */ + STRICT, + } + + /** + * Default setting for trust level. + */ + public final static TrustLevel DEFAULT = TrustLevel.OPEN; + + /** + * Trust level. + */ + private TrustLevel trustLevel = DEFAULT; + + + /** + * An instance of the SUNX509TrustManager that we adapt variously + * depending upon passed configuration. + * + * We have it do all the work we don't want to. + */ + private X509TrustManager standardTrustManager = null; + + + public ConfigurableX509TrustManager() + throws NoSuchAlgorithmException, KeyStoreException { + this(DEFAULT); + } + + /** + * Constructor. + * + * @param level Level of trust to effect. + * + * @throws NoSuchAlgorithmException + * @throws KeyStoreException + */ + public ConfigurableX509TrustManager(TrustLevel level) + throws NoSuchAlgorithmException, KeyStoreException { + super(); + TrustManagerFactory factory = TrustManagerFactory. + getInstance(TrustManagerFactory.getDefaultAlgorithm()); + + // Pass in a null (Trust) KeyStore. Null says use the 'default' + // 'trust' keystore (KeyStore class is used to hold keys and to hold + // 'trusts' (certs)). See 'X509TrustManager Interface' in this doc: + // http://java.sun.com + // /j2se/1.4.2/docs/guide/security/jsse/JSSERefGuide.html#Introduction + factory.init((KeyStore)null); + TrustManager[] trustmanagers = factory.getTrustManagers(); + if (trustmanagers.length == 0) { + throw new NoSuchAlgorithmException(TrustManagerFactory. 
+ getDefaultAlgorithm() + " trust manager not supported"); + } + this.standardTrustManager = (X509TrustManager)trustmanagers[0]; + + this.trustLevel = level; + } + + public void checkClientTrusted(X509Certificate[] certificates, String type) + throws CertificateException { + if (this.trustLevel.equals(TrustLevel.OPEN)) { + return; + } + + this.standardTrustManager.checkClientTrusted(certificates, type); + } + + public void checkServerTrusted(X509Certificate[] certificates, String type) + throws CertificateException { + if (this.trustLevel.equals(TrustLevel.OPEN)) { + return; + } + + try { + this.standardTrustManager.checkServerTrusted(certificates, type); + if (this.trustLevel.equals(TrustLevel.STRICT)) { + logger.severe(TrustLevel.STRICT + " not implemented."); + } + } catch (CertificateException e) { + if (this.trustLevel.equals(TrustLevel.LOOSE) && + certificates != null && certificates.length == 1) + { + // If only one cert and its valid and it caused a + // CertificateException, assume its selfsigned. + X509Certificate certificate = certificates[0]; + certificate.checkValidity(); + } else { + // If we got to here, then we're probably NORMAL. Rethrow. + throw e; + } + } + } + + public X509Certificate[] getAcceptedIssuers() { + return this.standardTrustManager.getAcceptedIssuers(); + } +} diff --git a/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java new file mode 100644 index 00000000..105c4f7e --- /dev/null +++ b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java @@ -0,0 +1,120 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.httpclient; + +import java.io.IOException; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.HttpConnection; +import org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.HttpState; +import org.apache.commons.httpclient.methods.GetMethod; +import org.archive.util.Recorder; + + +/** + * Override of GetMethod that marks the passed HttpRecorder w/ the transition + * from HTTP head to body and that forces a close on the http connection. + * + * The actions done in this subclass used to be done by copying + * org.apache.commons.HttpMethodBase, overlaying our version in place of the + * one that came w/ httpclient. Here is the patch of the difference between + * shipped httpclient code and our mods: + *

+ *    -- -1338,6 +1346,12 --
+ *
+ *        public void releaseConnection() {
+ *
+ *   +        // HERITRIX always wants the streams closed.
+ *   +        if (responseConnection != null)
+ *   +        {
+ *   +            responseConnection.close();
+ *   +        }
+ *   +
+ *            if (responseStream != null) {
+ *                try {
+ *                    // FYI - this may indirectly invoke responseBodyConsumed.
+ *   -- -1959,6 +1973,11 --
+ *                        this.statusLine = null;
+ *                    }
+ *                }
+ *   +            // HERITRIX mark transition from header to content.
+ *   +            if (this.httpRecorder != null)
+ *   +            {
+ *   +                this.httpRecorder.markContentBegin();
+ *   +            }
+ *                readResponseBody(state, conn);
+ *                processResponseBody(state, conn);
+ *            } catch (IOException e) {
+ *

+ * + *

We're not supposed to have access to the underlying connection object; + * am only violating contract because see cases where httpclient is skipping + * out w/o cleaning up after itself. + * + * @author stack + * @version $Revision$, $Date$ + */ +public class HttpRecorderGetMethod extends GetMethod { + + protected static Logger logger = + Logger.getLogger(HttpRecorderGetMethod.class.getName()); + + /** + * Instance of http recorder method. + */ + protected HttpRecorderMethod httpRecorderMethod = null; + + + public HttpRecorderGetMethod(String uri, Recorder recorder) { + super(uri); + this.httpRecorderMethod = new HttpRecorderMethod(recorder); + } + + protected void readResponseBody(HttpState state, HttpConnection connection) + throws IOException, HttpException { + // We're about to read the body. Mark transition in http recorder. + this.httpRecorderMethod.markContentBegin(connection); + super.readResponseBody(state, connection); + } + + protected boolean shouldCloseConnection(HttpConnection conn) { + // Always close connection after each request. As best I can tell, this + // is superfluous -- we've set our client to be HTTP/1.0. Doing this + // out of paranoia. + return true; + } + + public int execute(HttpState state, HttpConnection conn) + throws HttpException, IOException { + // Save off the connection so we can close it on our way out in case + // httpclient fails to (We're not supposed to have access to the + // underlying connection object; am only violating contract because + // see cases where httpclient is skipping out w/o cleaning up + // after itself). 
+ this.httpRecorderMethod.setConnection(conn); + return super.execute(state, conn); + } + + protected void addProxyConnectionHeader(HttpState state, HttpConnection conn) + throws IOException, HttpException { + super.addProxyConnectionHeader(state, conn); + this.httpRecorderMethod.handleAddProxyConnectionHeader(this); + } +} diff --git a/src/main/java/org/archive/httpclient/HttpRecorderMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java new file mode 100644 index 00000000..932e7e98 --- /dev/null +++ b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java @@ -0,0 +1,107 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.httpclient; + +import java.util.logging.Logger; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpConnection; +import org.apache.commons.httpclient.HttpMethod; +import org.archive.util.Recorder; + + +/** + * This class encapsulates the specializations supplied by the + * overrides {@link HttpRecorderGetMethod} and {@link HttpRecorderPostMethod}. + * + * It keeps instance of HttpRecorder and HttpConnection. 
+ * + * @author stack + * @version $Revision$, $Date$ + */ +public class HttpRecorderMethod { + protected static Logger logger = + Logger.getLogger(HttpRecorderMethod.class.getName()); + + /** + * Instance of http recorder we're using recording this http get. + */ + private Recorder httpRecorder = null; + + /** + * Save around so can force close. + * + * See [ 922080 ] IllegalArgumentException (size is wrong). + * https://sourceforge.net/tracker/?func=detail&aid=922080&group_id=73833&atid=539099 + */ + private HttpConnection connection = null; + + + public HttpRecorderMethod(Recorder recorder) { + this.httpRecorder = recorder; + } + + public void markContentBegin(HttpConnection c) { + if (c != this.connection) { + // We're checking that we're not being asked to work on + // a connection that is other than the one we started + // this method#execute with. + throw new IllegalArgumentException("Connections differ: " + + this.connection + " " + c + " " + + Thread.currentThread().getName()); + } + this.httpRecorder.markContentBegin(); + } + + /** + * @return Returns the connection. + */ + public HttpConnection getConnection() { + return this.connection; + } + + /** + * @param connection The connection to set. + */ + public void setConnection(HttpConnection connection) { + this.connection = connection; + } + /** + * @return Returns the httpRecorder. + */ + public Recorder getHttpRecorder() { + return httpRecorder; + } + + /** + * If a 'Proxy-Connection' header has been added to the request, + * it'll be of a 'keep-alive' type. Until we support 'keep-alives', + * override the Proxy-Connection setting and instead pass a 'close' + * (Otherwise every request has to timeout before we notice + * end-of-document). + * @param method Method to find proxy-connection header in. 
+ */ + public void handleAddProxyConnectionHeader(HttpMethod method) { + Header h = method.getRequestHeader("Proxy-Connection"); + if (h != null) { + h.setValue("close"); + method.setRequestHeader(h); + } + } +} diff --git a/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java new file mode 100644 index 00000000..20f1bfd1 --- /dev/null +++ b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java @@ -0,0 +1,82 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.httpclient; + +import java.io.IOException; + +import org.apache.commons.httpclient.HttpConnection; +import org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.HttpState; +import org.apache.commons.httpclient.methods.PostMethod; +import org.archive.util.Recorder; + + +/** + * Override of PostMethod that marks the passed HttpRecorder w/ the transition + * from HTTP head to body and that forces a close on the responseConnection. + * + * This is a copy of {@link HttpRecorderGetMethod}. Only difference is the + * parent subclass. 
+ * + * @author stack + * @version $Date$ $Revision$ + */ +public class HttpRecorderPostMethod extends PostMethod { + /** + * Instance of http recorder method. + */ + protected HttpRecorderMethod httpRecorderMethod = null; + + + public HttpRecorderPostMethod(String uri, Recorder recorder) { + super(uri); + this.httpRecorderMethod = new HttpRecorderMethod(recorder); + } + + protected void readResponseBody(HttpState state, HttpConnection connection) + throws IOException, HttpException { + // We're about to read the body. Mark transition in http recorder. + this.httpRecorderMethod.markContentBegin(connection); + super.readResponseBody(state, connection); + } + + protected boolean shouldCloseConnection(HttpConnection conn) { + // Always close connection after each request. As best I can tell, this + // is superfluous -- we've set our client to be HTTP/1.0. Doing this + // out of paranoia. + return true; + } + + public int execute(HttpState state, HttpConnection conn) + throws HttpException, IOException { + // Save off the connection so we can close it on our way out in case + // httpclient fails to (We're not supposed to have access to the + // underlying connection object; am only violating contract because + // see cases where httpclient is skipping out w/o cleaning up + // after itself). 
+ this.httpRecorderMethod.setConnection(conn); + return super.execute(state, conn); + } + + protected void addProxyConnectionHeader(HttpState state, HttpConnection conn) + throws IOException, HttpException { + super.addProxyConnectionHeader(state, conn); + this.httpRecorderMethod.handleAddProxyConnectionHeader(this); + } +} diff --git a/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java new file mode 100644 index 00000000..4ba6a837 --- /dev/null +++ b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java @@ -0,0 +1,70 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.httpclient; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.commons.httpclient.HostConfiguration; +import org.apache.commons.httpclient.HttpConnection; +import org.apache.commons.httpclient.SimpleHttpConnectionManager; + +/** + * An HttpClient-compatible HttpConnection "manager" that actually + * just gives out a new connection each time -- skipping the overhead + * of connection management, since we already throttle our crawler + * with external mechanisms. 
+ * + * @author gojomo + */ +public class SingleHttpConnectionManager extends SimpleHttpConnectionManager { + + public SingleHttpConnectionManager() { + super(); + } + + public HttpConnection getConnectionWithTimeout( + HostConfiguration hostConfiguration, long timeout) { + + HttpConnection conn = new HttpConnection(hostConfiguration); + conn.setHttpConnectionManager(this); + conn.getParams().setDefaults(this.getParams()); + return conn; + } + + public void releaseConnection(HttpConnection conn) { + // ensure connection is closed + conn.close(); + finishLast(conn); + } + + protected static void finishLast(HttpConnection conn) { + // copied from superclass because it wasn't made available to subclasses + InputStream lastResponse = conn.getLastResponseInputStream(); + if (lastResponse != null) { + conn.setLastResponseInputStream(null); + try { + lastResponse.close(); + } catch (IOException ioe) { + //FIXME: badness - close to force reconnect. + conn.close(); + } + } + } +} diff --git a/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java new file mode 100644 index 00000000..91e850ea --- /dev/null +++ b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java @@ -0,0 +1,291 @@ +/** + * ==================================================================== + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ==================================================================== + * + */ +package org.archive.httpclient; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.HostConfiguration; +import org.apache.commons.httpclient.HttpConnection; +import org.apache.commons.httpclient.HttpConnectionManager; +import org.apache.commons.httpclient.params.HttpConnectionManagerParams; + +/** + * A simple, but thread-safe HttpClient {@link HttpConnectionManager}. + * Based on {@link org.apache.commons.httpclient.SimpleHttpConnectionManager}. + * + * Java >= 1.4 is recommended. + * + * @author Christian Kohlschuetter + */ +public final class ThreadLocalHttpConnectionManager implements + HttpConnectionManager { + + private static final CloserThread closer = new CloserThread(); + private static final Logger logger = Logger + .getLogger(ThreadLocalHttpConnectionManager.class.getName()); + + private final ThreadLocal tl = new ThreadLocal() { + protected synchronized ConnectionInfo initialValue() { + return new ConnectionInfo(); + } + }; + + private ConnectionInfo getConnectionInfo() { + return (ConnectionInfo) tl.get(); + } + + private static final class ConnectionInfo { + /** The http connection */ + private HttpConnection conn = null; + + /** + * The time the connection was made idle. + */ + private long idleStartTime = Long.MAX_VALUE; + } + + public ThreadLocalHttpConnectionManager() { + } + + /** + * Since the same connection is about to be reused, make sure the + * previous request was completely processed, and if not + * consume it now. 
+ * @param conn The connection + * @return true, if the connection is reusable + */ + private static boolean finishLastResponse(final HttpConnection conn) { + InputStream lastResponse = conn.getLastResponseInputStream(); + if(lastResponse != null) { + conn.setLastResponseInputStream(null); + try { + lastResponse.close(); + return true; + } catch (IOException ioe) { + // force reconnect. + return false; + } + } else { + return false; + } + } + + /** + * Collection of parameters associated with this connection manager. + */ + private HttpConnectionManagerParams params = new HttpConnectionManagerParams(); + + /** + * @see HttpConnectionManager#getConnection(HostConfiguration) + */ + public HttpConnection getConnection( + final HostConfiguration hostConfiguration) { + return getConnection(hostConfiguration, 0); + } + + /** + * Gets the staleCheckingEnabled value to be set on HttpConnections that are created. + * + * @return true if stale checking will be enabled on HttpConections + * + * @see HttpConnection#isStaleCheckingEnabled() + * + * @deprecated Use {@link HttpConnectionManagerParams#isStaleCheckingEnabled()}, + * {@link HttpConnectionManager#getParams()}. + */ + public boolean isConnectionStaleCheckingEnabled() { + return this.params.isStaleCheckingEnabled(); + } + + /** + * Sets the staleCheckingEnabled value to be set on HttpConnections that are created. + * + * @param connectionStaleCheckingEnabled true if stale checking will be enabled + * on HttpConections + * + * @see HttpConnection#setStaleCheckingEnabled(boolean) + * + * @deprecated Use {@link HttpConnectionManagerParams#setStaleCheckingEnabled(boolean)}, + * {@link HttpConnectionManager#getParams()}. 
+ */ + public void setConnectionStaleCheckingEnabled( + final boolean connectionStaleCheckingEnabled) { + this.params.setStaleCheckingEnabled(connectionStaleCheckingEnabled); + } + + /** + * @see HttpConnectionManager#getConnectionWithTimeout(HostConfiguration, long) + * + * @since 3.0 + */ + public HttpConnection getConnectionWithTimeout( + final HostConfiguration hostConfiguration, final long timeout) { + + final ConnectionInfo ci = getConnectionInfo(); + HttpConnection httpConnection = ci.conn; + + // make sure the host and proxy are correct for this connection + // close it and set the values if they are not + if(httpConnection == null || !finishLastResponse(httpConnection) + || !hostConfiguration.hostEquals(httpConnection) + || !hostConfiguration.proxyEquals(httpConnection)) { + + if(httpConnection != null && httpConnection.isOpen()) { + closer.closeConnection(httpConnection); + } + + httpConnection = new HttpConnection(hostConfiguration); + httpConnection.setHttpConnectionManager(this); + httpConnection.getParams().setDefaults(this.params); + ci.conn = httpConnection; + + httpConnection.setHost(hostConfiguration.getHost()); + httpConnection.setPort(hostConfiguration.getPort()); + httpConnection.setProtocol(hostConfiguration.getProtocol()); + httpConnection.setLocalAddress(hostConfiguration.getLocalAddress()); + + httpConnection.setProxyHost(hostConfiguration.getProxyHost()); + httpConnection.setProxyPort(hostConfiguration.getProxyPort()); + } + + // remove the connection from the timeout handler + ci.idleStartTime = Long.MAX_VALUE; + + return httpConnection; + } + + /** + * @see HttpConnectionManager#getConnection(HostConfiguration, long) + * + * @deprecated Use #getConnectionWithTimeout(HostConfiguration, long) + */ + public HttpConnection getConnection( + final HostConfiguration hostConfiguration, final long timeout) { + return getConnectionWithTimeout(hostConfiguration, timeout); + } + + /** + * @see 
HttpConnectionManager#releaseConnection(org.apache.commons.httpclient.HttpConnection) + */ + public void releaseConnection(final HttpConnection conn) { + final ConnectionInfo ci = getConnectionInfo(); + HttpConnection httpConnection = ci.conn; + + if(conn != httpConnection) { + throw new IllegalStateException( + "Unexpected release of an unknown connection."); + } + + finishLastResponse(httpConnection); + + // track the time the connection was made idle + ci.idleStartTime = System.currentTimeMillis(); + } + + /** + * Returns {@link HttpConnectionManagerParams parameters} associated + * with this connection manager. + * + * @since 2.1 + * + * @see HttpConnectionManagerParams + */ + public HttpConnectionManagerParams getParams() { + return this.params; + } + + /** + * Assigns {@link HttpConnectionManagerParams parameters} for this + * connection manager. + * + * @since 2.1 + * + * @see HttpConnectionManagerParams + */ + public void setParams(final HttpConnectionManagerParams p) { + if(p == null) { + throw new IllegalArgumentException("Parameters may not be null"); + } + this.params = p; + } + + /** + * @since 3.0 + */ + public void closeIdleConnections(final long idleTimeout) { + long maxIdleTime = System.currentTimeMillis() - idleTimeout; + + final ConnectionInfo ci = getConnectionInfo(); + + if(ci.idleStartTime <= maxIdleTime) { + ci.conn.close(); + } + } + + private static final class CloserThread extends Thread { + private List connections + = new ArrayList(); + + private static final int SLEEP_INTERVAL = 5000; + + public CloserThread() { + super("HttpConnection closer"); + // Make this a daemon thread so it can't be responsible for the JVM + // not shutting down. 
+ setDaemon(true); + start(); + } + + public void closeConnection(final HttpConnection conn) { + synchronized (connections) { + connections.add(conn); + } + } + + public void run() { + try { + while (!Thread.interrupted()) { + Thread.sleep(SLEEP_INTERVAL); + + List s; + synchronized (connections) { + s = connections; + connections = new ArrayList(); + } + logger.log(Level.INFO, "Closing " + s.size() + + " HttpConnections"); + for(final Iterator it = s.iterator(); + it.hasNext();) { + HttpConnection conn = it.next(); + conn.close(); + conn.setHttpConnectionManager(null); + it.remove(); + } + } + } catch (InterruptedException e) { + return; + } + } + } +} diff --git a/src/main/java/org/archive/httpclient/package.html b/src/main/java/org/archive/httpclient/package.html new file mode 100644 index 00000000..87ae77ed --- /dev/null +++ b/src/main/java/org/archive/httpclient/package.html @@ -0,0 +1,24 @@ + + + +org.archive.httpclient package + +Provides specializations on + apache jakarta + commons httpclient. + +

HttpRecorderGetMethod

Class that marks the passed HttpRecorder w/ the boundary between + HTTP header and content. Also forces a close on the response on + call to releaseConnection.

+ +

ConfigurableTrustManagerProtocolSocketFactory

A protocol socket factory that allows setting of trust level on + construction.

+ +

References

JavaTM Secure Socket Extension (JSSE): Reference Guide

+ + + diff --git a/src/main/java/org/archive/io/ArchiveFileConstants.java b/src/main/java/org/archive/io/ArchiveFileConstants.java new file mode 100644 index 00000000..b1a39194 --- /dev/null +++ b/src/main/java/org/archive/io/ArchiveFileConstants.java @@ -0,0 +1,24 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + /** Deprecated, empty extension of {@link org.archive.format.ArchiveFileConstants}; it declares no members of its own. NOTE(review): presumably retained so that code written against the org.archive.io name keeps compiling - confirm before removing. */ +@Deprecated +public interface ArchiveFileConstants extends org.archive.format.ArchiveFileConstants { +} diff --git a/src/main/java/org/archive/io/ArchiveReader.java b/src/main/java/org/archive/io/ArchiveReader.java new file mode 100644 index 00000000..66056d33 --- /dev/null +++ b/src/main/java/org/archive/io/ArchiveReader.java @@ -0,0 +1,761 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + + +import java.io.BufferedInputStream; +import java.io.BufferedWriter; +import java.io.Closeable; +import java.io.EOFException; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.archive.util.MimetypeUtils; +import org.archive.util.zip.GZIPMembersInputStream; + +import com.google.common.io.CountingInputStream; + + +/** + * Reader for an Archive file of Archive {@link ArchiveRecord}s. + * @author stack + * @version $Date$ $Version$ + */ +public abstract class ArchiveReader implements ArchiveFileConstants, Iterable, Closeable { + /** + * Is this Archive file compressed? + */ + private boolean compressed = false; + + /** + * Should we digest as we read? + */ + private boolean digest = true; + + /** + * Should the parse be strict? + */ + private boolean strict = false; + + /** + * Archive file input stream. + * + * Keep it around so we can close it when done. + * + *

Set in constructor. Should support at least 1 byte mark/reset. + * Make it protected so subclasses have access. + */ + protected InputStream in = null; + + /** + * Maximum amount of recoverable exceptions in a row. + * If more than this amount in a row, we'll let out the exception rather + * than go back in for yet another retry. + */ + public static final int MAX_ALLOWED_RECOVERABLES = 10; + + + /** + * The Record currently being read. + * + * Keep this ongoing reference so we'll close the record even if the caller + * doesn't. + */ + private ArchiveRecord currentRecord = null; + + /** + * Descriptive string for the Archive file we're going against: + * full path, url, etc. -- depends on context in which file was made. + */ + private String identifier = null; + + /** + * Archive file version. + */ + private String version = null; + + + protected ArchiveReader() { + super(); + } + + /** + * Convenience method used by subclass constructors. + * @param i Identifier for Archive file this reader goes against. + */ + protected void initialize(final String i) { + setReaderIdentifier(i); + } + + /** + * Convenience method for constructors. + * + * @param f File to read. + * @param offset Offset at which to start reading. + * @return InputStream to read from. + * @throws IOException If failed open or fail to get a memory + * mapped byte buffer on file. + */ + protected InputStream getInputStream(final File f, final long offset) + throws IOException { + FileInputStream fin = new FileInputStream(f); + return new BufferedInputStream(fin); + } + + public boolean isCompressed() { + return this.compressed; + } + + /** + * Get record at passed offset. + * + * @param offset Byte index into file at which a record starts. + * @return An Archive Record reference. 
+ * @throws IOException + */ + public ArchiveRecord get(long offset) throws IOException { + cleanupCurrentRecord(); + long posn = positionForRecord(in); + if(offset>=posn) { + in.skip(offset-posn); + } else { + throw new UnsupportedOperationException("no reverse seeking: at "+posn+" requested "+offset); + } + return createArchiveRecord(this.in, offset); + } + + /** + * @return Return Archive Record created against current offset. + * @throws IOException + */ + public ArchiveRecord get() throws IOException { + return createArchiveRecord(this.in, positionForRecord(in)); + } + + public void close() throws IOException { + if (this.in != null) { + this.in.close(); + this.in = null; + } + } + + /** + * Cleanout the current record if there is one. + * @throws IOException + */ + protected void cleanupCurrentRecord() throws IOException { + if (this.currentRecord != null) { + this.currentRecord.close(); + gotoEOR(this.currentRecord); + this.currentRecord = null; + } + } + + /** + * Return an Archive Record homed on offset into + * is. + * @param is Stream to read Record from. + * @param offset Offset to find Record at. + * @return ArchiveRecord instance. + * @throws IOException + */ + protected abstract ArchiveRecord createArchiveRecord(InputStream is, + long offset) + throws IOException; + + /** + * Skip over any trailing new lines at end of the record so we're lined up + * ready to read the next. + * @param record + * @throws IOException + */ + protected abstract void gotoEOR(ArchiveRecord record) throws IOException; + + public abstract String getFileExtension(); + public abstract String getDotFileExtension(); + + /** + * @return Version of this Archive file. + */ + public String getVersion() { + return this.version; + } + + /** + * Validate the Archive file. + * + * This method iterates over the file throwing exception if it fails + * to successfully parse any record. + * + *

Assumes the stream is at the start of the file. + * @return List of all read Archive Headers. + * + * @throws IOException + */ + public List validate() throws IOException { + return validate(-1); + } + + /** + * Validate the Archive file. + * + * This method iterates over the file throwing exception if it fails + * to successfully parse. + * + *

We start validation from wherever we are in the stream. + * + * @param numRecords Number of records expected. Pass -1 if number is + * unknown. + * + * @return List of all read metadatas. As we validate records, we add + * a reference to the read metadata. + * + * @throws IOException + */ + public List validate(int numRecords) + throws IOException { + List hdrList = new ArrayList(); + int recordCount = 0; + setStrict(true); + for (Iterator i = iterator(); i.hasNext();) { + recordCount++; + ArchiveRecord r = i.next(); + if (r.getHeader().getLength() <= 0 + && r.getHeader().getMimetype(). + equals(MimetypeUtils.NO_TYPE_MIMETYPE)) { + throw new IOException("record content is empty."); + } + r.close(); + hdrList.add(r.getHeader()); + } + + if (numRecords != -1) { + if (recordCount != numRecords) { + throw new IOException("Count of records, " + + Integer.toString(recordCount) + + " is not equal to expected " + + Integer.toString(numRecords)); + } + } + + return hdrList; + } + + /** + * Test Archive file is valid. + * Assumes the stream is at the start of the file. Be aware that this + * method makes a pass over the whole file. + * @return True if file can be successfully parsed. + */ + public boolean isValid() { + boolean valid = false; + try { + validate(); + valid = true; + } catch(Exception e) { + // File is not valid if exception thrown parsing. + valid = false; + } + + return valid; + } + + /** + * @return Returns the strict. + */ + public boolean isStrict() { + return this.strict; + } + + /** + * @param s The strict to set. + */ + public void setStrict(boolean s) { + this.strict = s; + } + + /** + * @param d True if we're to digest. + */ + public void setDigest(boolean d) { + this.digest = d; + } + + /** + * @return True if we're digesting as we read. + */ + public boolean isDigest() { + return this.digest; + } + + protected Logger getLogger() { + return Logger.getLogger(this.getClass().getName()); + } + + /** + * Returns an ArchiveRecord iterator. 
+ * Of note, on IOException, especially if ZipException reading compressed + * ARCs, rather than fail the iteration, try moving to the next record. + * If {@link ArchiveReader#strict} is not set, this will usually succeed. + * @return An iterator over ARC records. + */ + public Iterator iterator() { + // Eat up any record outstanding. + try { + cleanupCurrentRecord(); + } catch (IOException e) { + throw new RuntimeException(e); + } + + return new ArchiveRecordIterator(); + } + + protected void setCompressed(boolean compressed) { + this.compressed = compressed; + } + + /** + * @return The current ARC record or null if none. + * After construction has the arcfile header record. + * @see #get() + */ + protected ArchiveRecord getCurrentRecord() { + return this.currentRecord; + } + + protected ArchiveRecord currentRecord(final ArchiveRecord r) { + this.currentRecord = r; + return r; + } + + protected InputStream getIn() { + return in; + } + + protected void setIn(InputStream in) { + this.in = in; + } + + protected void setVersion(String version) { + this.version = version; + } + + public String getReaderIdentifier() { + return this.identifier; + } + + protected void setReaderIdentifier(final String i) { + this.identifier = i; + } + + /** + * Log on stderr. + * Logging should go via the logging system. This method + * bypasses the logging system going direct to stderr. + * Should not generally be used. Its used for rare messages + * that come of cmdline usage of ARCReader ERRORs and WARNINGs. + * Override if using ARCReader in a context where no stderr or + * where you'd like to redirect stderr to other than System.err. + * @param level Level to log message at. + * @param message Message to log. + */ + public void logStdErr(Level level, String message) { + System.err.println(level.toString() + " " + message); + } + +// /** +// * Add buffering to RandomAccessInputStream. 
+// */ +// protected class RandomAccessBufferedInputStream +// extends BufferedInputStream implements RepositionableStream { +// +// public RandomAccessBufferedInputStream(RandomAccessInputStream is) +// throws IOException { +// super(is); +// } +// +// public RandomAccessBufferedInputStream(RandomAccessInputStream is, int size) +// throws IOException { +// super(is, size); +// } +// +// public long position() throws IOException { +// // Current position is the underlying files position +// // minus the amount thats in the buffer yet to be read. +// return ((RandomAccessInputStream)this.in).position() - +// (this.count - this.pos); +// } +// +// public void position(long position) throws IOException { +// // Force refill of buffer whenever there's been a seek. +// this.pos = 0; +// this.count = 0; +// ((RandomAccessInputStream)this.in).position(position); +// } +// +// public int available() throws IOException { +// // Avoid overflow on large datastreams +// long amount = (long)in.available() + (long)(count - pos); +// return (amount >= Integer.MAX_VALUE)? Integer.MAX_VALUE: (int)amount; +// } +// } + + /** + * Inner ArchiveRecord Iterator class. + * Throws RuntimeExceptions in {@link #hasNext()} and {@link #next()} if + * trouble pulling record from underlying stream. + * @author stack + */ + protected class ArchiveRecordIterator implements Iterator { + private final Logger logger = + Logger.getLogger(this.getClass().getName()); + /** + * @return True if we have more records to read. + * @exception RuntimeException Can throw an IOException wrapped in a + * RuntimeException if a problem reading underlying stream (Corrupted + * gzip, etc.). + */ + public boolean hasNext() { + // Call close on any extant record. This will scoot us past + // any content not yet read. 
+ try { + cleanupCurrentRecord(); + } catch (IOException e) { + if (isStrict()) { + throw new RuntimeException(e); + } + if (e instanceof EOFException) { + logger.warning("Premature EOF cleaning up " + + currentRecord.getHeader().toString() + ": " + + e.getMessage()); + return false; + } + // If not strict, try going again. We might be able to skip + // over the bad record. + logger.log(Level.WARNING,"Trying skip of failed record cleanup of " + + currentRecord.getHeader().toString() + ": " + + e.getMessage(), e); + } + return innerHasNext(); + } + + protected boolean innerHasNext(){ + try { + getIn().mark(1); + int c = getIn().read(); + getIn().reset(); + return c > -1; + } catch (IOException e) { + logger.log(Level.WARNING,"problem probing for more content",e); + return false; + } + } + + /** + * Tries to move to next record if we get + * {@link RecoverableIOException}. If not strict + * tries to move to next record if we get an + * {@link IOException}. + * @return Next object. + * @exception RuntimeException Throws a runtime exception, + * usually a wrapping of an IOException, if trouble getting + * a record (Throws exception rather than return null). + */ + public ArchiveRecord next() { + long offset = -1; + try { + offset = positionForRecord(getIn()); + return exceptionNext(); + } catch (IOException e) { + if (!isStrict()) { + // Retry though an IOE. Maybe we will succeed reading + // subsequent record. + try { + if (hasNext()) { + getLogger().warning("Bad Record. Trying skip " + + "(Record start " + offset + "): " + + e.getMessage()); + return exceptionNext(); + } + // Else we are at last record. Iterator#next is + // expecting value. We do not have one. Throw exception. 
+ throw new RuntimeException("Retried but no next " + + "record (Record start " + offset + ")", e); + } catch (IOException e1) { + throw new RuntimeException("After retry (Offset " + + offset + ")", e1); + } + } + throw new RuntimeException("(Record start " + offset + ")", e); + } + } + + /** + * A next that throws exceptions and has handling of + * recoverable exceptions moving us to next record. Can call + * hasNext which itself may throw exceptions. + * @return Next record. + * @throws IOException + * @throws RuntimeException Thrown when we've reached maximum + * retries. + */ + protected ArchiveRecord exceptionNext() + throws IOException, RuntimeException { + ArchiveRecord result = null; + IOException ioe = null; + for (int i = MAX_ALLOWED_RECOVERABLES; i > 0 && + result == null; i--) { + ioe = null; + try { + result = innerNext(); + } catch (RecoverableIOException e) { + ioe = e; + getLogger().warning(e.getMessage()); + if (hasNext()) { + continue; + } + // No records left. Throw exception rather than + // return null. The caller is expecting to get + // back a record since they've just called + // hasNext. + break; + } + } + if (ioe != null) { + // Then we did MAX_ALLOWED_RECOVERABLES retries. Throw + // the recoverable ioe wrapped in a RuntimeException so + // it goes out pass checks for IOE. + throw new RuntimeException("Retried " + + MAX_ALLOWED_RECOVERABLES + " times in a row", ioe); + } + return result; + } + + protected ArchiveRecord innerNext() throws IOException { + return get(positionForRecord(getIn())); + } + + public void remove() { + throw new UnsupportedOperationException(); + } + } + + protected static long positionForRecord(InputStream in) { + return (in instanceof GZIPMembersInputStream) + ? ((GZIPMembersInputStream)in).getCurrentMemberStart() + : ((CountingInputStream)in).getCount(); + } + + protected static String stripExtension(final String name, + final String ext) { + return (!name.endsWith(ext))? 
name: + name.substring(0, name.length() - ext.length()); + } + + /** + * @return short name of Archive file. + */ + public String getFileName() { + return (new File(getReaderIdentifier())).getName(); + } + + /** + * @return short name of Archive file. + */ + public String getStrippedFileName() { + return getStrippedFileName(getFileName(), + getDotFileExtension()); + } + + /** + * @param name Name of ARCFile. + * @param dotFileExtension '.arc' or '.warc', etc. + * @return short name of Archive file. + */ + public static String getStrippedFileName(String name, + final String dotFileExtension) { + name = stripExtension(name, + ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION); + return stripExtension(name, dotFileExtension); + } + + /** + * @param value Value to test. + * @return True if value is 'true', else false. + */ + protected static boolean getTrueOrFalse(final String value) { + if (value == null || value.length() <= 0) { + return false; + } + return Boolean.TRUE.toString().equals(value.toLowerCase()); + } + + /** + * @param format Format to use outputting. + * @throws IOException + * @throws java.text.ParseException + * @return True if handled. + */ + protected boolean output(final String format) + throws IOException, java.text.ParseException { + boolean result = true; + // long start = System.currentTimeMillis(); + + // Write output as pseudo-CDX file. See + // http://www.archive.org/web/researcher/cdx_legend.php + // and http://www.archive.org/web/researcher/example_cdx.php. + // Hash is hard-coded straight SHA-1 hash of content. + if (format.equals(DUMP)) { + // No point digesting dumping. + setDigest(false); + dump(false); + } else if (format.equals(GZIP_DUMP)) { + // No point digesting dumping. 
+ setDigest(false); + dump(true); + } else if (format.equals(CDX)) { + cdxOutput(false); + } else if (format.equals(CDX_FILE)) { + cdxOutput(true); + } else { + result = false; + } + return result; + } + + protected void cdxOutput(boolean toFile) + throws IOException { + BufferedWriter cdxWriter = null; + if (toFile) { + String cdxFilename = stripExtension(getReaderIdentifier(), + DOT_COMPRESSED_FILE_EXTENSION); + cdxFilename = stripExtension(cdxFilename, getDotFileExtension()); + cdxFilename += ('.' + CDX); + cdxWriter = new BufferedWriter(new FileWriter(cdxFilename)); + } + + String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v") + + " n g"; + if (toFile) { + cdxWriter.write(header); + cdxWriter.newLine(); + } else { + System.out.println(header); + } + + String strippedFileName = getStrippedFileName(); + try { + for (Iterator ii = iterator(); ii.hasNext();) { + ArchiveRecord r = ii.next(); + if (toFile) { + cdxWriter.write(r.outputCdx(strippedFileName)); + cdxWriter.newLine(); + } else { + System.out.println(r.outputCdx(strippedFileName)); + } + } + } finally { + if (toFile) { + cdxWriter.close(); + } + } + } + + /** + * Output passed record using passed format specifier. + * @param format What format to use outputting. + * @throws IOException + * @return True if handled. + */ + public boolean outputRecord(final String format) + throws IOException { + boolean result = true; + if (format.equals(CDX)) { + System.out.println(get().outputCdx(getStrippedFileName())); + } else if(format.equals(ArchiveFileConstants.DUMP)) { + // No point digesting if dumping content. + setDigest(false); + get().dump(); + } else { + result = false; + } + return result; + } + + /** + * Dump this file on STDOUT + * @throws compress True if dumped output is compressed. 
+ * @throws IOException + * @throws java.text.ParseException + */ + public abstract void dump(final boolean compress) + throws IOException, java.text.ParseException; + + /** + * @return an ArchiveReader that will delete a local file on close. Used + * when we bring Archive files local and need to clean up afterward. + */ + public abstract ArchiveReader getDeleteFileOnCloseReader(final File f); + + /** + * Output passed record using passed format specifier. + * @param r ARCReader instance to output. + * @param format What format to use outputting. + * @throws IOException + */ + protected static void outputRecord(final ArchiveReader r, + final String format) + throws IOException { + if (!r.outputRecord(format)) { + throw new IOException("Unsupported format" + + " (or unsupported on a single record): " + format); + } + } + + /** + * @return Base Options object filled out with help, digest, strict, etc. + * options. + */ + protected static Options getOptions() { + Options options = new Options(); + options.addOption(new Option("h","help", false, + "Prints this message and exits.")); + options.addOption(new Option("o","offset", true, + "Outputs record at this offset into file.")); + options.addOption(new Option("d","digest", true, + "Pass true|false. Expensive. Default: true (SHA-1).")); + options.addOption(new Option("s","strict", false, + "Strict mode. Fails parse if incorrectly formatted file.")); + options.addOption(new Option("f","format", true, + "Output options: 'cdx', cdxfile', 'dump', 'gzipdump'," + + "'or 'nohead'. Default: 'cdx'.")); + return options; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/ArchiveReaderFactory.java b/src/main/java/org/archive/io/ArchiveReaderFactory.java new file mode 100644 index 00000000..17f14d3a --- /dev/null +++ b/src/main/java/org/archive/io/ArchiveReaderFactory.java @@ -0,0 +1,301 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). 
+ * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLConnection; + +import org.archive.io.arc.ARCReaderFactory; +import org.archive.io.warc.WARCReaderFactory; +import org.archive.net.md5.Md5URLConnection; +import org.archive.net.rsync.RsyncURLConnection; +import org.archive.url.UsableURI; +import org.archive.util.FileUtils; + + +/** + * Factory that returns an Archive file Reader. + * Returns Readers for ARCs or WARCs. + * @author stack + * @version $Date$ $Revision$ + */ +public class ArchiveReaderFactory implements ArchiveFileConstants { + // Static block to enable S3 URLs + static { + if (System.getProperty("java.protocol.handler.pkgs") != null) { + System.setProperty("java.protocol.handler.pkgs", + System.getProperty("java.protocol.handler.pkgs") + + "|" + "org.archive.net"); + } else { + System.setProperty("java.protocol.handler.pkgs", "org.archive.net"); + } + } + + private static final ArchiveReaderFactory factory = + new ArchiveReaderFactory(); + + /** + * Shutdown any public access to default constructor. 
+ */ + protected ArchiveReaderFactory() { + super(); + } + + /** + * Get an Archive file Reader on passed path or url. + * Does primitive heuristic figuring if path or URL. + * @param arcFileOrUrl File path or URL pointing at an Archive file. + * @return An Archive file Reader. + * @throws IOException + * @throws MalformedURLException + * @throws IOException + */ + public static ArchiveReader get(final String arcFileOrUrl) + throws MalformedURLException, IOException { + return ArchiveReaderFactory.factory.getArchiveReader(arcFileOrUrl); + } + + protected ArchiveReader getArchiveReader(final String arcFileOrUrl) + throws MalformedURLException, IOException { + return getArchiveReader(arcFileOrUrl, 0); + } + + protected ArchiveReader getArchiveReader(final String arcFileOrUrl, + final long offset) + throws MalformedURLException, IOException { + return UsableURI.hasScheme(arcFileOrUrl) && arcFileOrUrl.indexOf(":")>1? + get(new URL(arcFileOrUrl), offset): + get(new File(arcFileOrUrl), offset); + } + + /** + * @param f An Archive file to read. + * @return An ArchiveReader + * @throws IOException + */ + public static ArchiveReader get(final File f) throws IOException { + return ArchiveReaderFactory.factory.getArchiveReader(f); + } + + protected ArchiveReader getArchiveReader(final File f) + throws IOException { + return getArchiveReader(f, 0); + } + + /** + * @param f An Archive file to read. + * @param offset Have returned Reader set to start reading at this offset. 
+ * @return An ArchiveReader + * @throws IOException + */ + public static ArchiveReader get(final File f, final long offset) + throws IOException { + return ArchiveReaderFactory.factory.getArchiveReader(f, offset); + } + + protected ArchiveReader getArchiveReader(final File f, + final long offset) + throws IOException { + if (ARCReaderFactory.isARCSuffix(f.getName())) { + return ARCReaderFactory.get(f, true, offset); + } else if (WARCReaderFactory.isWARCSuffix(f.getName())) { + return WARCReaderFactory.get(f, offset); + } + throw new IOException("Unknown file extension (Not ARC nor WARC): " + + f.getName()); + } + + /** + * Wrap a Reader around passed Stream. + * @param s Identifying String for this Stream used in error messages. + * Must be a string that ends with the name of the file we're to put + * an ArchiveReader on. This code looks at file endings to figure + * whether to return an ARC or WARC reader. + * @param is Stream. Stream will be wrapped with implementation of + * RepositionableStream unless already supported. + * @param atFirstRecord Are we at first Record? + * @return ArchiveReader. + * @throws IOException + */ + public static ArchiveReader get(final String s, final InputStream is, + final boolean atFirstRecord) + throws IOException { + return ArchiveReaderFactory.factory.getArchiveReader(s, is, + atFirstRecord); + } + + protected ArchiveReader getArchiveReader(final String id, + final InputStream is, final boolean atFirstRecord) + throws IOException { + final InputStream stream = is; + if (ARCReaderFactory.isARCSuffix(id)) { + return ARCReaderFactory.get(id, stream, atFirstRecord); + } else if (WARCReaderFactory.isWARCSuffix(id)) { + return WARCReaderFactory.get(id, stream, atFirstRecord); + } + throw new IOException("Unknown extension (Not ARC nor WARC): " + id); + } + + /** + * Get an Archive Reader aligned at offset. 
+ * This version of get will not bring the file local but will try to + * stream across the net making an HTTP 1.1 Range request on remote + * http server (RFC1435 Section 14.35). + * @param u HTTP URL for an Archive file. + * @param offset Offset into file at which to start fetching. + * @return An ArchiveReader aligned at offset. + * @throws IOException + */ + public static ArchiveReader get(final URL u, final long offset) + throws IOException { + return ArchiveReaderFactory.factory.getArchiveReader(u, offset); + } + + protected ArchiveReader getArchiveReader(final URL f, final long offset) + throws IOException { + // Get URL connection. + URLConnection connection = f.openConnection(); + if (connection instanceof HttpURLConnection) { + addUserAgent((HttpURLConnection)connection); + } + if (offset != 0) { + // Use a Range request (Assumes HTTP 1.1 on other end). If + // length >= 0, add open-ended range header to the request. Else, + // because end-byte is inclusive, subtract 1. + connection.addRequestProperty("Range", "bytes=" + offset + "-"); + // TODO: should actually verify that server respected 'Range' request + // (spec allows them to ignore; 206 response or Content-Range header + // should be present if Range satisfied; multipart/byteranges could be + // a problem). + } + + return getArchiveReader(f.toString(), connection.getInputStream(), (offset == 0)); + } + + /** + * Get an ARCReader. + * Pulls the ARC local into whereever the System Property + * java.io.tmpdir points. It then hands back an ARCReader that + * points at this local copy. A close on this ARCReader instance will + * remove the local copy. + * @param u An URL that points at an ARC. + * @return An ARCReader. 
+ * @throws IOException + */ + public static ArchiveReader get(final URL u) + throws IOException { + return ArchiveReaderFactory.factory.getArchiveReader(u); + } + + protected ArchiveReader getArchiveReader(final URL u) + throws IOException { + // If url represents a local file then return file it points to. + if (u.getPath() != null) { + // TODO: Add scheme check and host check. + File f = new File(u.getPath()); + if (f.exists()) { + return get(f, 0); + } + } + + String scheme = u.getProtocol(); + if (scheme.startsWith("http") || scheme.equals("s3")) { + // Try streaming if http or s3 URLs rather than copying local + // and then reading (Passing an offset will get us an Reader + // that wraps a Stream). + return get(u, 0); + } + + return makeARCLocal(u.openConnection()); + } + + protected ArchiveReader makeARCLocal(final URLConnection connection) + throws IOException { + File localFile = null; + if (connection instanceof HttpURLConnection) { + // If http url connection, bring down the resource local. + String p = connection.getURL().getPath(); + int index = p.lastIndexOf('/'); + if (index >= 0) { + // Name file for the file we're making local. + localFile = File.createTempFile("",p.substring(index + 1)); + if (localFile.exists()) { + // If file of same name already exists in TMPDIR, then + // clean it up (Assuming only reason a file of same name in + // TMPDIR is because we failed a previous download). + localFile.delete(); + } + } else { + localFile = File.createTempFile(ArchiveReader.class.getName(), + ".tmp"); + } + addUserAgent((HttpURLConnection)connection); + connection.connect(); + try { + FileUtils.readFullyToFile(connection.getInputStream(), localFile); + } catch (IOException ioe) { + localFile.delete(); + throw ioe; + } + } else if (connection instanceof RsyncURLConnection) { + // Then, connect and this will create a local file. + // See implementation of the rsync handler. 
+ connection.connect(); + localFile = ((RsyncURLConnection)connection).getFile(); + } else if (connection instanceof Md5URLConnection) { + // Then, connect and this will create a local file. + // See implementation of the md5 handler. + connection.connect(); + localFile = ((Md5URLConnection)connection).getFile(); + } else { + throw new UnsupportedOperationException("No support for " + + connection); + } + + ArchiveReader reader = null; + try { + reader = get(localFile, 0); + } catch (IOException e) { + localFile.delete(); + throw e; + } + + // Return a delegate that does cleanup of downloaded file on close. + return reader.getDeleteFileOnCloseReader(localFile); + } + + protected void addUserAgent(final HttpURLConnection connection) { + connection.addRequestProperty("User-Agent", this.getClass().getName()); + } + + /** + * @param f File to test. + * @return True if f is compressed. + * @throws IOException + */ + protected boolean isCompressed(final File f) throws IOException { + return f.getName().toLowerCase(). + endsWith(DOT_COMPRESSED_FILE_EXTENSION); + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/ArchiveRecord.java b/src/main/java/org/archive/io/ArchiveRecord.java new file mode 100644 index 00000000..63bfe628 --- /dev/null +++ b/src/main/java/org/archive/io/ArchiveRecord.java @@ -0,0 +1,409 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.logging.Level; + +import org.archive.util.Base32; + +/** + * Archive file Record. + * @author stack + * @version $Date$ $Version$ + */ +public abstract class ArchiveRecord extends InputStream { + + /** + * Minimal http response or request header length. + * + * I've seen in arcs content length of 1 with no header. + */ + protected static final long MIN_HTTP_HEADER_LENGTH = + Math.min("HTTP/1.1 200 OK\r\n".length(), "GET / HTTP/1.0\n\r".length()); + + protected ArchiveRecordHeader header = null; + + /** + * Stream to read this record from. + * + * Stream can only be read sequentially. Will only return this records' + * content returning a -1 if you try to read beyond the end of the current + * record. + * + *

Streams can be markable or not. If they are, we'll be able to roll + * back when we've read too far. If not markable, assumption is that + * the underlying stream is managing our not reading too much (This pertains + * to the skipping over the end of the ARCRecord. See {@link #skip()}. + */ + protected InputStream in = null; + + /** + * Position w/i the Record content, within in. + * This position is relative within this Record. Its not same as the + * Archive file position. + */ + protected long position = 0; + + /** + * Set flag when we've reached the end-of-record. + */ + protected boolean eor = false; + + /** + * Compute digest on what we read and add to metadata when done. + * + * Currently hardcoded as sha-1. TODO: Remove when archive records + * digest or else, add a facility that allows the arc reader to + * compare the calculated digest to that which is recorded in + * the arc. + * + *

Protected instead of private so subclasses can update and complete + * the digest. + */ + protected MessageDigest digest = null; + private String digestStr = null; + + protected boolean strict = false; + + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the record this instance + * is to represent. + * @throws IOException + */ + public ArchiveRecord(InputStream in) + throws IOException { + this(in, null, 0, true, false); + } + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the record this instance + * is to represent. + * @param header Header data. + * @throws IOException + */ + public ArchiveRecord(InputStream in, ArchiveRecordHeader header) + throws IOException { + this(in, header, 0, true, false); + } + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the record this instance + * is to represent. + * @param header Header data. + * @param bodyOffset Offset into the body. Usually 0. + * @param digest True if we're to calculate digest for this record. Not + * digesting saves about ~15% of cpu during an ARC parse. + * @param strict Be strict parsing (Parsing stops if ARC inproperly + * formatted). + * @throws IOException + */ + public ArchiveRecord(InputStream in, ArchiveRecordHeader header, + int bodyOffset, boolean digest, boolean strict) + throws IOException { + this.in = in; + this.header = header; + this.position = bodyOffset; + if (digest) { + try { + this.digest = MessageDigest.getInstance("SHA1"); + } catch (NoSuchAlgorithmException e) { + // Convert to IOE because thats more amenable to callers + // -- they are dealing with it anyways. + throw new IOException(e.getMessage()); + } + } + this.strict = strict; + } + + public boolean markSupported() { + return false; + } + + /** + * @return Header data for this record. 
+ */ + public ArchiveRecordHeader getHeader() { + return this.header; + } + + protected void setHeader(ArchiveRecordHeader header) { + this.header = header; + } + + /** + * Calling close on a record skips us past this record to the next record + * in the stream. + * + * It does not actually close the stream. The underlying steam is probably + * being used by the next arc record. + * + * @throws IOException + */ + public void close() throws IOException { + if (this.in != null) { + skip(); + this.in = null; + if (this.digest != null) { + this.digestStr = Base32.encode(this.digest.digest()); + } + } + } + + /** + * @return Next character in this Record content else -1 if at EOR. + * @throws IOException + */ + public int read() throws IOException { + int c = -1; + if (available() > 0) { + c = this.in.read(); + if (c == -1) { + throw new IOException("Premature EOF before end-of-record."); + } + if (this.digest != null) { + this.digest.update((byte) c); + } + incrementPosition(); + } + return c; + } + + public int read(byte[] b, int offset, int length) throws IOException { + int read = Math.min(length, available()); + if (read == -1 || read == 0) { + read = -1; + } else { + read = this.in.read(b, offset, read); + if (read == -1) { + String msg = "Premature EOF before end-of-record: " + + getHeader().getHeaderFields(); + if (isStrict()) { + throw new IOException(msg); + } + setEor(true); + System.err.println(Level.WARNING.toString() + " " + msg); + } + if (this.digest != null && read >= 0) { + this.digest.update(b, offset, read); + } + incrementPosition(read); + } + return read; + } + + /** + * This available is not the stream's available. Its an available based on + * what the stated Archive record length is minus what we've read to date. + * + * @return True if bytes remaining in record content. + */ + public int available() { + long amount = getHeader().getLength() - getPosition(); + return (amount > Integer.MAX_VALUE? 
Integer.MAX_VALUE: (int)amount); + } + + /** + * Skip over this records content. + * + * @throws IOException + */ + protected void skip() throws IOException { + if (this.eor) { + return; + } + + // Read to the end of the body of the record. Exhaust the stream. + // Can't skip direct to end because underlying stream may be compressed + // and we're calculating the digest for the record. + int r = available(); + while (r > 0 && !this.eor) { + skip(r); + r = available(); + } + } + + public long skip(long n) throws IOException { + final int SKIP_BUFFERSIZE = 1024 * 4; + byte[] b = new byte[SKIP_BUFFERSIZE]; + long total = 0; + for (int read = 0; (total < n) && (read != -1);) { + read = Math.min(SKIP_BUFFERSIZE, (int) (n - total)); + // TODO: Interesting is that reading from compressed stream, we only + // read about 500 characters at a time though we ask for 4k. + // Look at this sometime. + read = read(b, 0, read); + if (read <= 0) { + read = -1; + } else { + total += read; + } + } + return total; + } + + /** + * @return Returns the strict. + */ + public boolean isStrict() { + return this.strict; + } + + /** + * @param strict The strict to set. + */ + public void setStrict(boolean strict) { + this.strict = strict; + } + + protected InputStream getIn() { + return this.in; + } + + public String getDigestStr() { + return this.digestStr; + } + + protected void incrementPosition() { + this.position++; + } + + protected void incrementPosition(final long incr) { + this.position += incr; + } + + public long getPosition() { + return this.position; + } + + protected boolean isEor() { + return eor; + } + + protected void setEor(boolean eor) { + this.eor = eor; + } + + protected String getStatusCode4Cdx(final ArchiveRecordHeader h) { + return "-"; + } + + protected String getIp4Cdx(final ArchiveRecordHeader h) { + return "-"; + } + + protected String getDigest4Cdx(final ArchiveRecordHeader h) { + return getDigestStr() == null? 
"-": getDigestStr(); + } + + protected String getMimetype4Cdx(final ArchiveRecordHeader h) { + return h.getMimetype(); + } + + protected String outputCdx(final String strippedFileName) + throws IOException { + // Read the whole record so we get out a hash. Should be safe calling + // close on already closed Record. + close(); + ArchiveRecordHeader h = getHeader(); + StringBuilder buffer = + new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE); + buffer.append(h.getDate()); + buffer.append(ArchiveFileConstants.SINGLE_SPACE); + buffer.append(getIp4Cdx(h)); + buffer.append(ArchiveFileConstants.SINGLE_SPACE); + buffer.append(h.getUrl()); + buffer.append(ArchiveFileConstants.SINGLE_SPACE); + buffer.append(getMimetype4Cdx(h)); + buffer.append(ArchiveFileConstants.SINGLE_SPACE); + buffer.append(getStatusCode4Cdx(h)); + buffer.append(ArchiveFileConstants.SINGLE_SPACE); + buffer.append(getDigest4Cdx(h)); + buffer.append(ArchiveFileConstants.SINGLE_SPACE); + buffer.append(h.getOffset()); + buffer.append(ArchiveFileConstants.SINGLE_SPACE); + buffer.append(h.getLength()); + buffer.append(ArchiveFileConstants.SINGLE_SPACE); + buffer.append(strippedFileName != null? strippedFileName: '-'); + return buffer.toString(); + } + + /** + * Writes output on STDOUT. + * @throws IOException + */ + public void dump() + throws IOException { + dump(System.out); + } + + /** + * Writes output on passed os. + * @throws IOException + */ + public void dump(final OutputStream os) + throws IOException { + final byte [] outputBuffer = new byte [16*1024]; + int read = outputBuffer.length; + while ((read = read(outputBuffer, 0, outputBuffer.length)) != -1) { + os.write(outputBuffer, 0, read); + } + os.flush(); + } + + /** + * Is it likely that this record contains headers? + * This method will return true if the body is a http response that includes + * http response headers or the body is a http request that includes request + * headers, etc. 
Be aware that headers in content are distinct from + * {@link ArchiveRecordHeader} 'headers'. + * @return True if this Record's content has headers: + */ + public boolean hasContentHeaders() { + final String url = getHeader().getUrl(); + if (url == null) { + return false; + } + + if (!url.toLowerCase().startsWith("http")) { + return false; + } + + if (getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) { + return false; + } + + return true; + } + + protected void setBodyOffset(int bodyOffset) { + this.position = bodyOffset; + } +} diff --git a/src/main/java/org/archive/io/ArchiveRecordHeader.java b/src/main/java/org/archive/io/ArchiveRecordHeader.java new file mode 100644 index 00000000..953537b1 --- /dev/null +++ b/src/main/java/org/archive/io/ArchiveRecordHeader.java @@ -0,0 +1,111 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.util.Map; +import java.util.Set; + +/** + * Archive Record Header. + * @author stack + * @version $Date$ $Version$ + */ +public interface ArchiveRecordHeader { + /** + * Get the time when the record was created. + * @return Date in 14 digit time format (UTC). 
+ * @see org.archive.util.ArchiveUtils#parse14DigitDate(String) + */ + public abstract String getDate(); + + /** + * @return Return length of record. + */ + public abstract long getLength(); + + /** + * @return Return Content-Length of the contents of the record + */ + public abstract long getContentLength(); + + + /** + * @return Record subject-url. + */ + public abstract String getUrl(); + + /** + * @return Record mimetype. + */ + public abstract String getMimetype(); + + /** + * @return Record version. + */ + public abstract String getVersion(); + + /** + * @return Offset into Archive file at which this record begins. + */ + public abstract long getOffset(); + + /** + * @param key Key to use looking up field value. + * @return value for passed key of null if no such entry. + */ + public abstract Object getHeaderValue(final String key); + + /** + * @return Header field name keys. + */ + public abstract Set getHeaderFieldKeys(); + + /** + * @return Map of header fields. + */ + public abstract Map getHeaderFields(); + + /** + * @return Returns identifier for current Archive file. Be aware this + * may not be a file name or file path. It may just be an URL. Depends + * on how Archive file was made. + */ + public abstract String getReaderIdentifier(); + + /** + * @return Identifier for the record. If ARC, the URL + date. If WARC, + * the GUID assigned. + */ + public abstract String getRecordIdentifier(); + + /** + * @return Returns digest as String for this record. Only available after + * the record has been read in totality. + */ + public abstract String getDigest(); + + /** + * Offset at which the content begins. + * For ARCs, its used to delimit where http headers end and content begins. + * For WARCs, its end of Named Fields before payload starts. 
+ */ + public int getContentBegin(); + + public abstract String toString(); +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/ArraySeekInputStream.java b/src/main/java/org/archive/io/ArraySeekInputStream.java new file mode 100644 index 00000000..5b30747e --- /dev/null +++ b/src/main/java/org/archive/io/ArraySeekInputStream.java @@ -0,0 +1,106 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.IOException; + + +/** + * A repositionable stream backed by an array. + * + * @author pjack + */ +public class ArraySeekInputStream extends SeekInputStream { + + + /** + * The array of bytes to read from. + */ + private byte[] array; + + + /** + * The offset in the array of the next byte to read. + */ + private int offset; + + + /** + * Constructor. Note that changes to the given array will be reflected + * in the stream. + * + * @param array The array to read bytes from. 
+ */ + public ArraySeekInputStream(byte[] array) { + this.array = array; + this.offset = 0; + } + + + @Override + public int read() { + if (offset >= array.length) { + return -1; + } + int r = array[offset] & 0xFF; + offset++; + return r; + } + + + @Override + public int read(byte[] buf, int ofs, int len) { + if (offset >= array.length) { + return 0; + } + len = Math.min(len, array.length - offset); + System.arraycopy(array, offset, buf, ofs, len); + offset += len; + return len; + } + + + @Override + public int read(byte[] buf) { + return read(buf, 0, buf.length); + } + + + /** + * Returns the position of the stream. + */ + public long position() { + return offset; + } + + + /** + * Repositions the stream. + * + * @param p the new position for the stream + * @throws IOException if the given position is out of bounds + */ + public void position(long p) throws IOException { + if ((p < 0) || (p > array.length)) { + throw new IOException("Invalid position: " + p); + } + offset = (int)p; + } + +} diff --git a/src/main/java/org/archive/io/BufferedSeekInputStream.java b/src/main/java/org/archive/io/BufferedSeekInputStream.java new file mode 100644 index 00000000..2fdc72b7 --- /dev/null +++ b/src/main/java/org/archive/io/BufferedSeekInputStream.java @@ -0,0 +1,217 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +import java.io.IOException; + + +/** + * Buffers data from some other SeekInputStream. + * + * @author pjack + */ +public class BufferedSeekInputStream extends SeekInputStream { + + + /** + * The underlying input stream. + */ + final private SeekInputStream input; + + + /** + * The buffered data. + */ + final private byte[] buffer; + + + /** + * The maximum offset of valid data in the buffer. Usually the same + * as buffer.length, but may be shorter if we're in the last region + * of the stream. + */ + private int maxOffset; + + + /** + * The offset of within the buffer of the next byte to read. + */ + private int offset; + + + /** + * Constructor. + * + * @param input the underlying input stream + * @param capacity the size of the buffer + * @throws IOException if an IO occurs filling the first buffer + */ + public BufferedSeekInputStream(SeekInputStream input, int capacity) + throws IOException { + this.input = input; + this.buffer = new byte[capacity]; + buffer(); + } + + /** + * Fills the buffer. + * + * @throws IOException if an IO error occurs + */ + private void buffer() throws IOException { + int remaining = buffer.length; + while (remaining > 0) { + int r = input.read(buffer, buffer.length - remaining, remaining); + if (r <= 0) { + // Not enough information to fill the buffer + offset = 0; + maxOffset = buffer.length - remaining; + return; + } + remaining -= r; + } + maxOffset = buffer.length; + offset = 0; + } + + + /** + * Ensures that the buffer is valid. + * + * @throws IOException if an IO error occurs + */ + private void ensureBuffer() throws IOException { + if (offset >= maxOffset) { + buffer(); + } + } + + + /** + * Returns the number of unread bytes in the current buffer. 
+ * + * @return the remaining bytes + */ + private int remaining() { + return maxOffset - offset; + } + + + @Override + public int read() throws IOException { + ensureBuffer(); + if (maxOffset == 0) { + return -1; + } + int ch = buffer[offset] & 0xFF; + offset++; + return ch; + } + + + @Override + public int read(byte[] buf, int ofs, int len) throws IOException { + ensureBuffer(); + if (maxOffset == 0) { + return 0; + } + len = Math.min(len, remaining()); + System.arraycopy(buffer, offset, buf, ofs, len); + offset += len; + return len; + } + + + @Override + public int read(byte[] buf) throws IOException { + return read(buf, 0, buf.length); + } + + + @Override + public long skip(long c) throws IOException { + ensureBuffer(); + if (maxOffset == 0) { + return 0; + } + int count = (c > Integer.MAX_VALUE) ? Integer.MAX_VALUE : (int)c; + int skip = Math.min(count, remaining()); + offset += skip; + return skip; + } + + + /** + * Returns the stream's current position. + * + * @return the current position + */ + public long position() throws IOException { + return input.position() - buffer.length + offset; + } + + + /** + * Seeks to the given position. This method avoids re-filling the buffer + * if at all possible. + * + * @param p the position to set + * @throws IOException if an IO error occurs + */ + public void position(long p) throws IOException { + long blockStart = (input.position() - maxOffset) + / buffer.length * buffer.length; + long blockEnd = blockStart + maxOffset; + if ((p >= blockStart) && (p < blockEnd)) { + // Desired position is somewhere inside current buffer + long adj = p - blockStart; + offset = (int)adj; + return; + } + positionDirect(p); + } + + + /** + * Positions the underlying stream at the given position, then refills + * the buffer. 
+ * + * @param p the position to set + * @throws IOException if an IO error occurs + */ + private void positionDirect(long p) throws IOException { + long newBlockStart = p / buffer.length * buffer.length; + input.position(newBlockStart); + buffer(); + offset = (int)(p % buffer.length); + } + + /** + * Close the stream, including the wrapped input stream. + */ + public void close() throws IOException { + super.close(); + if(this.input!=null) { + this.input.close(); + } + } + + +} diff --git a/src/main/java/org/archive/io/CharSubSequence.java b/src/main/java/org/archive/io/CharSubSequence.java new file mode 100644 index 00000000..1e89da56 --- /dev/null +++ b/src/main/java/org/archive/io/CharSubSequence.java @@ -0,0 +1,90 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +/** + * Provides a subsequence view onto a CharSequence. 
+ * + * @author gojomo + * @version $Revision$, $Date$ + */ +public class CharSubSequence implements CharSequence { + + protected CharSequence inner; + protected int start; + protected int end; + + public CharSubSequence(CharSequence inner, int start, int end) { + if (end < start) { + throw new IllegalArgumentException("Start " + start + " is > " + + " than end " + end); + } + + if (end < 0 || start < 0) { + throw new IllegalArgumentException("Start " + start + " or end " + + end + " is < 0."); + } + + if (inner == null) { + throw new NullPointerException("Passed charsequence is null."); + } + + this.inner = inner; + this.start = start; + this.end = end; + } + + /* + * (non-Javadoc) + * @see java.lang.CharSequence#length() + */ + public int length() { + return this.end - this.start; + } + + /* + * (non-Javadoc) + * @see java.lang.CharSequence#charAt(int) + */ + public char charAt(int index) { + return this.inner.charAt(this.start + index); + } + + /* + * (non-Javadoc) + * @see java.lang.CharSequence#subSequence(int, int) + */ + public CharSequence subSequence(int begin, int finish) { + return new CharSubSequence(this, begin, finish); + } + + /* + * (non-Javadoc) + * @see java.lang.CharSequence#toString() + */ + public String toString() { + StringBuffer sb = new StringBuffer(length()); + // could use StringBuffer.append(CharSequence) if willing to do 1.5 & up + for (int i = 0;i filenames; + + /* (non-Javadoc) + * @see java.io.InputStream#read() + */ + public int read() throws IOException { + int c = super.read(); + if( c == -1 && filenames.hasNext() ) { + cueStream(); + return read(); + } + return c; + } + /* (non-Javadoc) + * @see java.io.InputStream#read(byte[], int, int) + */ + public int read(byte[] b, int off, int len) throws IOException { + int c = super.read(b, off, len); + if( c == -1 && filenames.hasNext() ) { + cueStream(); + return read(b,off,len); + } + return c; + } + /* (non-Javadoc) + * @see java.io.InputStream#read(byte[]) + */ + public int 
read(byte[] b) throws IOException { + int c = super.read(b); + if( c == -1 && filenames.hasNext() ) { + cueStream(); + return read(b); + } + return c; + } + + /* (non-Javadoc) + * @see java.io.InputStream#skip(long) + */ + public long skip(long n) throws IOException { + long s = super.skip(n); + if( s files) throws IOException { + super(null); + filenames = files.iterator(); + cueStream(); + } + + private void cueStream() throws IOException { + if(filenames.hasNext()) { + this.in = new FileInputStream(filenames.next()); + } + } + +} diff --git a/src/main/java/org/archive/io/CompositeFileReader.java b/src/main/java/org/archive/io/CompositeFileReader.java new file mode 100644 index 00000000..14b56219 --- /dev/null +++ b/src/main/java/org/archive/io/CompositeFileReader.java @@ -0,0 +1,40 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Static helpers that decode 16-bit and 32-bit integers from a byte stream
 * stored in little-endian or big-endian byte order.
 *
 * <p>Bytes are pulled one at a time with {@link InputStream#read()}. If the
 * stream ends before a value is complete, an {@link EOFException} is thrown;
 * bytes already consumed for that value are not pushed back.
 *
 * @author pjack
 */
public final class Endian {

    /**
     * Static utility class; never instantiated.
     */
    private Endian() {
    }

    /**
     * Reads the next little-endian unsigned 16-bit integer from the
     * given stream.
     *
     * @param input the input stream to read from
     * @return the next 16-bit little-endian integer
     * @throws EOFException if the stream ends before two bytes were read
     * @throws IOException if an IO error occurs
     */
    public static char littleChar(InputStream input) throws IOException {
        int lo = readByte(input);
        int hi = readByte(input);
        return (char) ((hi << 8) | lo);
    }

    /**
     * Reads the next little-endian signed 16-bit integer from the
     * given stream.
     *
     * @param input the input stream to read from
     * @return the next 16-bit little-endian integer
     * @throws EOFException if the stream ends before two bytes were read
     * @throws IOException if an IO error occurs
     */
    public static short littleShort(InputStream input) throws IOException {
        return (short) littleChar(input);
    }

    /**
     * Reads the next little-endian signed 32-bit integer from the
     * given stream.
     *
     * @param input the input stream to read from
     * @return the next 32-bit little-endian integer
     * @throws EOFException if the stream ends before four bytes were read
     * @throws IOException if an IO error occurs
     */
    public static int littleInt(InputStream input) throws IOException {
        // Low half-word comes first in little-endian order.
        char lo = littleChar(input);
        char hi = littleChar(input);
        return (hi << 16) | lo;
    }

    /**
     * Reads the next big-endian unsigned 16-bit integer from the
     * given stream.
     *
     * @param input the input stream to read from
     * @return the next 16-bit big-endian integer
     * @throws EOFException if the stream ends before two bytes were read
     * @throws IOException if an IO error occurs
     */
    public static char bigChar(InputStream input) throws IOException {
        int hi = readByte(input);
        int lo = readByte(input);
        return (char) ((hi << 8) | lo);
    }

    /**
     * Reads the next big-endian signed 16-bit integer from the
     * given stream. Counterpart of {@link #littleShort(InputStream)}.
     *
     * @param input the input stream to read from
     * @return the next 16-bit big-endian integer
     * @throws EOFException if the stream ends before two bytes were read
     * @throws IOException if an IO error occurs
     */
    public static short bigShort(InputStream input) throws IOException {
        return (short) bigChar(input);
    }

    /**
     * Reads the next big-endian signed 32-bit integer from the
     * given stream.
     *
     * @param input the input stream to read from
     * @return the next 32-bit big-endian integer
     * @throws EOFException if the stream ends before four bytes were read
     * @throws IOException if an IO error occurs
     */
    public static int bigInt(InputStream input) throws IOException {
        // High half-word comes first in big-endian order.
        char hi = bigChar(input);
        char lo = bigChar(input);
        return (hi << 16) | lo;
    }

    /**
     * Reads a single byte, converting end-of-stream into an exception.
     *
     * @param input the input stream to read from
     * @return the byte read, in the range 0..255
     * @throws EOFException if the stream is exhausted
     * @throws IOException if an IO error occurs
     */
    private static int readByte(InputStream input) throws IOException {
        int b = input.read();
        if (b < 0) {
            throw new EOFException();
        }
        return b;
    }
}
+ */ +package org.archive.io; + +import java.io.IOException; +import java.io.InputStream; + +/** + * @deprecated use {@link org.archive.util.zip.GZIPMembersInputStream} + */ +@Deprecated +public class GZIPMembersInputStream extends org.archive.util.zip.GZIPMembersInputStream { + + public GZIPMembersInputStream(InputStream in) throws IOException { + super(in); + } + + public GZIPMembersInputStream(InputStream in, int size) throws IOException { + super(in, size); + } + +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/GenerationFileHandler.java b/src/main/java/org/archive/io/GenerationFileHandler.java new file mode 100644 index 00000000..c1ce8d79 --- /dev/null +++ b/src/main/java/org/archive/io/GenerationFileHandler.java @@ -0,0 +1,200 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.io; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; +import java.util.logging.FileHandler; +import java.util.logging.Formatter; +import java.util.logging.LogRecord; + +import org.archive.util.FileUtils; + + +/** + * FileHandler with support for rotating the current file to + * an archival name with a specified integer suffix, and + * provision of a new replacement FileHandler with the current + * filename. + * + * @author gojomo + */ +public class GenerationFileHandler extends FileHandler { + private LinkedList filenameSeries = new LinkedList(); + private boolean shouldManifest = false; + + /** + * @return Returns the filenameSeries. + */ + public List getFilenameSeries() { + return filenameSeries; + } + + /** + * Constructor. + * @param pattern + * @param append + * @param shouldManifest + * @throws IOException + * @throws SecurityException + */ + public GenerationFileHandler(String pattern, boolean append, + boolean shouldManifest) + throws IOException, SecurityException { + super(pattern, append); + filenameSeries.addFirst(pattern); + this.shouldManifest = shouldManifest; + } + + /** + * @param filenameSeries + * @param shouldManifest + * @throws IOException + */ + public GenerationFileHandler(LinkedList filenameSeries, + boolean shouldManifest) + throws IOException { + super((String)filenameSeries.getFirst(), false); // Never append in this case + this.filenameSeries = filenameSeries; + this.shouldManifest = shouldManifest; + } + + /** + * Move the current file to a new filename with the storeSuffix in place + * of the activeSuffix; continuing logging to a new file under the + * original filename. + * + * @param storeSuffix Suffix to put in place of activeSuffix + * @param activeSuffix Suffix to replace with storeSuffix. + * @return GenerationFileHandler instance. 
+ * @throws IOException + */ + public GenerationFileHandler rotate(String storeSuffix, + String activeSuffix) + throws IOException { + return rotate(storeSuffix, activeSuffix, false); + } + + public GenerationFileHandler rotate(String storeSuffix, + String activeSuffix, boolean mergeOld) throws IOException { + close(); + String filename = (String) filenameSeries.getFirst(); + if (!filename.endsWith(activeSuffix)) { + throw new FileNotFoundException("Active file does not have" + + " expected suffix"); + } + String storeFilename = filename.substring(0, filename.length() + - activeSuffix.length()) + + storeSuffix; + File activeFile = new File(filename); + File storeFile = new File(storeFilename); + FileUtils.moveAsideIfExists(storeFile); + + if (mergeOld) { + File fileToAppendTo = new File(filenameSeries.getLast()); + for (int i = filenameSeries.size() - 2; i >= 0; i--) { + File f = new File(filenameSeries.get(i)); + FileUtils.appendTo(fileToAppendTo, f); + f.delete(); + } + filenameSeries.clear(); + filenameSeries.add(filename); + if (!fileToAppendTo.renameTo(storeFile)) { + throw new IOException("Unable to move " + fileToAppendTo + " to " + + storeFilename); + } + } else { + if (!activeFile.renameTo(storeFile)) { + throw new IOException("Unable to move " + filename + " to " + + storeFilename); + } + } + filenameSeries.add(1, storeFilename); + GenerationFileHandler newGfh = new GenerationFileHandler( + filenameSeries, shouldManifest); + newGfh.setFormatter(this.getFormatter()); + return newGfh; + } + + /** + * @return True if should manifest. + */ + public boolean shouldManifest() { + return this.shouldManifest; + } + + /** + * Constructor-helper that rather than clobbering any existing + * file, moves it aside with a timestamp suffix. 
+ * + * @param filename + * @param append + * @param shouldManifest + * @return + * @throws SecurityException + * @throws IOException + */ + public static GenerationFileHandler makeNew(String filename, boolean append, boolean shouldManifest) throws SecurityException, IOException { + FileUtils.moveAsideIfExists(new File(filename)); + return new GenerationFileHandler(filename, append, shouldManifest); + } + + @Override + public void publish(LogRecord record) { + // when possible preformat outside synchronized superclass method + // (our most involved UriProcessingFormatter can cache result) + Formatter f = getFormatter(); + if(!(f instanceof Preformatter)) { + super.publish(record); + } else { + try { + ((Preformatter)f).preformat(record); + super.publish(record); + } finally { + ((Preformatter)f).clear(); + } + } + } +// +// TODO: determine if there's another way to have this optimization without +// negative impact on log-following (esp. in web UI) +// /** +// * Flush only 1/100th of the usual once-per-record, to reduce the time +// * spent holding the synchronization lock. (Flush is primarily called in +// * a superclass's synchronized publish()). +// * +// * The eventual close calls a direct flush on the target writer, so all +// * rotates/ends will ultimately be fully flushed. +// * +// * @see java.util.logging.StreamHandler#flush() +// */ +// @Override +// public synchronized void flush() { +// flushCount++; +// if(flushCount==100) { +// super.flush(); +// flushCount=0; +// } +// } +// int flushCount; + +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/GenericReplayCharSequence.java b/src/main/java/org/archive/io/GenericReplayCharSequence.java new file mode 100644 index 00000000..1af3922b --- /dev/null +++ b/src/main/java/org/archive/io/GenericReplayCharSequence.java @@ -0,0 +1,412 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). 
+ * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.nio.CharBuffer; +import java.nio.channels.FileChannel; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.text.NumberFormat; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.io.IOUtils; +import org.archive.util.DevUtils; + +import com.google.common.base.Charsets; +import com.google.common.primitives.Ints; + +/** + * (Replay)CharSequence view on recorded streams. + * + * For small streams, use {@link InMemoryReplayCharSequence}. + * + *

Call {@link close()} on this class when done to clean up resources. + * + * @contributor stack + * @contributor nlevitt + * @version $Revision$, $Date$ + */ +public class GenericReplayCharSequence implements ReplayCharSequence { + + protected static Logger logger = Logger + .getLogger(GenericReplayCharSequence.class.getName()); + + /** + * Name of the encoding we use writing out concatenated decoded prefix + * buffer and decoded backing file. + * + *

This define is also used as suffix for the file that holds the + * decodings. The name of the file that holds the decoding is the name + * of the backing file w/ this encoding for a suffix. + * + *

See Encoding. + */ + public static final Charset WRITE_ENCODING = Charsets.UTF_16BE; + + private static final long MAP_MAX_BYTES = 64 * 1024 * 1024; // 64M + + /** + * When the memory map moves away from the beginning of the file + * (to the "right") in order to reach a certain index, it will + * map up to this many bytes preceding (to the left of) the target character. + * Consequently it will map up to + * MAP_MAX_BYTES - MAP_TARGET_LEFT_PADDING + * bytes to the right of the target. + */ + private static final long MAP_TARGET_LEFT_PADDING_BYTES = (long) (MAP_MAX_BYTES * 0.01); + + /** + * Total length of character stream to replay minus the HTTP headers + * if present. + * + * If the backing file is larger than Integer.MAX_VALUE (i.e. 2gb), + * only the first Integer.MAX_VALUE characters are available through this API. + * We're overriding java.lang.CharSequence so that we can use + * java.util.regex directly on the data, and the CharSequence + * API uses int for the length and index. + */ + protected int length; + + /** counter of decoding exceptions for report at end */ + protected long decodingExceptions = 0; + protected CharacterCodingException codingException = null; + + /** + * Byte offset into the file where the memory mapped portion begins. + */ + private long mapByteOffset; + + // XXX do we need to keep the input stream around? + private FileInputStream backingFileIn = null; + + private FileChannel backingFileChannel = null; + + private long bytesPerChar; + + private CharBuffer mappedBuffer = null; + + /** + * File that has decoded content. + * + * Keep it around so we can remove on close. + */ + private File decodedFile = null; + + /* + * This portion of the CharSequence precedes what's in the backing file. In + * cases where we decodeToFile(), this is always empty, because we decode + * the entire input stream. 
+ */ + private CharBuffer prefixBuffer = null; + + private boolean isOpen = true; + + protected Charset charset = null; + + /** + * Constructor. + * + * @param contentReplayInputStream inputStream of content + * @param charset Encoding to use reading the passed prefix + * buffer and backing file. Must not be null. + * @param backingFilename Path to backing file with content in excess of + * whats in buffer. + * + * @throws IOException + */ + public GenericReplayCharSequence(InputStream contentReplayInputStream, + int prefixMax, + String backingFilename, + Charset charset) throws IOException { + super(); + logger.fine("characterEncoding=" + charset + " backingFilename=" + + backingFilename); + + if(charset==null) { + charset = ReplayCharSequence.FALLBACK_CHARSET; + } + // decodes only up to Integer.MAX_VALUE characters + decode(contentReplayInputStream, prefixMax, backingFilename, charset); + + this.bytesPerChar = 2; + + if(length>prefixBuffer.position()) { + this.backingFileIn = new FileInputStream(decodedFile); + this.backingFileChannel = backingFileIn.getChannel(); + this.mapByteOffset = 0; + updateMemoryMappedBuffer(); + } + } + + private void updateMemoryMappedBuffer() { + long charLength = (long) this.length() - (long) prefixBuffer.limit(); // in characters + long mapSize = Math.min((charLength * bytesPerChar) - mapByteOffset, MAP_MAX_BYTES); + logger.fine("updateMemoryMappedBuffer: mapOffset=" + + NumberFormat.getInstance().format(mapByteOffset) + + " mapSize=" + NumberFormat.getInstance().format(mapSize)); + try { + // TODO: stress-test without these possibly-costly requests! +// System.gc(); +// System.runFinalization(); + // TODO: Confirm the READ_ONLY works. I recall it not working. + // The buffers seem to always say that the buffer is writable. + mappedBuffer = backingFileChannel.map( + FileChannel.MapMode.READ_ONLY, mapByteOffset, mapSize) + .asReadOnlyBuffer().asCharBuffer(); + } catch (IOException e) { + // TODO convert this to a runtime error? 
+ DevUtils.logger.log(Level.SEVERE, + " backingFileChannel.map() mapByteOffset=" + mapByteOffset + + " mapSize=" + mapSize + "\n" + "decodedFile=" + + decodedFile + " length=" + length + "\n" + + DevUtils.extraInfo(), e); + throw new RuntimeException(e); + } + } + + /** + * Converts the first Integer.MAX_VALUE characters from the + * file backingFilename from encoding encoding to + * encoding WRITE_ENCODING and saves as + * this.decodedFile, which is named backingFilename + * + "." + WRITE_ENCODING. + * + * @throws IOException + */ + protected void decode(InputStream inStream, int prefixMax, + String backingFilename, Charset charset) throws IOException { + + this.charset = charset; + + // TODO: consider if BufferedReader is helping any + // TODO: consider adding TBW 'LimitReader' to stop reading at + // Integer.MAX_VALUE characters because of charAt(int) limit + BufferedReader reader = new BufferedReader(new InputStreamReader( + inStream, charset)); + + logger.fine("backingFilename=" + backingFilename + " encoding=" + + charset + " decodedFile=" + decodedFile); + + this.prefixBuffer = CharBuffer.allocate(prefixMax); + + long count = 0; + while(count < prefixMax) { + int read = reader.read(prefixBuffer); + if(read<0) { + break; + } + count += read; + } + + int ch = reader.read(); + if(ch >= 0) { + count++; + + // more to decode to file overflow + this.decodedFile = new File(backingFilename + "." 
+ WRITE_ENCODING); + + FileOutputStream fos; + try { + fos = new FileOutputStream(this.decodedFile); + } catch (FileNotFoundException e) { + // Windows workaround attempt + System.gc(); + System.runFinalization(); + this.decodedFile = new File(decodedFile.getAbsolutePath()+".win"); + logger.info("Windows 'file with a user-mapped section open' " + + "workaround gc/finalization/name-extension performed."); + // try again + fos = new FileOutputStream(this.decodedFile); + } + + Writer writer = new OutputStreamWriter(fos,WRITE_ENCODING); + writer.write(ch); + count += IOUtils.copyLarge(reader, writer); + writer.close(); + reader.close(); + } + + this.length = Ints.saturatedCast(count); + if(count>Integer.MAX_VALUE) { + logger.warning("input stream is longer than Integer.MAX_VALUE=" + + NumberFormat.getInstance().format(Integer.MAX_VALUE) + + " characters -- only first " + + NumberFormat.getInstance().format(Integer.MAX_VALUE) + + " are accessible through this GenericReplayCharSequence"); + } + + logger.fine("decode: decoded " + count + " characters" + + ((decodedFile==null) ? "" + : " ("+(count-prefixBuffer.length())+" to "+decodedFile+")")); + } + + /** + * Get character at passed absolute position. + * @param index Index into content + * @return Character at offset index. 
+ */ + public char charAt(int index) { + if (index < 0 || index >= this.length()) { + throw new IndexOutOfBoundsException("index=" + index + + " - should be between 0 and length()=" + this.length()); + } + + // is it in the buffer + if (index < prefixBuffer.limit()) { + return prefixBuffer.get(index); + } + + // otherwise we gotta get it from disk via memory map + long charFileIndex = (long) index - (long) prefixBuffer.limit(); + long charFileLength = (long) this.length() - (long) prefixBuffer.limit(); // in characters + if (charFileIndex * bytesPerChar < mapByteOffset) { + logger.log(Level.WARNING,"left-fault; probably don't want to use CharSequence that far backward"); + } + if (charFileIndex * bytesPerChar < mapByteOffset + || charFileIndex - (mapByteOffset / bytesPerChar) >= mappedBuffer.limit()) { + // fault + /* + * mapByteOffset is bounded by 0 and file size +/- size of the map, + * and starts as close to fileIndex - + * MAP_TARGET_LEFT_PADDING_BYTES as it can while also not + * being smaller than it needs to be. + */ + mapByteOffset = Math.min(charFileIndex * bytesPerChar - MAP_TARGET_LEFT_PADDING_BYTES, + charFileLength * bytesPerChar - MAP_MAX_BYTES); + mapByteOffset = Math.max(0, mapByteOffset); + updateMemoryMappedBuffer(); + } + + return mappedBuffer.get((int)(charFileIndex-(mapByteOffset/bytesPerChar))); + } + + public CharSequence subSequence(int start, int end) { + return new CharSubSequence(this, start, end); + } + + private void deleteFile(File fileToDelete) { + deleteFile(fileToDelete, null); + } + + private void deleteFile(File fileToDelete, final Exception e) { + if (e != null) { + // Log why the delete to help with debug of + // java.io.FileNotFoundException: + // ....tt53http.ris.UTF-16BE. 
+ logger.severe("Deleting " + fileToDelete + " because of " + + e.toString()); + } + if (fileToDelete != null && fileToDelete.exists()) { + logger.fine("deleting file: " + fileToDelete); + fileToDelete.delete(); + } + } + + + @Override + public boolean isOpen() { + return this.isOpen; + } + + public void close() throws IOException { + this.isOpen = false; + + logger.fine("closing"); + + if (this.backingFileChannel != null && this.backingFileChannel.isOpen()) { + this.backingFileChannel.close(); + } + if (backingFileIn != null) { + backingFileIn.close(); + } + + deleteFile(this.decodedFile); + + // clear decodedFile -- so that double-close (as in finalize()) won't + // delete a later instance with same name see bug [ 1218961 ] + // "failed get of replay" in ExtractorHTML... usu: UTF-16BE + this.decodedFile = null; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#finalize() + */ + protected void finalize() throws Throwable { + super.finalize(); + logger.fine("finalizing"); + close(); + } + + /** + * Convenience method for getting a substring. 
+ * + * @deprecated please use subSequence() and then toString() directly + */ + public String substring(int offset, int len) { + return subSequence(offset, offset + len).toString(); + } + + public String toString() { + StringBuilder sb = new StringBuilder(this.length()); + sb.append(this); + return sb.toString(); + } + + public int length() { + return length; + } + + /* (non-Javadoc) + * @see org.archive.io.ReplayCharSequence#getDecodeExceptionCount() + */ + @Override + public long getDecodeExceptionCount() { + return decodingExceptions; + } + + + /* (non-Javadoc) + * @see org.archive.io.ReplayCharSequence#getCodingException() + */ + @Override + public CharacterCodingException getCodingException() { + return codingException; + } + + /* (non-Javadoc) + * @see org.archive.io.ReplayCharSequence#getCharset() + */ + public Charset getCharset() { + return charset; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/GzipHeader.java b/src/main/java/org/archive/io/GzipHeader.java new file mode 100644 index 00000000..6b8263bc --- /dev/null +++ b/src/main/java/org/archive/io/GzipHeader.java @@ -0,0 +1,26 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.io; + +/** + * @deprecated use {@link org.archive.util.zip.GzipHeader} + */ +@Deprecated +public class GzipHeader extends org.archive.util.zip.GzipHeader { +} diff --git a/src/main/java/org/archive/io/HeaderedArchiveRecord.java b/src/main/java/org/archive/io/HeaderedArchiveRecord.java new file mode 100644 index 00000000..3cce595b --- /dev/null +++ b/src/main/java/org/archive/io/HeaderedArchiveRecord.java @@ -0,0 +1,423 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.PrintStream; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpParser; +import org.apache.commons.httpclient.StatusLine; +import org.apache.commons.httpclient.util.EncodingUtil; +import org.archive.io.arc.ARCConstants; +import org.archive.util.LaxHttpParser; + +/** + * An ArchiveRecord whose content has a preamble of RFC822-like headers: e.g. + * The ArchiveRecord is a http response that leads off with http response + * headers. Use this ArchiveRecord Decorator to get at the content headers and + * the header/content demarcation. 
+ * + * @author stack + * @author Olaf Freyer + */ +public class HeaderedArchiveRecord extends ArchiveRecord { + private int contentHeadersLength = -1; + private int statusCode = -1; + + /** + * Http header bytes. + * + * If non-null and bytes available, give out its contents before we + * go back to the underlying stream. + */ + private InputStream contentHeaderStream = null; + + /** + * Content headers. + * + * Only available after the reading of headers. + */ + private Header [] contentHeaders = null; + + + public HeaderedArchiveRecord(final ArchiveRecord ar) throws IOException { + super(ar); + } + + public HeaderedArchiveRecord(final ArchiveRecord ar, + final boolean readContentHeader) throws IOException { + super(ar); + if (readContentHeader) { + this.contentHeaderStream = readContentHeaders(); + } + } + + /** + * Skip over the the content headers if present. + * + * Subsequent reads will get the body. + * + *

Calling this method in the midst of reading the header + * will make for strange results. Otherwise, safe to call + * at any time though before reading any of the record + * content is only time that it makes sense. + * + *

After calling this method, you can call + * {@link #getContentHeaders()} to get the read http header. + * + * @throws IOException + */ + public void skipHttpHeader() throws IOException { + if (this.contentHeaderStream == null) { + return; + } + // Empty the contentHeaderStream + for (int available = this.contentHeaderStream.available(); + this.contentHeaderStream != null + && (available = this.contentHeaderStream.available()) > 0;) { + // We should be in this loop once only we should only do this + // buffer allocation once. + byte[] buffer = new byte[available]; + // The read nulls out httpHeaderStream when done with it so + // need check for null in the loop control line. + read(buffer, 0, available); + } + } + + public void dumpHttpHeader() throws IOException { + dumpHttpHeader(System.out); + } + + public void dumpHttpHeader(final PrintStream stream) throws IOException { + if (this.contentHeaderStream == null) { + return; + } + // Dump the httpHeaderStream to STDOUT + for (int available = this.contentHeaderStream.available(); + this.contentHeaderStream != null + && (available = this.contentHeaderStream.available()) > 0;) { + // We should be in this loop only once and should do this + // buffer allocation once. + byte[] buffer = new byte[available]; + // The read nulls out httpHeaderStream when done with it so + // need check for null in the loop control line. + int read = read(buffer, 0, available); + stream.write(buffer, 0, read); + } + } + + /** + * Read header if present. Technique borrowed from HttpClient HttpParse + * class. Using http parser code for now. Later move to more generic header + * parsing code if there proves a need. + * + * @return ByteArrayInputStream with the http header in it or null if no + * http header. + * @throws IOException + */ + private InputStream readContentHeaders() throws IOException { + // If judged a record that doesn't have an http header, return + // immediately. 
+ if (!hasContentHeaders()) { + return null; + } + byte [] statusBytes = LaxHttpParser.readRawLine(getIn()); + int eolCharCount = getEolCharsCount(statusBytes); + if (eolCharCount <= 0) { + throw new IOException("Failed to read raw lie where one " + + " was expected: " + new String(statusBytes)); + } + String statusLine = EncodingUtil.getString(statusBytes, 0, + statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); + if (statusLine == null) { + throw new NullPointerException("Expected status line is null"); + } + // TODO: Tighten up this test. + boolean isHttpResponse = StatusLine.startsWithHTTP(statusLine); + boolean isHttpRequest = false; + if (!isHttpResponse) { + isHttpRequest = statusLine.toUpperCase().startsWith("GET") || + !statusLine.toUpperCase().startsWith("POST"); + } + if (!isHttpResponse && !isHttpRequest) { + throw new UnexpectedStartLineIOException("Failed parse of " + + "status line: " + statusLine); + } + this.statusCode = isHttpResponse? + (new StatusLine(statusLine)).getStatusCode(): -1; + + // Save off all bytes read. Keep them as bytes rather than + // convert to strings so we don't have to worry about encodings + // though this should never be a problem doing http headers since + // its all supposed to be ascii. + ByteArrayOutputStream baos = + new ByteArrayOutputStream(statusBytes.length + 4 * 1024); + baos.write(statusBytes); + + // Now read rest of the header lines looking for the separation + // between header and body. + for (byte [] lineBytes = null; true;) { + lineBytes = LaxHttpParser.readRawLine(getIn()); + eolCharCount = getEolCharsCount(lineBytes); + if (eolCharCount <= 0) { + throw new IOException("Failed reading headers: " + + ((lineBytes != null)? new String(lineBytes): null)); + } + // Save the bytes read. + baos.write(lineBytes); + if ((lineBytes.length - eolCharCount) <= 0) { + // We've finished reading the http header. 
+ break; + } + } + + byte [] headerBytes = baos.toByteArray(); + // Save off where content body, post content headers, starts. + this.contentHeadersLength = headerBytes.length; + ByteArrayInputStream bais = + new ByteArrayInputStream(headerBytes); + if (!bais.markSupported()) { + throw new IOException("ByteArrayInputStream does not support mark"); + } + bais.mark(headerBytes.length); + // Read the status line. Don't let it into the parseHeaders function. + // It doesn't know what to do with it. + bais.read(statusBytes, 0, statusBytes.length); + this.contentHeaders = LaxHttpParser.parseHeaders(bais, + ARCConstants.DEFAULT_ENCODING); + bais.reset(); + return bais; + } + + public static class UnexpectedStartLineIOException + extends RecoverableIOException { + private static final long serialVersionUID = 1L; + + public UnexpectedStartLineIOException(final String reason) { + super(reason); + } + } + + /** + * @param bytes Array of bytes to examine for an EOL. + * @return Count of end-of-line characters or zero if none. + */ + private int getEolCharsCount(byte [] bytes) { + int count = 0; + if (bytes != null && bytes.length >=1 && + bytes[bytes.length - 1] == '\n') { + count++; + if (bytes.length >=2 && bytes[bytes.length -2] == '\r') { + count++; + } + } + return count; + } + + /** + * @return If headers are for a http response AND the headers have been + * read, return status code. Else return -1. + */ + public int getStatusCode() { + return this.statusCode; + } + + /** + * @return Returns length of content headers or -1 if headers have + * not yet been read. + */ + public int getContentHeadersLength() { + return this.contentHeadersLength; + } + + public Header[] getContentHeaders() { + return contentHeaders; + } + + /** + * @return Next character in this ARCRecord's content else -1 if at end of + * this record. 
+ * @throws IOException + */ + public int read() throws IOException { + int c = -1; + if (this.contentHeaderStream != null && + (this.contentHeaderStream.available() > 0)) { + // If http header, return bytes from it before we go to underlying + // stream. + c = this.contentHeaderStream.read(); + // If done with the header stream, null it out. + if (this.contentHeaderStream.available() <= 0) { + this.contentHeaderStream = null; + } + // do not increment position - + // the underlying ArchiveRecord stream allready did this + // incrementPosition(); + } else { + c = super.read(); + } + return c; + } + + public int read(byte [] b, int offset, int length) throws IOException { + int read = -1; + if (this.contentHeaderStream != null && + (this.contentHeaderStream.available() > 0)) { + // If http header, return bytes from it before we go to underlying + // stream. + read = Math.min(length, this.contentHeaderStream.available()); + if (read == 0) { + read = -1; + } else { + read = this.contentHeaderStream.read(b, offset, read); + } + // If done with the header stream, null it out. 
+ if (this.contentHeaderStream.available() <= 0) { + this.contentHeaderStream = null; + } + // do not increment position - + // the underlying ArchiveRecord stream allready did this + //incrementPosition(); + } else { + read = super.read(b, offset, length); + } + return read; + } + + @Override + public int available() { + return ((ArchiveRecord)this.in).available(); + } + + @Override + public void close() throws IOException { + ((ArchiveRecord)this.in).close(); + } + + @Override + public void dump() throws IOException { + ((ArchiveRecord)this.in).dump(); + } + + @Override + public void dump(OutputStream os) throws IOException { + ((ArchiveRecord)this.in).dump(os); + } + + @Override + protected String getDigest4Cdx(ArchiveRecordHeader h) { + return ((ArchiveRecord)this.in).getDigest4Cdx(h); + } + + @Override + public String getDigestStr() { + return ((ArchiveRecord)this.in).getDigestStr(); + } + + @Override + public ArchiveRecordHeader getHeader() { + return ((ArchiveRecord)this.in).getHeader(); + } + + @Override + protected String getIp4Cdx(ArchiveRecordHeader h) { + return ((ArchiveRecord)this.in).getIp4Cdx(h); + } + + @Override + protected String getMimetype4Cdx(ArchiveRecordHeader h) { + return ((ArchiveRecord)this.in).getMimetype4Cdx(h); + } + + @Override + public long getPosition() { + return ((ArchiveRecord)this.in).getPosition(); + } + + @Override + protected String getStatusCode4Cdx(ArchiveRecordHeader h) { + return ((ArchiveRecord)this.in).getStatusCode4Cdx(h); + } + + @Override + public boolean hasContentHeaders() { + return ((ArchiveRecord)this.in).hasContentHeaders(); + } + + @Override + protected void incrementPosition() { + ((ArchiveRecord)this.in).incrementPosition(); + } + + @Override + protected void incrementPosition(long incr) { + ((ArchiveRecord)this.in).incrementPosition(incr); + } + + @Override + protected boolean isEor() { + return ((ArchiveRecord)this.in).isEor(); + } + + @Override + public boolean isStrict() { + return 
((ArchiveRecord)this.in).isStrict(); + } + + @Override + public boolean markSupported() { + return ((ArchiveRecord)this.in).markSupported(); + } + + @Override + protected String outputCdx(String strippedFileName) throws IOException { + return ((ArchiveRecord)this.in).outputCdx(strippedFileName); + } + + @Override + protected void setEor(boolean eor) { + ((ArchiveRecord)this.in).setEor(eor); + } + + @Override + protected void setHeader(ArchiveRecordHeader header) { + ((ArchiveRecord)this.in).setHeader(header); + } + + @Override + public void setStrict(boolean strict) { + ((ArchiveRecord)this.in).setStrict(strict); + } + + @Override + protected void skip() throws IOException { + ((ArchiveRecord)this.in).skip(); + } + + @Override + public long skip(long n) throws IOException { + return ((ArchiveRecord)this.in).skip(n); + } +} diff --git a/src/main/java/org/archive/io/LoudObjectOutputStream.java b/src/main/java/org/archive/io/LoudObjectOutputStream.java new file mode 100644 index 00000000..959c2620 --- /dev/null +++ b/src/main/java/org/archive/io/LoudObjectOutputStream.java @@ -0,0 +1,63 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * ObjectOutputStream that logs class name of each object that is written
 * to the stream. Useful for tracking down sources of NotSerializableException.
 *
 * @author pjack
 */
public class LoudObjectOutputStream extends ObjectOutputStream {

    private static final Logger LOGGER = Logger.getLogger(
        LoudObjectOutputStream.class.getName());

    /** Class names already reported; each name is logged only once. */
    private final Set<String> alreadyLogged = new HashSet<String>();

    /**
     * @param out stream the serialized form is written to
     * @throws IOException if the stream header cannot be written
     */
    public LoudObjectOutputStream(OutputStream out) throws IOException {
        super(out);
        // Without this, replaceObject() below is never consulted.
        this.enableReplaceObject(true);
    }

    /**
     * Logs the class name of {@code obj} the first time that class is seen.
     *
     * @param obj object about to be written, may be null
     * @return {@code obj} itself; nothing is ever actually replaced
     */
    @Override
    protected Object replaceObject(Object obj) throws IOException {
        if (obj != null) {
            String name = obj.getClass().getName();
            // add() is false for a name that was already reported.
            if (alreadyLogged.add(name)) {
                LOGGER.info("WROTE: " + name);
            }
        }
        return obj;
    }
}
/**
 * Filter stream that keeps a running total of the bytes written through it
 * and can be told to swallow flush() requests.
 *
 * @contributor gojomo
 */
public class MiserOutputStream extends FilterOutputStream {
    /** Number of bytes written so far. */
    protected long count;
    /** When false, {@link #flush()} does nothing. */
    protected boolean passFlushes;

    /**
     * Wraps {@code out}, counting bytes and passing flushes through.
     *
     * @param out the output stream to be wrapped
     */
    public MiserOutputStream(OutputStream out) {
        this(out, true);
    }

    /**
     * Wraps {@code out}, counting bytes.
     *
     * @param out the output stream to be wrapped
     * @param passFlushes whether flush() calls reach the wrapped stream
     */
    public MiserOutputStream(OutputStream out, boolean passFlushes) {
        super(out);
        this.passFlushes = passFlushes;
    }

    /** Returns the number of bytes written. */
    public long getCount() {
        return this.count;
    }

    @Override
    public void write(int b) throws IOException {
        this.out.write(b);
        this.count += 1;
    }

    @Override
    public void write(byte[] b, int off, int len) throws IOException {
        // Hand the whole range to the wrapped stream in one call.
        this.out.write(b, off, len);
        this.count += len;
    }

    @Override
    public void flush() throws IOException {
        if (!this.passFlushes) {
            return;
        }
        super.flush();
    }

    @Override
    public void close() throws IOException {
        // Re-enable flushing before delegating to the superclass close.
        this.passFlushes = true;
        super.close();
    }
}
+ * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +/** + * @deprecated use {@link org.archive.util.zip.NoGzipMagicException} + */ +@Deprecated +public class NoGzipMagicException extends org.archive.util.zip.NoGzipMagicException { +} diff --git a/src/main/java/org/archive/io/ObjectPlusFilesInputStream.java b/src/main/java/org/archive/io/ObjectPlusFilesInputStream.java new file mode 100644 index 00000000..892860ed --- /dev/null +++ b/src/main/java/org/archive/io/ObjectPlusFilesInputStream.java @@ -0,0 +1,143 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.io; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.ObjectInputStream; +import java.util.Iterator; +import java.util.LinkedList; + +import org.archive.util.FileUtils; + + +/** + * Enhanced ObjectOutputStream with support for restoring + * files that had been saved, in parallel with object + * serialization. + * + * @author gojomo + * + */ +public class ObjectPlusFilesInputStream extends ObjectInputStream { + protected LinkedList auxiliaryDirectoryStack = new LinkedList(); + protected LinkedList postRestoreTasks = new LinkedList(); + + /** + * Instantiate over the given stream and using the supplied + * auxiliary storage directory. + * + * @param in + * @param storeDir + * @throws IOException + */ + public ObjectPlusFilesInputStream(InputStream in, File storeDir) + throws IOException { + super(in); + auxiliaryDirectoryStack.addFirst(storeDir); + } + + /** + * Push another default storage directory for use + * until popped. + * + * @param dir + */ + public void pushAuxiliaryDirectory(String dir) { + auxiliaryDirectoryStack. + addFirst(new File(getAuxiliaryDirectory(), dir)); + } + + /** + * Discard the top auxiliary directory. + */ + public void popAuxiliaryDirectory() { + auxiliaryDirectoryStack.removeFirst(); + } + + /** + * Return the top auxiliary directory, from + * which saved files are restored. + * + * @return Auxillary directory. + */ + public File getAuxiliaryDirectory() { + return (File)auxiliaryDirectoryStack.getFirst(); + } + + /** + * Restore a file from storage, using the name and length + * info on the serialization stream and the file from the + * current auxiliary directory, to the given File. 
+ * + * @param destination + * @throws IOException + */ + public void restoreFile(File destination) throws IOException { + String nameAsStored = readUTF(); + long lengthAtStoreTime = readLong(); + File storedFile = new File(getAuxiliaryDirectory(),nameAsStored); + FileUtils.copyFile(storedFile, destination, lengthAtStoreTime); + } + + /** + * Restore a file from storage, using the name and length + * info on the serialization stream and the file from the + * current auxiliary directory, to the given File. + * + * @param directory + * @throws IOException + */ + public void restoreFileTo(File directory) throws IOException { + String nameAsStored = readUTF(); + long lengthAtStoreTime = readLong(); + File storedFile = new File(getAuxiliaryDirectory(),nameAsStored); + File destination = new File(directory,nameAsStored); + FileUtils.copyFile(storedFile, destination, lengthAtStoreTime); + } + + /** + * Register a task to be done when the ObjectPlusFilesInputStream + * is closed. + * + * @param task + */ + public void registerFinishTask(Runnable task) { + postRestoreTasks.addFirst(task); + } + + private void doFinishTasks() { + Iterator iter = postRestoreTasks.iterator(); + while(iter.hasNext()) { + ((Runnable)iter.next()).run(); + } + } + + /** + * In addition to default, do any registered cleanup tasks. + * + * @see java.io.InputStream#close() + */ + public void close() throws IOException { + super.close(); + doFinishTasks(); + } +} diff --git a/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java b/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java new file mode 100644 index 00000000..224f24e7 --- /dev/null +++ b/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java @@ -0,0 +1,134 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. 
+ * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.File; +import java.io.IOException; +import java.io.ObjectOutputStream; +import java.io.OutputStream; +import java.util.LinkedList; + +import org.archive.util.FileUtils; + + +/** + * Enhanced ObjectOutputStream which maintains (a stack of) auxiliary + * directories and offers convenience methods for serialized objects + * to save their related disk files alongside their serialized version. + * + * @author gojomo + */ +public class ObjectPlusFilesOutputStream extends ObjectOutputStream { + protected LinkedList auxiliaryDirectoryStack = new LinkedList(); + + /** + * Constructor + * + * @param out + * @param topDirectory + * @throws java.io.IOException + */ + public ObjectPlusFilesOutputStream(OutputStream out, File topDirectory) throws IOException { + super(out); + auxiliaryDirectoryStack.addFirst(topDirectory); + } + + /** + * Add another subdirectory for any file-capture needs during the + * current serialization. + * + * @param dir + */ + public void pushAuxiliaryDirectory(String dir) { + auxiliaryDirectoryStack.addFirst(new File(getAuxiliaryDirectory(),dir)); + } + + /** + * Remove the top subdirectory. + * + */ + public void popAuxiliaryDirectory() { + auxiliaryDirectoryStack.removeFirst(); + } + + /** + * Return the current auxiliary directory for storing + * files associated with serialized objects. + * + * @return Auxillary directory. 
+ */ + public File getAuxiliaryDirectory() { + return (File)auxiliaryDirectoryStack.getFirst(); + } + + /** + * Store a snapshot of an object's supporting file to the + * current auxiliary directory. Should only be used for + * files which are strictly appended-to, because it tries + * to use a "hard link" where possible (meaning that + * future edits to the original file's contents will + * also affect the snapshot). + * + * Remembers current file extent to allow a future restore + * to ignore subsequent appended data. + * + * @param file + * @throws IOException + */ + public void snapshotAppendOnlyFile(File file) throws IOException { + // write filename + String name = file.getName(); + writeUTF(name); + // write current file length + writeLong(file.length()); + File auxDir = getAuxiliaryDirectory(); + if(!auxDir.exists()) { + FileUtils.ensureWriteableDirectory(auxDir); + } + File destination = new File(auxDir,name); + hardlinkOrCopy(file, destination); + } + + /** + * Create a backup of this given file, first by trying a "hard + * link", then by using a copy if hard linking is unavailable + * (either because it is unsupported or the origin and checkpoint + * directories are on different volumes). + * + * @param file + * @param destination + * @throws IOException + */ + private void hardlinkOrCopy(File file, File destination) throws IOException { + // For Linux/UNIX, try a hard link first. 
+ Process link = Runtime.getRuntime().exec("ln "+file.getAbsolutePath()+" "+destination.getAbsolutePath()); + // TODO NTFS also supports hard links; add appropriate try + try { + link.waitFor(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if(link.exitValue()!=0) { + // hard link failed + FileUtils.copyFile(file,destination); + } + } + +} diff --git a/src/main/java/org/archive/io/OriginSeekInputStream.java b/src/main/java/org/archive/io/OriginSeekInputStream.java new file mode 100644 index 00000000..00605d82 --- /dev/null +++ b/src/main/java/org/archive/io/OriginSeekInputStream.java @@ -0,0 +1,121 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +import java.io.IOException; + + +/** + * Alters the origin of some other SeekInputStream. This class allows you + * to completely ignore everything in the underlying stream before a specified + * position, the origin position. + * + *

With the exception of {@link #position()} and {@link position(long)}, + * all of the methods in this class simply delegate to the underlying input + * stream. The position methods adjust the position of the + * underlying stream relative to the origin specified at construction time. + * + * @author pjack + */ +public class OriginSeekInputStream extends SeekInputStream { + + + /** + * The underlying stream. + */ + final private SeekInputStream input; + + + /** + * The origin position. In other words, this.position(0) + * resolves to input.position(start). + */ + final private long origin; + + + /** + * Constructor. + * + * @param input the underlying stream + * @param origin the origin position + * @throws IOException if an IO error occurs + */ + public OriginSeekInputStream(SeekInputStream input, long origin) + throws IOException { + this.input = input; + this.origin = origin; + input.position(origin); + } + + + @Override + public int available() throws IOException { + return input.available(); + } + + + @Override + public int read() throws IOException { + return input.read(); + } + + + @Override + public int read(byte[] buf, int ofs, int len) throws IOException { + return input.read(buf, ofs, len); + } + + + @Override + public int read(byte[] buf) throws IOException { + return input.read(buf); + } + + + @Override + public long skip(long count) throws IOException { + return input.skip(count); + } + + + /** + * Returns the position of the underlying stream relative to the origin. + * + * @return the relative position + * @throws IOException if an IO error occurs + */ + public long position() throws IOException { + return input.position() - origin; + } + + + /** + * Positions the underlying stream relative to the origin. + * In other words, this.position(0) resolves to input.position(origin), + * where input is underlying stream and origin is the origin specified + * at construction time. 
+ * + * @param p the new position for this stream + * @throws IOException if an IO error occurs + */ + public void position(long p) throws IOException { + input.position(p + origin); + } +} diff --git a/src/main/java/org/archive/io/Preformatter.java b/src/main/java/org/archive/io/Preformatter.java new file mode 100644 index 00000000..dcd31bb6 --- /dev/null +++ b/src/main/java/org/archive/io/Preformatter.java @@ -0,0 +1,32 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.util.logging.LogRecord; + +/** + * Interface indicating a logging Formatter can preformat a record (outside + * the standard-implementation synchronized block) and cache it, returning it + * for the next request for formatting from the same thread. + * @contributor gojomo + */ +public interface Preformatter { + public void preformat(LogRecord record); + public void clear(); +} diff --git a/src/main/java/org/archive/io/RandomAccessInputStream.java b/src/main/java/org/archive/io/RandomAccessInputStream.java new file mode 100644 index 00000000..d8dd260b --- /dev/null +++ b/src/main/java/org/archive/io/RandomAccessInputStream.java @@ -0,0 +1,180 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). 
+ * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; + + +/** + * Wraps a RandomAccessFile with an InputStream interface. + * + * @author gojomo + */ +public class RandomAccessInputStream extends SeekInputStream { + + /** + * Reference to the random access file this stream is reading from. + */ + private RandomAccessFile raf = null; + + /** + * When mark is called, save here the current position so we can go back + * on reset. + */ + private long markpos = -1; + + /** + * True if we are to close the underlying random access file when this + * stream is closed. + */ + private boolean sympathyClose; + + /** + * Constructor. + * + * If using this constructor, caller created the RAF and therefore + * its assumed wants to control close of the RAF. The RAF.close + * is not called if this constructor is used on close of this stream. + * + * @param raf RandomAccessFile to wrap. + * @throws IOException + */ + public RandomAccessInputStream(RandomAccessFile raf) + throws IOException { + this(raf, false, 0); + } + + /** + * Constructor. + * + * @param file File to get RAFIS on. Creates an RAF from passed file. + * Closes the created RAF when this stream is closed. 
+ * @throws IOException + */ + public RandomAccessInputStream(final File file) + throws IOException { + this(new RandomAccessFile(file, "r"), true, 0); + } + + /** + * Constructor. + * + * @param file File to get RAFIS on. Creates an RAF from passed file. + * Closes the created RAF when this stream is closed. + * @param offset + * @throws IOException + */ + public RandomAccessInputStream(final File file, final long offset) + throws IOException { + this(new RandomAccessFile(file, "r"), true, offset); + } + + /** + * @param raf RandomAccessFile to wrap. + * @param sympathyClose Set to true if we are to close the RAF + * file when this stream is closed. + * @param offset + * @throws IOException + */ + public RandomAccessInputStream(final RandomAccessFile raf, + final boolean sympathyClose, final long offset) + throws IOException { + super(); + this.sympathyClose = sympathyClose; + this.raf = raf; + if (offset > 0) { + this.raf.seek(offset); + } + } + + /* (non-Javadoc) + * @see java.io.InputStream#read() + */ + public int read() throws IOException { + return this.raf.read(); + } + + /* (non-Javadoc) + * @see java.io.InputStream#read(byte[], int, int) + */ + public int read(byte[] b, int off, int len) throws IOException { + return this.raf.read(b, off, len); + } + + /* (non-Javadoc) + * @see java.io.InputStream#read(byte[]) + */ + public int read(byte[] b) throws IOException { + return this.raf.read(b); + } + + /* (non-Javadoc) + * @see java.io.InputStream#skip(long) + */ + public long skip(long n) throws IOException { + this.raf.seek(this.raf.getFilePointer() + n); + return n; + } + + public long position() throws IOException { + return this.raf.getFilePointer(); + } + + public void position(long position) throws IOException { + this.raf.seek(position); + } + + public int available() throws IOException { + long amount = this.raf.length() - this.position(); + return (amount >= Integer.MAX_VALUE)? 
Integer.MAX_VALUE: (int)amount; + } + + public boolean markSupported() { + return true; + } + + public synchronized void mark(int readlimit) { + try { + this.markpos = position(); + } catch (IOException e) { + // Set markpos to -1. Will cause exception reset. + this.markpos = -1; + } + } + + public synchronized void reset() throws IOException { + if (this.markpos == -1) { + throw new IOException("Mark has not been set."); + } + position(this.markpos); + } + + public void close() throws IOException { + try { + super.close(); + } finally { + if (this.sympathyClose) { + this.raf.close(); + } + } + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/RandomAccessOutputStream.java b/src/main/java/org/archive/io/RandomAccessOutputStream.java new file mode 100644 index 00000000..225f995f --- /dev/null +++ b/src/main/java/org/archive/io/RandomAccessOutputStream.java @@ -0,0 +1,69 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.RandomAccessFile; + + +/** + * Wraps a RandomAccessFile with OutputStream interface. 
/**
 * OutputStream view of a RandomAccessFile: every call is forwarded to the
 * wrapped file.
 *
 * @author gojomo
 */
public class RandomAccessOutputStream extends OutputStream {
    /** The file all writes go to. */
    protected RandomAccessFile raf;

    /**
     * Wrap the given RandomAccessFile
     *
     * @param raf file to write to; closed when this stream is closed
     */
    public RandomAccessOutputStream(RandomAccessFile raf) {
        this.raf = raf;
    }

    /** Writes a single byte at the file's current pointer. */
    @Override
    public void write(int b) throws IOException {
        this.raf.write(b);
    }

    /** Writes the whole array at the file's current pointer. */
    @Override
    public void write(byte[] b) throws IOException {
        this.raf.write(b);
    }

    /** Writes {@code len} bytes of {@code b}, starting at {@code off}. */
    @Override
    public void write(byte[] b, int off, int len) throws IOException {
        this.raf.write(b, off, len);
    }

    /** Closes the wrapped file. */
    @Override
    public void close() throws IOException {
        this.raf.close();
    }
}
/**
 * Interface for objects that can provide a {@link Reader} view of their
 * contents.
 */
public interface ReadSource {
    /**
     * Obtain a Reader. Not named 'getReader' so that it is not
     * considered a simple costless read-only property by
     * bean-convention introspection tools.
     *
     * @return a Reader on this object
     */
    Reader obtainReader();
}
/**
 * Base type for the I/O failures raised by the recording streams in this
 * package.
 *
 * @author Gordon Mohr
 */
public class RecorderIOException extends IOException {

    private static final long serialVersionUID = 5907470275350314277L;

    /** Creates an exception with no detail message. */
    public RecorderIOException() {
        super();
    }

    /**
     * Creates an exception with the given detail message.
     *
     * @param msg detail message
     */
    public RecorderIOException(String msg) {
        super(msg);
    }

    /**
     * Creates an exception with the given detail message and cause, so that
     * the triggering failure is not lost when it is translated.
     *
     * @param msg detail message
     * @param cause underlying failure; may be null
     */
    public RecorderIOException(String msg, Throwable cause) {
        super(msg, cause);
    }

    /**
     * Creates an exception that wraps the given cause.
     *
     * @param cause underlying failure; may be null
     */
    public RecorderIOException(Throwable cause) {
        super(cause);
    }
}
+ * + * @author Gordon Mohr + */ +public class RecorderLengthExceededException +extends RecorderIOException { + + private static final long serialVersionUID = 6655419033414648444L; + + public RecorderLengthExceededException() { + super(); + } + + public RecorderLengthExceededException(String msg) { + super(msg); + } +} diff --git a/src/main/java/org/archive/io/RecorderTimeoutException.java b/src/main/java/org/archive/io/RecorderTimeoutException.java new file mode 100644 index 00000000..32be5b5d --- /dev/null +++ b/src/main/java/org/archive/io/RecorderTimeoutException.java @@ -0,0 +1,37 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +/** + * Indicates a timeout thrown by the RecordingInputStream. 
+ * + * @author Gordon Mohr + */ +public class RecorderTimeoutException extends RecorderIOException { + + private static final long serialVersionUID = 7433214063765078269L; + + public RecorderTimeoutException() { + super(); + } + + public RecorderTimeoutException(String msg) { + super(msg); + } +} diff --git a/src/main/java/org/archive/io/RecorderTooMuchHeaderException.java b/src/main/java/org/archive/io/RecorderTooMuchHeaderException.java new file mode 100644 index 00000000..23f5d264 --- /dev/null +++ b/src/main/java/org/archive/io/RecorderTooMuchHeaderException.java @@ -0,0 +1,40 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.io; + + +/** + * Indicates a too much header material exception thrown by the Recorder + * (specificially the RecordingOutputStream) + * + * @author Gordon Mohr + */ +public class RecorderTooMuchHeaderException +extends RecorderIOException { + + private static final long serialVersionUID = 3528516034898129150L; + + public RecorderTooMuchHeaderException() { + super(); + } + + public RecorderTooMuchHeaderException(String msg) { + super(msg); + } +} diff --git a/src/main/java/org/archive/io/RecordingInputStream.java b/src/main/java/org/archive/io/RecordingInputStream.java new file mode 100644 index 00000000..b46905ed --- /dev/null +++ b/src/main/java/org/archive/io/RecordingInputStream.java @@ -0,0 +1,355 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.IOException; +import java.io.InputStream; +import java.net.SocketException; +import java.net.SocketTimeoutException; +import java.security.MessageDigest; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.io.IOUtils; + + +/** + * Stream which records all data read from it, which it acquires from a wrapped + * input stream. 
+ * + * Makes use of a RecordingOutputStream for recording because of its being + * file backed so we can write massive amounts of data w/o worrying about + * overflowing memory. + * + * @author gojomo + * + */ +public class RecordingInputStream + extends InputStream { + + protected static Logger logger = + Logger.getLogger("org.archive.io.RecordingInputStream"); + + /** + * Where we are recording to. + */ + private RecordingOutputStream recordingOutputStream; + + /** + * Stream to record. + */ + private InputStream in = null; + + /** + * Reusable buffer to avoid reallocation on each readFullyUntil + */ + protected byte[] drainBuffer = new byte[16*1024]; + + /** + * Create a new RecordingInputStream. + * + * @param bufferSize Size of buffer to use. + * @param backingFilename Name of backing file. + */ + public RecordingInputStream(int bufferSize, String backingFilename) + { + this.recordingOutputStream = new RecordingOutputStream(bufferSize, + backingFilename); + } + + public void open(InputStream wrappedStream) throws IOException { + logger.fine(Thread.currentThread().getName() + " opening " + + wrappedStream + ", " + Thread.currentThread().getName()); + if(isOpen()) { + // error; should not be opening/wrapping in an unclosed + // stream remains open + throw new IOException("RIS already open for " + +Thread.currentThread().getName()); + } + try { + this.in = wrappedStream; + this.recordingOutputStream.open(); + } catch (IOException ioe) { + close(); // ...and rethrow... 
+ throw ioe; + } + } + + public int read() throws IOException { + if (!isOpen()) { + throw new IOException("Stream closed " + + Thread.currentThread().getName()); + } + int b = this.in.read(); + if (b != -1) { + assert this.recordingOutputStream != null: "ROS is null " + + Thread.currentThread().getName(); + this.recordingOutputStream.write(b); + } + return b; + } + + public int read(byte[] b, int off, int len) throws IOException { + if (!isOpen()) { + throw new IOException("Stream closed " + + Thread.currentThread().getName()); + } + int count = this.in.read(b,off,len); + if (count > 0) { + assert this.recordingOutputStream != null: "ROS is null " + + Thread.currentThread().getName(); + this.recordingOutputStream.write(b,off,count); + } + return count; + } + + public int read(byte[] b) throws IOException { + if (!isOpen()) { + throw new IOException("Stream closed " + + Thread.currentThread().getName()); + } + int count = this.in.read(b); + if (count > 0) { + assert this.recordingOutputStream != null: "ROS is null " + + Thread.currentThread().getName(); + this.recordingOutputStream.write(b,0,count); + } + return count; + } + + public void close() throws IOException { + if (logger.isLoggable(Level.FINE)) { + logger.fine(Thread.currentThread().getName() + " closing " + + this.in + ", " + Thread.currentThread().getName()); + } + IOUtils.closeQuietly(this.in); + this.in = null; + IOUtils.closeQuietly(this.recordingOutputStream); + } + + public ReplayInputStream getReplayInputStream() throws IOException { + return this.recordingOutputStream.getReplayInputStream(); + } + + public ReplayInputStream getMessageBodyReplayInputStream() throws IOException { + return this.recordingOutputStream.getMessageBodyReplayInputStream(); + } + + public long readFully() throws IOException { + while(read(drainBuffer) != -1) { + // Empty out stream. 
+ continue; + } + return this.recordingOutputStream.getSize(); + } + + /** + * Read all of a stream (Or read until we timeout or have read to the max). + * @param softMaxLength Maximum length to read; if zero or < 0, then no + * limit. If met, return normally. + * @param hardMaxLength Maximum length to read; if zero or < 0, then no + * limit. If exceeded, throw RecorderLengthExceededException + * @param timeout Timeout in milliseconds for total read; if zero or + * negative, timeout is Long.MAX_VALUE. If exceeded, throw + * RecorderTimeoutException + * @param maxBytesPerMs How many bytes per millisecond. + * @throws IOException failed read. + * @throws RecorderLengthExceededException + * @throws RecorderTimeoutException + * @throws InterruptedException + */ + public void readFullyOrUntil(long softMaxLength) + throws IOException, RecorderLengthExceededException, + RecorderTimeoutException, InterruptedException { + // Check we're open before proceeding. + if (!isOpen()) { + // TODO: should this be a noisier exception-raising error? + return; + } + + long totalBytes = 0L; + long bytesRead = -1L; + long maxToRead = -1; + while (true) { + try { + // read no more than soft max + maxToRead = (softMaxLength <= 0) + ? drainBuffer.length + : Math.min(drainBuffer.length, softMaxLength - totalBytes); + // nor more than hard max + maxToRead = Math.min(maxToRead, recordingOutputStream.getRemainingLength()); + // but always at least 1 (to trigger hard max exception + maxToRead = Math.max(maxToRead, 1); + + bytesRead = read(drainBuffer,0,(int)maxToRead); + if (bytesRead == -1) { + break; + } + totalBytes += bytesRead; + + if (Thread.interrupted()) { + throw new InterruptedException("Interrupted during IO"); + } + } catch (SocketTimeoutException e) { + // A socket timeout is just a transient problem, meaning + // nothing was available in the configured timeout period, + // but something else might become available later. 
+ // Take this opportunity to check the overall + // timeout (below). One reason for this timeout is + // servers that keep up the connection, 'keep-alive', even + // though we asked them to not keep the connection open. + if (logger.isLoggable(Level.FINE)) { + logger.log(Level.FINE, "socket timeout", e); + } + // check for interrupt + if (Thread.interrupted()) { + throw new InterruptedException("Interrupted during IO"); + } + // check for overall timeout + recordingOutputStream.checkLimits(); + } catch (SocketException se) { + throw se; + } catch (NullPointerException e) { + // [ 896757 ] NPEs in Andy's Th-Fri Crawl. + // A crawl was showing NPE's in this part of the code but can + // not reproduce. Adding this rethrowing catch block w/ + // diagnostics to help should we come across the problem in the + // future. + throw new NullPointerException("Stream " + this.in + ", " + + e.getMessage() + " " + Thread.currentThread().getName()); + } + + // if have read 'enough', just finish + if (softMaxLength > 0 && totalBytes >= softMaxLength) { + break; // return + } + } + } + + public long getSize() { + return this.recordingOutputStream.getSize(); + } + + public void markContentBegin() { + this.recordingOutputStream.markMessageBodyBegin(); + } + + public long getContentBegin() { + return this.recordingOutputStream.getMessageBodyBegin(); + } + + public void startDigest() { + this.recordingOutputStream.startDigest(); + } + + /** + * Convenience method for setting SHA1 digest. + */ + public void setSha1Digest() { + this.recordingOutputStream.setSha1Digest(); + } + + /** + * Sets a digest algorithm which may be applied to recorded data. + * As usually only a subset of the recorded data should + * be fed to the digest, you must also call startDigest() + * to begin digesting. + * + * @param algorithm + */ + public void setDigest(String algorithm) { + this.recordingOutputStream.setDigest(algorithm); + } + + /** + * Sets a digest function which may be applied to recorded data. 
+ * As usually only a subset of the recorded data should + * be fed to the digest, you must also call startDigest() + * to begin digesting. + * + * @param md + */ + public void setDigest(MessageDigest md) { + this.recordingOutputStream.setDigest(md); + } + + /** + * Return the digest value for any recorded, digested data. Call + * only after all data has been recorded; otherwise, the running + * digest state is ruined. + * + * @return the digest final value + */ + public byte[] getDigestValue() { + return this.recordingOutputStream.getDigestValue(); + } + + public long getResponseContentLength() { + return this.recordingOutputStream.getResponseContentLength(); + } + + public void closeRecorder() throws IOException { + this.recordingOutputStream.closeRecorder(); + } + + /** + * @return True if we've been opened. + */ + public boolean isOpen() + { + return this.in != null; + } + + @Override + public synchronized void mark(int readlimit) { + this.in.mark(readlimit); + this.recordingOutputStream.mark(); + } + + @Override + public boolean markSupported() { + return this.in.markSupported(); + } + + @Override + public synchronized void reset() throws IOException { + this.in.reset(); + this.recordingOutputStream.reset(); + } + + /** + * Set limits to be enforced by internal recording-out + */ + public void setLimits(long hardMax, long timeoutMs, long maxRateKBps) { + recordingOutputStream.setLimits(hardMax, timeoutMs, maxRateKBps); + } + + /** + * Expose the amount of in-memory buffering used by the internal + * recording stream. 
+ * @return int buffer size + */ + public int getRecordedBufferLength() { + return recordingOutputStream.getBufferLength(); + } + + public void clearForReuse() throws IOException { + recordingOutputStream.clearForReuse(); + } +} diff --git a/src/main/java/org/archive/io/RecordingOutputStream.java b/src/main/java/org/archive/io/RecordingOutputStream.java new file mode 100644 index 00000000..4d0713da --- /dev/null +++ b/src/main/java/org/archive/io/RecordingOutputStream.java @@ -0,0 +1,576 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import it.unimi.dsi.fastutil.io.FastBufferedOutputStream; + +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.logging.Level; +import java.util.logging.Logger; + + +/** + * An output stream that records all writes to wrapped output + * stream. + * + * A RecordingOutputStream can be wrapped around any other + * OutputStream to record all bytes written to it. You can + * then request a ReplayInputStream to read those bytes. + * + *

The RecordingOutputStream uses an in-memory buffer and + * backing disk file to allow it to record streams of + * arbitrary length limited only by available disk space. + * + *

As long as the stream recorded is smaller than the + * in-memory buffer, no disk access will occur. + * + *

Recorded content can be recovered as a ReplayInputStream + * (via getReplayInputStream() or, for only the content after + * the content-begin-mark is set, getContentReplayInputStream() ) + * or as a ReplayCharSequence (via getReplayCharSequence()). + * + *

This class is also used as a straight output stream + * by {@link RecordingInputStream} to which it records all reads. + * {@link RecordingInputStream} is exploiting the file backed buffer + * facility of this class passing null for the stream + * to wrap. TODO: Make a FileBackedOutputStream class that is + * subclassed by RecordingInputStream. + * + * @author gojomo + * + */ +public class RecordingOutputStream extends OutputStream { + protected static Logger logger = + Logger.getLogger(RecordingOutputStream.class.getName()); + + /** + * Size of recording. + * + * Later passed to ReplayInputStream on creation. It uses it to know when + * EOS. + */ + protected long size = 0; + + protected String backingFilename; + protected OutputStream diskStream = null; + + /** + * Buffer we write recordings to. + * + * We write all recordings here first till its full. Thereafter we + * write the backing file. + */ + private byte[] buffer; + + /** current virtual position in the recording */ + private long position; + + /** flag to disable recording */ + private boolean recording; + + /** + * Reusable buffer for FastBufferedOutputStream + */ + protected byte[] bufStreamBuf = + new byte [ FastBufferedOutputStream.DEFAULT_BUFFER_SIZE ]; + + /** + * True if we're to digest content. + */ + private boolean shouldDigest = false; + + /** + * Digest instance. + */ + private MessageDigest digest = null; + + /** + * Define for SHA1 algarithm. + */ + private static final String SHA1 = "SHA1"; + + /** + * Maximum amount of header material to accept without the content + * body beginning -- if more, throw a RecorderTooMuchHeaderException. + * TODO: make configurable? make smaller? 
+ */ + protected static final long MAX_HEADER_MATERIAL = 1024*1024; // 1MB + + // configurable max length, max time limits + /** maximum length of material to record before throwing exception */ + protected long maxLength = Long.MAX_VALUE; + /** maximum time to record before throwing exception */ + protected long timeoutMs = Long.MAX_VALUE; + /** maximum rate to record (adds delays to hit target rate) */ + protected long maxRateBytesPerMs = Long.MAX_VALUE; + /** time recording begins for timeout, rate calculations */ + protected long startTime = Long.MAX_VALUE; + + /** + * When recording HTTP, where the content-body starts. + */ + protected long messageBodyBeginMark; + + /** + * Stream to record. + */ + private OutputStream out = null; + + // mark/reset support + /** furthest position reached before any reset()s */ + private long maxPosition = 0; + /** remembered position to reset() to */ + private long markPosition = 0; + + /** + * Create a new RecordingOutputStream. + * + * @param bufferSize Buffer size to use. + * @param backingFilename Name of backing file to use. + */ + public RecordingOutputStream(int bufferSize, String backingFilename) { + this.buffer = new byte[bufferSize]; + this.backingFilename = backingFilename; + recording = true; + } + + /** + * Wrap the given stream, both recording and passing along any data written + * to this RecordingOutputStream. + * + * @throws IOException If failed creation of backing file. + */ + public void open() throws IOException { + this.open(null); + } + + /** + * Wrap the given stream, both recording and passing along any data written + * to this RecordingOutputStream. + * + * @param wrappedStream Stream to wrap. May be null for case where we + * want to write to a file backed stream only. + * + * @throws IOException If failed creation of backing file. 
+ */ + public void open(OutputStream wrappedStream) throws IOException { + if(isOpen()) { + // error; should not be opening/wrapping in an unclosed + // stream remains open + throw new IOException("ROS already open for " + +Thread.currentThread().getName()); + } + clearForReuse(); + this.out = wrappedStream; + if (this.diskStream == null) { + // TODO: Fix so we only make file when its actually needed. + FileOutputStream fis = new FileOutputStream(this.backingFilename); + + this.diskStream = new RecyclingFastBufferedOutputStream(fis, bufStreamBuf); + } + startTime = System.currentTimeMillis(); + } + + public void write(int b) throws IOException { + if(position< maxPosition) { + if(position+len<=maxPosition) { + // revisiting; do nothing but advance position + position += len; + return; + } + // consume part of the array doing nothing but advancing position + long consumeRange = maxPosition - position; + position += consumeRange; + off += consumeRange; + len -= consumeRange; + } + if(recording) { + record(b, off, len); + } + if (this.out != null) { + this.out.write(b, off, len); + } + checkLimits(); + } + + /** + * Check any enforced limits. + */ + protected void checkLimits() throws RecorderIOException { + // too much material before finding end of headers? + if (messageBodyBeginMark<0) { + // no mark yet + if(position>MAX_HEADER_MATERIAL) { + throw new RecorderTooMuchHeaderException(); + } + } + // overlong? + if(position>maxLength) { + throw new RecorderLengthExceededException(); + } + // taking too long? + long duration = System.currentTimeMillis() - startTime; + duration = Math.max(duration,1); // !divzero + if(duration>timeoutMs) { + throw new RecorderTimeoutException(); + } + // need to throttle reading to hit max configured rate? 
+ if(position/duration > maxRateBytesPerMs) { + long desiredDuration = position / maxRateBytesPerMs; + try { + Thread.sleep(desiredDuration-duration); + } catch (InterruptedException e) { + logger.log(Level.WARNING, + "bandwidth throttling sleep interrupted", e); + } + } + } + + /** + * Record the given byte for later recovery + * + * @param b Int to record. + * + * @exception IOException Failed write to backing file. + */ + private void record(int b) throws IOException { + if (this.shouldDigest) { + this.digest.update((byte)b); + } + if (this.position >= this.buffer.length) { + // TODO: Its possible to call write w/o having first opened a + // stream. Protect ourselves against this. + assert this.diskStream != null: "Diskstream is null"; + this.diskStream.write(b); + } else { + this.buffer[(int) this.position] = (byte) b; + } + this.position++; + } + + /** + * Record the given byte-array range for recovery later + * + * @param b Buffer to record. + * @param off Offset into buffer at which to start recording. + * @param len Length of buffer to record. + * + * @exception IOException Failed write to backing file. + */ + private void record(byte[] b, int off, int len) throws IOException { + if(this.shouldDigest) { + assert this.digest != null: "Digest is null."; + this.digest.update(b, off, len); + } + tailRecord(b, off, len); + } + + /** + * Record without digesting. + * + * @param b Buffer to record. + * @param off Offset into buffer at which to start recording. + * @param len Length of buffer to record. + * + * @exception IOException Failed write to backing file. + */ + private void tailRecord(byte[] b, int off, int len) throws IOException { + if(this.position >= this.buffer.length){ + // TODO: Its possible to call write w/o having first opened a + // stream. Lets protect ourselves against this. 
+ if (this.diskStream == null) { + throw new IOException("diskstream is null"); + } + this.diskStream.write(b, off, len); + this.position += len; + } else { + assert this.buffer != null: "Buffer is null"; + int toCopy = (int)Math.min(this.buffer.length - this.position, len); + assert b != null: "Passed buffer is null"; + System.arraycopy(b, off, this.buffer, (int)this.position, toCopy); + this.position += toCopy; + // TODO verify these are +1 -1 right + if (toCopy < len) { + tailRecord(b, off + toCopy, len - toCopy); + } + } + } + + public void close() throws IOException { + if(messageBodyBeginMark<0) { + // if unset, consider 0 posn as content-start + // (so that a -1 never survives to replay step) + messageBodyBeginMark = 0; + } + if (this.out != null) { + this.out.close(); + this.out = null; + } + closeRecorder(); + } + + protected synchronized void closeDiskStream() + throws IOException { + if (this.diskStream != null) { + this.diskStream.close(); + this.diskStream = null; + } + } + + public void closeRecorder() throws IOException { + recording = false; + closeDiskStream(); // if any + // This setting of size is important. Its passed to ReplayInputStream + // on creation. It uses it to know EOS. + if (this.size == 0) { + this.size = this.position; + } + } + + /* (non-Javadoc) + * @see java.io.OutputStream#flush() + */ + public void flush() throws IOException { + if (this.out != null) { + this.out.flush(); + } + if (this.diskStream != null) { + this.diskStream.flush(); + } + } + + public ReplayInputStream getReplayInputStream() throws IOException { + return getReplayInputStream(0); + } + + public ReplayInputStream getReplayInputStream(long skip) throws IOException { + // If this method is being called, then assumption must be that the + // stream is closed. If it ain't, then the stream gotten won't work + // -- the size will zero so any attempt at a read will get back EOF. 
+ assert this.out == null: "Stream is still open."; + ReplayInputStream replay = new ReplayInputStream(this.buffer, + this.size, this.messageBodyBeginMark, this.backingFilename); + replay.skip(skip); + return replay; + } + + /** + * Return a replay stream, cued up to begining of content + * + * @throws IOException + * @return An RIS. + */ + public ReplayInputStream getMessageBodyReplayInputStream() throws IOException { + return getReplayInputStream(this.messageBodyBeginMark); + } + + public long getSize() { + return this.size; + } + + /** + * Remember the current position as the start of the "message + * body". Useful when recording HTTP traffic as a way to start + * replays after the headers. + */ + public void markMessageBodyBegin() { + this.messageBodyBeginMark = this.position; + startDigest(); + } + + /** + * Return stored message-body-begin-mark (which is also end-of-headers) + */ + public long getMessageBodyBegin() { + return this.messageBodyBeginMark; + } + + /** + * Starts digesting recorded data, if a MessageDigest has been + * set. + */ + public void startDigest() { + if (this.digest != null) { + this.digest.reset(); + this.shouldDigest = true; + } + } + + /** + * Convenience method for setting SHA1 digest. + * @see #setDigest(String) + */ + public void setSha1Digest() { + setDigest(SHA1); + } + + + /** + * Sets a digest function which may be applied to recorded data. + * The difference between calling this method and {@link #setDigest(MessageDigest)} + * is that this method tries to reuse MethodDigest instance if already allocated + * and of appropriate algorithm. + * @param algorithm Message digest algorithm to use. + * @see #setDigest(MessageDigest) + */ + public void setDigest(String algorithm) { + try { + // Reuse extant digest if its sha1 algorithm. 
+ if (this.digest == null || + !this.digest.getAlgorithm().equals(algorithm)) { + setDigest(MessageDigest.getInstance(algorithm)); + } + } catch (NoSuchAlgorithmException e) { + e.printStackTrace(); + } + } + + /** + * Sets a digest function which may be applied to recorded data. + * + * As usually only a subset of the recorded data should + * be fed to the digest, you must also call startDigest() + * to begin digesting. + * + * @param md Message digest function to use. + */ + public void setDigest(MessageDigest md) { + this.digest = md; + } + + /** + * Return the digest value for any recorded, digested data. Call + * only after all data has been recorded; otherwise, the running + * digest state is ruined. + * + * @return the digest final value + */ + public byte[] getDigestValue() { + if(this.digest == null) { + return null; + } + return this.digest.digest(); + } + + public long getResponseContentLength() { + return this.size - this.messageBodyBeginMark; + } + + /** + * @return True if this ROS is open. + */ + public boolean isOpen() { + return this.out != null; + } + + public int getBufferLength() { + return this.buffer.length; + } + + /** + * When used alongside a mark-supporting RecordingInputStream, remember + * a position reachable by a future reset(). + */ + public void mark() { + // remember this position for subsequent reset() + this.markPosition = position; + } + + /** + * When used alongside a mark-supporting RecordingInputStream, reset + * the position to that saved by previous mark(). Until the position + * again reached "new" material, none of the bytes pushed to this + * stream will be digested or recorded. + */ + public void reset() { + // take note of furthest-position-reached to avoid double-recording + maxPosition = Math.max(maxPosition, position); + // reset to previous position + position = markPosition; + } + + /** + * Set limits on length, time, and rate to enforce. 
+ * + * @param length + * @param milliseconds + * @param rateKBps + */ + public void setLimits(long length, long milliseconds, long rateKBps) { + maxLength = (length>0) ? length : Long.MAX_VALUE; + timeoutMs = (milliseconds>0) ? milliseconds : Long.MAX_VALUE; + maxRateBytesPerMs = (rateKBps>0) ? rateKBps*1024/1000 : Long.MAX_VALUE; + } + + /** + * Reset limits to effectively-unlimited defaults + */ + public void resetLimits() { + maxLength = Long.MAX_VALUE; + timeoutMs = Long.MAX_VALUE; + maxRateBytesPerMs = Long.MAX_VALUE; + } + + /** + * Return number of bytes that could be recorded without hitting + * length limit + * + * @return long byte count + */ + public long getRemainingLength() { + return maxLength - position; + } + + public void clearForReuse() throws IOException { + this.out = null; + this.position = 0; + this.markPosition = 0; + this.maxPosition = 0; + this.size = 0; + this.messageBodyBeginMark = -1; + // ensure recording turned on + this.recording = true; + // Always begins false; must use startDigest() to begin + this.shouldDigest = false; + if (this.diskStream != null) { + closeDiskStream(); + } + } +} + diff --git a/src/main/java/org/archive/io/RecoverableIOException.java b/src/main/java/org/archive/io/RecoverableIOException.java new file mode 100644 index 00000000..5ce2251a --- /dev/null +++ b/src/main/java/org/archive/io/RecoverableIOException.java @@ -0,0 +1,83 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
/**
 * A decorator on IOException for IOEs that are likely not fatal or at least
 * merit retry.
 *
 * <p>Message, cause, stack trace and string form are all those of the
 * decorated exception; this type only adds the "recoverable" marker.
 *
 * @author stack
 * @version $Date$, $Revision$
 */
public class RecoverableIOException extends IOException {
    private static final long serialVersionUID = 6194776587381865451L;

    /** The exception whose state this decorator exposes; never null. */
    private final IOException decoratedIOException;

    /**
     * Decorates a new {@link IOException} carrying the given message.
     *
     * @param message detail message; may be null
     */
    public RecoverableIOException(final String message) {
        this(new IOException(message));
    }

    /**
     * Decorates the given exception.
     *
     * @param ioe exception to decorate
     * @throws NullPointerException if {@code ioe} is null (reported here
     *         rather than later, on the first accessor call)
     */
    public RecoverableIOException(final IOException ioe) {
        super();
        if (ioe == null) {
            throw new NullPointerException("ioe");
        }
        this.decoratedIOException = ioe;
    }

    @Override
    public Throwable getCause() {
        return this.decoratedIOException.getCause();
    }

    @Override
    public String getLocalizedMessage() {
        return this.decoratedIOException.getLocalizedMessage();
    }

    @Override
    public String getMessage() {
        return this.decoratedIOException.getMessage();
    }

    @Override
    public StackTraceElement[] getStackTrace() {
        return this.decoratedIOException.getStackTrace();
    }

    /**
     * Sets the cause on the decorated exception.
     *
     * @param cause the cause to record
     * @return this exception, as the {@link Throwable#initCause} contract
     *         requires, so that {@code throw (X) e.initCause(c)} chaining
     *         keeps the recoverable type
     */
    @Override
    public synchronized Throwable initCause(Throwable cause) {
        this.decoratedIOException.initCause(cause);
        return this;
    }

    @Override
    public void printStackTrace() {
        this.decoratedIOException.printStackTrace();
    }

    @Override
    public void printStackTrace(PrintStream s) {
        this.decoratedIOException.printStackTrace(s);
    }

    @Override
    public void printStackTrace(PrintWriter s) {
        this.decoratedIOException.printStackTrace(s);
    }

    @Override
    public void setStackTrace(StackTraceElement[] stackTrace) {
        this.decoratedIOException.setStackTrace(stackTrace);
    }

    @Override
    public String toString() {
        return this.decoratedIOException.toString();
    }
}
+ * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; + +import com.google.common.base.Charsets; + + +/** + * CharSequence interface with addition of a {@link #close()} method. + * + * Users of implementations of this interface must call {@link #close()} so + * implementations get a chance at cleaning up after themselves. + * + * @author stack + * @version $Revision$, $Date$ + */ +public interface ReplayCharSequence extends CharSequence, Closeable { + + /** charset to use in replay when declared value + * is absent/illegal/unavailable */ + public Charset FALLBACK_CHARSET = Charsets.ISO_8859_1; // TODO: should this be UTF-8? + + /** + * Call this method when done so implementation has chance to clean up + * resources. + * + * @throws IOException Problem cleaning up file system resources. + */ + public void close() throws IOException; + + /** + * Report count of decoder errors silently eaten during ReplayCharSequence + * use. May be less than the number of individual decoding anomalies in + * underlying content (if decoding method doesn't allow counting individual + * errors). 
+ */ + public long getDecodeExceptionCount(); + + /** + * Return the first coding-exception encountered, if the count > 0. + * @return CharacterCodingException + */ + public CharacterCodingException getCodingException(); + + /** + * @return false if {@link #close()} has been called + */ + public boolean isOpen(); + + /** + * Return the effective Charset used to create this CharSequence from + * (raw byte) source material. + */ + public Charset getCharset(); +} diff --git a/src/main/java/org/archive/io/ReplayInputStream.java b/src/main/java/org/archive/io/ReplayInputStream.java new file mode 100644 index 00000000..fccf5fd3 --- /dev/null +++ b/src/main/java/org/archive/io/ReplayInputStream.java @@ -0,0 +1,325 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +import org.apache.commons.io.IOUtils; +import org.archive.util.ArchiveUtils; +import org.archive.util.FileUtils; + + +/** + * Replays the bytes recorded from a RecordingInputStream or + * RecordingOutputStream. + * + * This InputStream supports mark and reset. 
+ * + * @author gojomo + */ +public class ReplayInputStream extends SeekInputStream +{ + private static final int DEFAULT_BUFFER_SIZE = 256*1024; // 256KiB + private BufferedSeekInputStream diskStream; + private byte[] buffer; + private long position; + + /** + * Total size of stream content. + * + * Size of data to replay. + */ + private long size = -1; + + /** + * Where the response body starts, if marked + */ + protected long responseBodyStart = -1; + + + /** + * Constructor. + * + * @param buffer Buffer to read from. + * @param size Size of data to replay. + * @param responseBodyStart Start of the response body. + * @param backingFilename Backing file that sits behind the buffer. If + * size > than buffer then we go to backing file to read + * data that is beyond buffer.length. + * + * @throws IOException If we fail to open an input stream on + * backing file. + */ + public ReplayInputStream(byte[] buffer, long size, long responseBodyStart, + String backingFilename) + throws IOException + { + this(buffer, size, backingFilename); + this.responseBodyStart = responseBodyStart; + } + + /** + * Constructor. + * + * @param buffer Buffer to read from. + * @param size Size of data to replay. + * @param backingFilename Backing file that sits behind the buffer. If + * size > than buffer then we go to backing file to read + * data that is beyond buffer.length. + * @throws IOException If we fail to open an input stream on + * backing file. 
+ */ + public ReplayInputStream(byte[] buffer, long size, String backingFilename) + throws IOException + { + this.buffer = buffer; + this.size = size; + if (size > buffer.length) { + setupDiskStream(new File(backingFilename)); + } + } + + protected void setupDiskStream(File backingFile) throws IOException { + RandomAccessInputStream rais = new RandomAccessInputStream(backingFile); + diskStream = new BufferedSeekInputStream(rais, 4096); + } + + protected File backingFile; + + /** + * Create a ReplayInputStream from the given source stream. Requires + * reading the entire stream (and possibly overflowing to a temporary + * file). Primary reason for doing so would be to have a repositionable + * version of the original stream's contents. + * + * If created via this constructor, use the destroy() method to ensure + * prompt deletion of any associated tmp file when done. + * + * @param fillStream + * @throws IOException + */ + public ReplayInputStream(InputStream fillStream) throws IOException { + this.buffer = new byte[DEFAULT_BUFFER_SIZE]; + long count = ArchiveUtils.readFully(fillStream, buffer); + if(fillStream.available()>0) { + this.backingFile = File.createTempFile("tid"+Thread.currentThread().getId(), "ris"); + count += FileUtils.readFullyToFile(fillStream, backingFile); + setupDiskStream(backingFile); + } + this.size = count; + } + + /** + * Close & destroy any internally-generated temporary files. + */ + public void destroy() { + IOUtils.closeQuietly(this); + if(backingFile!=null) { + FileUtils.deleteSoonerOrLater(backingFile); + } + } + + public long setToResponseBodyStart() throws IOException { + position(responseBodyStart); + return this.position; + } + + + /* (non-Javadoc) + * @see java.io.InputStream#read() + */ + public int read() throws IOException { + if (position == size) { + return -1; // EOF + } + if (position < buffer.length) { + // Convert to unsigned int. 
+ int c = buffer[(int) position] & 0xFF; + position++; + return c; + } + int c = diskStream.read(); + if (c >= 0) { + position++; + } + return c; + } + + /* + * (non-Javadoc) + * + * @see java.io.InputStream#read(byte[], int, int) + */ + public int read(byte[] b, int off, int len) throws IOException { + if (position == size) { + return -1; // EOF + } + if (position < buffer.length) { + int toCopy = (int)Math.min(size - position, + Math.min(len, buffer.length - position)); + System.arraycopy(buffer, (int)position, b, off, toCopy); + if (toCopy > 0) { + position += toCopy; + } + return toCopy; + } + // into disk zone + int read = diskStream.read(b,off,len); + if(read>0) { + position += read; + } + return read; + } + + public void readFullyTo(OutputStream os) throws IOException { + byte[] buf = new byte[4096]; + int c = read(buf); + while (c != -1) { + os.write(buf,0,c); + c = read(buf); + } + } + + /* + * Like 'readFullyTo', but only reads the header-part. + * Starts from the beginning each time it is called. + */ + public void readHeaderTo(OutputStream os) throws IOException { + position = 0; + byte[] buf = new byte[(int)responseBodyStart]; + int c = read(buf,0,buf.length); + if(c != -1) { + os.write(buf,0,c); + } + } + + /* + * Like 'readFullyTo', but only reads the content-part. + */ + public void readContentTo(OutputStream os) throws IOException { + setToResponseBodyStart(); + byte[] buf = new byte[4096]; + int c = read(buf); + while (c != -1) { + os.write(buf,0,c); + c = read(buf); + } + } + + /** + * Convenience method to copy content out to target stream. 
+ * @param os stream to write content to + * @param maxSize maximum count of bytes to copy + * @throws IOException + */ + public void readContentTo(OutputStream os, long maxSize) throws IOException { + setToResponseBodyStart(); + byte[] buf = new byte[4096]; + int c = read(buf); + long tot = 0; + while (c != -1 && tot < maxSize) { + os.write(buf,0,c); + c = read(buf); + tot += c; + } + } + + /* (non-Javadoc) + * @see java.io.InputStream#close() + */ + public void close() throws IOException { + super.close(); + if(diskStream != null) { + diskStream.close(); + } + } + + /** + * Total size of stream content. + * @return Returns the size. + */ + public long getSize() + { + return size; + } + + /** + * Total size of header. + * @return the size of the header. + */ + public long getHeaderSize() + { + return responseBodyStart; + } + + /** + * Total size of content. + * @return the size of the content. + */ + public long getContentSize() + { + return size - responseBodyStart; + } + + /** + * @return Amount THEORETICALLY remaining (TODO: Its not theoretical + * seemingly. The class implemetentation depends on it being exact). + */ + public long remaining() { + return size - position; + } + + + /** + * Reposition the stream. 
+ * + * @param p the new position for this stream + * @throws IOException if an IO error occurs + */ + public void position(long p) throws IOException { + if (p < 0) { + throw new IOException("Negative seek offset."); + } + if (p > size) { + throw new IOException("Desired position exceeds size."); + } + if (p < buffer.length) { + // Only seek file if necessary + if (position > buffer.length) { + diskStream.position(0); + } + } else { + diskStream.position(p - buffer.length); + } + this.position = p; + } + + + public long position() throws IOException { + return position; + } + + protected byte[] getBuffer() { + return buffer; + } +} diff --git a/src/main/java/org/archive/io/RepositionableInputStream.java b/src/main/java/org/archive/io/RepositionableInputStream.java new file mode 100644 index 00000000..6f885130 --- /dev/null +++ b/src/main/java/org/archive/io/RepositionableInputStream.java @@ -0,0 +1,133 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import it.unimi.dsi.fastutil.io.RepositionableStream; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; + +/** + * Wrapper around an {@link InputStream} to make a primitive Repositionable + * stream. Uses a {@link BufferedInputStream}. 
Calls mark on every read so + * we'll remember at least the last thing read (You can only back up on the + * last thing read -- not last 2 or 3 things read). Used by + * {@link GzippedInputStream} when reading streams over a network. Wraps an + * HTTP, etc., stream so we can back it up if needs be after the + * GZIP inflater has done a fill of its full buffer though it only needed + * the first few bytes to finish decompressing the current GZIP member. + * + *

TODO: More robust implementation. Tried to use the it.unimi.dsi.io + * FastBufferdInputStream but relies on FileChannel ByteBuffers and if not + * present -- as would be the case reading from a network stream, the main + * application for this instance -- then it expects the underlying stream + * implements RepositionableStream interface so chicken or egg problem. + * @author stack + */ +public class RepositionableInputStream extends BufferedInputStream implements + RepositionableStream { + private long position = 0; + private long markPosition = -1; + + public RepositionableInputStream(InputStream in) { + super(in); + } + + public RepositionableInputStream(InputStream in, int size) { + super(in, size); + } + + public int read(byte[] b) throws IOException { + int read = super.read(b); + if (read != -1) { + position += read; + } + return read; + } + + public synchronized int read(byte[] b, int offset, int ct) + throws IOException { + // Mark the underlying stream so that we'll remember what we are about + // to read unless a mark has been set in this RepositionableStream + // (We have two levels of mark). In this latter case we want the + // underlying stream to preserve its mark position so aligns with + // this RS when eset is called. + if (!isMarked()) { + super.mark((ct > offset)? ct - offset: ct); + } + int read = super.read(b, offset, ct); + if (read != -1) { + position += read; + } + return read; + } + + public int read() throws IOException { + // Mark the underlying stream so that we'll remember what we are about + // to read unless a mark has been set in this RepositionableStream + // (We have two levels of mark). In this latter case we want the + // underlying stream to preserve its mark position so aligns with + // this RS when eset is called. 
+ if (!isMarked()) { + super.mark(1); + } + int c = super.read(); + if (c != -1) { + position++; + } + return c; + } + + public void position(final long offset) { + if (this.position == offset) { + return; + } + int diff = (int)(offset - this.position); + long lowerBound = this.position - this.pos; + long upperBound = lowerBound + this.count; + if (offset < lowerBound || offset >= upperBound) { + throw new IllegalAccessError("Offset goes outside " + + "current this.buf (TODO: Do buffer fills if positive)"); + } + this.position = offset; + this.pos += diff; + // Clear any mark. + this.markPosition = -1; + } + + public void mark(int readlimit) { + this.markPosition = this.position; + super.mark(readlimit); + } + + public void reset() throws IOException { + super.reset(); + this.position = this.markPosition; + this.markPosition = -1; + } + + protected boolean isMarked() { + return this.markPosition != -1; + } + + public long position() { + return this.position; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/SafeSeekInputStream.java b/src/main/java/org/archive/io/SafeSeekInputStream.java new file mode 100644 index 00000000..0d8f83b1 --- /dev/null +++ b/src/main/java/org/archive/io/SafeSeekInputStream.java @@ -0,0 +1,124 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +import java.io.IOException; + + +/** + * Enables multiple concurrent streams based on the same underlying stream. + * + * @author pjack + */ +public class SafeSeekInputStream extends SeekInputStream { + + + /** + * The underlying stream. + */ + private SeekInputStream input; + + + /** + * The expected position of the underlying stream. + */ + private long expected; + + + /** + * Constructor. The given stream will be positioned to 0 so that an + * accurate position can be tracked. + * + * @param input the underlying input stream + * @throws IOException if an IO error occurs + */ + public SafeSeekInputStream(SeekInputStream input) throws IOException { + this.input = input; + this.expected = input.position(); + } + + + /** + * Ensures that the underlying stream's position is what we expect to be. + * + * @throws IOException if an IO error occurs + */ + private void ensure() throws IOException { + if (expected != input.position()) { + input.position(expected); + } + } + + + @Override + public int read() throws IOException { + ensure(); + int c = input.read(); + if (c >= 0) { + expected++; + } + return c; + } + + + @Override + public int read(byte[] buf, int ofs, int len) throws IOException { + ensure(); + int r = input.read(buf, ofs, len); + if (r > 0) { + expected += r; + } + return r; + } + + + @Override + public int read(byte[] buf) throws IOException { + ensure(); + int r = input.read(buf); + if (r > 0) { + expected += r; + } + return r; + } + + + @Override + public long skip(long c) throws IOException { + ensure(); + long r = input.skip(c); + if (r > 0) { + expected += r; + } + return r; + } + + + public void position(long p) throws IOException { + input.position(p); + expected = p; + } + + + public long position() throws IOException { + return expected; + } + +} diff --git a/src/main/java/org/archive/io/SeekInputStream.java 
b/src/main/java/org/archive/io/SeekInputStream.java new file mode 100644 index 00000000..177724ec --- /dev/null +++ b/src/main/java/org/archive/io/SeekInputStream.java @@ -0,0 +1,81 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +import it.unimi.dsi.fastutil.io.RepositionableStream; + +import java.io.IOException; +import java.io.InputStream; + + +/** + * Base class for repositionable input streams. + * + * @author pjack + */ +public abstract class SeekInputStream extends InputStream +implements RepositionableStream { + + + /** + * The marked file position. A value less than zero + * indicates that no mark has been set. + */ + private long mark = -1; + + + /** + * Marks the current position of the stream. The limit parameter is + * ignored; the mark will remain valid until reset is called or the + * stream is closed. + * + * @param limit ignored + */ + public void mark(int limit) { + try { + this.mark = position(); + } catch (IOException e) { + mark = -1; + } + } + + + /** + * Resets this stream to its marked position. 
+ * + * @throws IOException if there is no mark, or if an IO error occurs + */ + public void reset() throws IOException { + if (mark < 0) { + throw new IOException("No mark."); + } + position(mark); + } + + + /** + * Returns true, since SeekInputStreams support mark/reset by default. + * + * @return true + */ + public boolean markSupported() { + return true; + } +} diff --git a/src/main/java/org/archive/io/SeekReader.java b/src/main/java/org/archive/io/SeekReader.java new file mode 100644 index 00000000..4abf7847 --- /dev/null +++ b/src/main/java/org/archive/io/SeekReader.java @@ -0,0 +1,84 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +import java.io.IOException; +import java.io.Reader; + +import it.unimi.dsi.fastutil.io.RepositionableStream; + + +/** + * Base class for repositionable readers. + * + * @author pjack + */ +public abstract class SeekReader extends Reader +implements RepositionableStream { + + + /** + * The marked file position. A value less than zero + * indicates that no mark has been set. + */ + private long mark = -1; + + + /** + * Marks the current position of the stream. The limit parameter is + * ignored; the mark will remain valid until reset is called or the + * stream is closed. 
+ * + * @param limit ignored + */ + @Override + public void mark(int limit) { + try { + this.mark = position(); + } catch (IOException e) { + mark = -1; + } + } + + + /** + * Resets this stream to its marked position. + * + * @throws IOException if there is no mark, or if an IO error occurs + */ + @Override + public void reset() throws IOException { + if (mark < 0) { + throw new IOException("No mark."); + } + position(mark); + } + + + /** + * Returns true, since SeekInputStreams support mark/reset by default. + * + * @return true + */ + @Override + public boolean markSupported() { + return true; + } +} diff --git a/src/main/java/org/archive/io/SeekReaderCharSequence.java b/src/main/java/org/archive/io/SeekReaderCharSequence.java new file mode 100644 index 00000000..a9b4880f --- /dev/null +++ b/src/main/java/org/archive/io/SeekReaderCharSequence.java @@ -0,0 +1,56 @@ +package org.archive.io; + +import java.io.IOException; + +public class SeekReaderCharSequence implements CharSequence { + + + final private SeekReader reader; + final private int size; + + + public SeekReaderCharSequence(SeekReader reader, int size) { + this.reader = reader; + this.size = size; + } + + + public int length() { + return size; + } + + + public char charAt(int index) { + if ((index < 0) || (index >= length())) { + throw new IndexOutOfBoundsException(Integer.toString(index)); + } + try { + reader.position(index); + int r = reader.read(); + if (r < 0) { + throw new IllegalStateException("EOF"); + } + return (char)reader.read(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + + public CharSequence subSequence(int start, int end) { + return new CharSubSequence(this, start, end); + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + try { + reader.position(0); + for (int ch = reader.read(); ch >= 0; ch = reader.read()) { + sb.append((char)ch); + } + return sb.toString(); + } catch (IOException e) { + throw new IllegalStateException(e); + } + } 
+} diff --git a/src/main/java/org/archive/io/SinkHandlerLogThread.java b/src/main/java/org/archive/io/SinkHandlerLogThread.java new file mode 100644 index 00000000..0070785e --- /dev/null +++ b/src/main/java/org/archive/io/SinkHandlerLogThread.java @@ -0,0 +1,34 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + + +/** + * Implemented by threads that provide extra information. + * + * TODO: rename class, rename getCurrentProcessorName() + */ +public interface SinkHandlerLogThread { + + String getName(); + String getCurrentProcessorName(); + int getSerialNumber(); + +} diff --git a/src/main/java/org/archive/io/UTF8Bytes.java b/src/main/java/org/archive/io/UTF8Bytes.java new file mode 100644 index 00000000..c280b08d --- /dev/null +++ b/src/main/java/org/archive/io/UTF8Bytes.java @@ -0,0 +1,37 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
/**
 * Implemented by types whose instances can be rendered as UTF-8 bytes.
 * TODO: Do we need a UTF8Stream Marker Interface?
 * @author stack
 * @version $Date$ $Version$
 */
public interface UTF8Bytes {
    /** Name of the UTF-8 charset. */
    String UTF8 = "UTF-8";

    /**
     * @return Instance as UTF-8 bytes.
     * @throws UnsupportedEncodingException
     */
    byte[] getUTF8Bytes() throws UnsupportedEncodingException;
}
+ */ + +package org.archive.io; + +import java.io.File; +import java.io.IOException; +import java.util.Collection; +import java.util.LinkedList; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; + +/** + * Pool of Writers. + * + * Abstract. Override and pass in the Constructor a factory that creates + * {@link WriterPoolMember} implementations. + * + * @author stack + */ +public abstract class WriterPool { + private final Logger logger = Logger.getLogger(this.getClass().getName()); + + /** + * Used to generate unique filename sequences. + */ + final protected AtomicInteger serialNo; + + /** + * Default maximum active number of files in the pool. + */ + public static final int DEFAULT_MAX_ACTIVE = 1; + + /** Assumed largest possible value of maxActive; pool will have this + * maximum capacity, so dynamic changes beyond this number won't work. */ + protected static final int LARGEST_MAX_ACTIVE = 255; + + /** + * Maximum time to wait on a free file before considering + * making a new one (if not already at max) + */ + public static final int DEFAULT_MAX_WAIT_FOR_IDLE = 500; + + /** + * File settings. + * Keep in data structure rather than as individual values. 
+ */ + protected final WriterPoolSettings settings; + + /** maximum number of writers to create at a time*/ + protected int maxActive; + /** maximum ms to wait before considering creation of a writer */ + protected int maxWait; + /** current count of active writers; only read/mutated in synchronized blocks */ + protected int currentActive = 0; + /** round-robin queue of available writers */ + protected BlockingQueue availableWriters; + + /** system time when writer was last wanted (because one was not ready in time) */ + protected long lastWriterNeededTime; + /** system time when writer was last 'rolled over' (imminent creation of new file) */ + protected long lastWriterRolloverTime; + + /** + * Constructor + * @param serial Used to generate unique filename sequences + * @param factory Factory that knows how to make a {@link WriterPoolMember}. + * @param settings Settings for this pool. + * @param poolMaximumActive + * @param poolMaximumWait + */ + public WriterPool(final AtomicInteger serial, + final WriterPoolSettings settings, + final int poolMaximumActive, final int poolMaximumWait) { + logger.info("Initial configuration:" + + " prefix=" + settings.getPrefix() + + ", template=" + settings.getTemplate() + + ", compress=" + settings.getCompress() + + ", maxSize=" + settings.getMaxFileSizeBytes() + + ", maxActive=" + poolMaximumActive + + ", maxWait=" + poolMaximumWait); + this.settings = settings; + this.maxActive = poolMaximumActive; + this.maxWait = poolMaximumWait; + availableWriters = new ArrayBlockingQueue(LARGEST_MAX_ACTIVE, true); + this.serialNo = serial; + } + + /** + * Check out a {@link WriterPoolMember}. + * + * This method should be followed by a call to + * {@link #returnFile(WriterPoolMember)} or + * {@link #invalidateFile(WriterPoolMember)} else pool starts leaking. 
+ * + * @return Writer checked out of a pool of files or created + * @throws IOException Problem getting Writer from pool (Converted + * from Exception to IOException so this pool can live as a good citizen + * down in depths of ARCSocketFactory). + */ + public WriterPoolMember borrowFile() + throws IOException { + WriterPoolMember writer = null; + while(writer == null) { + try { + writer = availableWriters.poll(maxWait,TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + // nothing to do but proceed + } + if(writer==null) { + writer = makeNewWriterIfAppropriate(); + } + } + return writer; + } + + /** + * Create a new writer instance, if still below maxActive count. + * Remember times to help make later decision when writer should + * be discarded. + * + * @return WriterPoolMember or null if already at max + */ + protected synchronized WriterPoolMember makeNewWriterIfAppropriate() { + long now = System.currentTimeMillis(); + lastWriterNeededTime = now; + if(currentActive < maxActive) { + currentActive++; + lastWriterRolloverTime = now; + return makeWriter(); + } + return null; + } + + /** + * @return new WriterPoolMember of appropriate type + */ + protected abstract WriterPoolMember makeWriter(); + + /** + * Discard a previously-used writer, cleanly closing it and leaving it out + * of the pool. + * @param writer + * @throws IOException + */ + public synchronized void destroyWriter(WriterPoolMember writer) throws IOException { + currentActive--; + writer.close(); + } + /** + * Return a writer, for likely reuse unless (1) writer's current file has + * reached its target size; and (2) there's been no demand for additional + * writers since the last time a new writer-file was rolled-over. In that + * case, the possibly-superfluous writer instance is discarded. + * @param writer Writer to return to the pool. + * @throws IOException Problem returning File to pool. 
+ */ + public void returnFile(WriterPoolMember writer) + throws IOException { + synchronized(this) { + if(writer.isOversize()) { + // maybe retire writer rather than recycle + if(lastWriterNeededTime<=lastWriterRolloverTime) { + // no timeouts waiting for recycled writer since last writer rollover + destroyWriter(writer); + return; + } else { + // reuse writer instance, causing new file to be created + lastWriterRolloverTime = System.currentTimeMillis(); + } + } + } + if(!availableWriters.offer(writer)) { + logger.log(Level.WARNING, "writer unreturnable to available pool; closing early"); + destroyWriter(writer); + } + } + + /** + * Close and discard a writer that experienced a potentially-corrupting + * error. + * @param f writer with problem + * @throws IOException + */ + public synchronized void invalidateFile(WriterPoolMember f) + throws IOException { + try { + destroyWriter(f); + } catch (Exception e) { + // Convert exception. + throw new IOException(e.getMessage()); + } + // It'll have been closed. Rename with an '.invalid' suffix so it + // gets attention. + File file = f.getFile(); + file.renameTo(new File(file.getAbsoluteFile() + + WriterPoolMember.INVALID_SUFFIX)); + } + + /** + * @return Number of {@link WriterPoolMember}s checked out of pool. + * @throws java.lang.UnsupportedOperationException + */ + public synchronized int getNumActive() + throws UnsupportedOperationException { + return currentActive - getNumIdle(); + } + + /** + * @return Number of {@link WriterPoolMember} instances still in the pool. + * @throws java.lang.UnsupportedOperationException + */ + public int getNumIdle() + throws UnsupportedOperationException { + return availableWriters.size(); + } + + /** + * Close all {@link WriterPoolMember}s in pool. 
+ */ + public void close() { + Collection writers = drainAllWriters(); + for (WriterPoolMember writer: writers) { + try { + destroyWriter(writer); + } catch (IOException e) { + logger.log(Level.WARNING,"problem closing writer",e); + } + } + } + + /** + * @return Returns settings. + */ + public WriterPoolSettings getSettings() { + return this.settings; + } + + /** + * @return State of the pool string + */ + protected String getPoolState() { + StringBuffer buffer = new StringBuffer("Active "); + buffer.append(getNumActive()); + buffer.append(" of max "); + buffer.append(maxActive); + buffer.append(", idle "); + buffer.append(getNumIdle()); + return buffer.toString(); + } + + /** + * Returns the atomic integer used to generate serial numbers + * for files. + * + * @return the serial number generator + */ + public AtomicInteger getSerialNo() { + return serialNo; + } + + /** + * Drains all the active writers from {@link #availableWriters}, blocking to + * wait for any writers currently in use to become available. + * + *

+ * When finished with writers, call availableWriters.addAll(...) to put them + * back into the rotation. + * + * @return all the active writers + */ + protected synchronized Collection drainAllWriters() { + LinkedList writers = new LinkedList(); + availableWriters.drainTo(writers); + + while (writers.size() < currentActive) { + try { + WriterPoolMember w = availableWriters.take(); + writers.add(w); + } catch (InterruptedException e) { + logger.severe("caught " + e + " while waiting for writers to free up; returning only " + + writers.size() + " of " + currentActive + " active writers"); + break; + } + } + + return writers; + } + + public void flush() { + Collection writers = drainAllWriters(); + + for (WriterPoolMember writer: writers) { + try { + writer.flush(); + } catch (IOException e) { + logger.log(Level.WARNING, "problem flushing writer " + writer, e); + } + } + + availableWriters.addAll(writers); + } + + public JSONArray jsonStatus() throws JSONException { + Collection writers = drainAllWriters(); + + JSONArray ja = new JSONArray(); + for (WriterPoolMember w: writers) { + JSONObject jo = new JSONObject(); + jo.put("file", w.getFile()); + jo.put("position", w.getPosition()); + ja.put(jo); + } + + availableWriters.addAll(writers); + + return ja; + } +} diff --git a/src/main/java/org/archive/io/WriterPoolMember.java b/src/main/java/org/archive/io/WriterPoolMember.java new file mode 100644 index 00000000..6ea6b295 --- /dev/null +++ b/src/main/java/org/archive/io/WriterPoolMember.java @@ -0,0 +1,487 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.Iterator; +import java.util.List; +import java.util.Properties; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.logging.Logger; +import java.util.zip.GZIPOutputStream; + +import org.archive.util.ArchiveUtils; +import org.archive.util.FileUtils; +import org.archive.util.PropertyUtils; + + + +/** + * Member of {@link WriterPool}. + * Implements rotating off files, file naming with some guarantee of + * uniqueness, and position in file. Subclass to pick up functionality for a + * particular Writer type. + * @author stack + * @version $Date$ $Revision$ + */ +public abstract class WriterPoolMember implements ArchiveFileConstants { + private final Logger logger = Logger.getLogger(this.getClass().getName()); + + public static final String UTF8 = "UTF-8"; + + /** + * Default archival-aggregate filename template. + * + * Under usual assumptions -- hostnames aren't shared among crawling hosts; + * processes have unique PIDs and admin ports; timestamps inside one process + * don't repeat (see UniqueTimestampService); clocks are generally + * accurate -- will generate a unique name. + * + * Stands for Internet Archive Heritrix. 
+ */ + public static final String DEFAULT_TEMPLATE = + "${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}"; + + /** + * Default for file prefix. + */ + public static final String DEFAULT_PREFIX = "WEB"; + + /** + * Reference to file we're currently writing. + */ + protected File f = null; + + /** Output stream for file. */ + protected OutputStream out = null; + /** Counting stream for metering */ + protected MiserOutputStream countOut = null; + + /** reusable buffer for recycling scenarios */ + protected byte[] rebuf; + + protected WriterPoolSettings settings; + private final String extension; + + /** + * Creation date for the current file. + * Set by {@link #createFile()}. + */ + protected String currentTimestamp = "UNSET!!!"; + + protected String currentBasename; + + /** + * A running sequence used making unique file names. + */ + final private AtomicInteger serialNo; + + /** + * Directories round-robin index. + */ + protected static int roundRobinIndex = 0; + + /** + * NumberFormat instance for formatting serial number. + * + * Pads serial number with zeros. + */ + protected static NumberFormat serialNoFormatter = new DecimalFormat("00000"); + + + /** + * Buffer to reuse writing streams. + */ + protected final byte [] scratchbuffer = new byte[4 * 1024]; + + + /** + * Constructor. + * Takes a stream. Use with caution. There is no upperbound check on size. + * Will just keep writing. + * + * @param serialNo used to create unique filename sequences + * @param out Where to write. + * @param file File the out is connected to. + * @param cmprs Compress the content written. + * @param a14DigitDate If null, we'll write current time. + * @throws IOException + */ + protected WriterPoolMember(AtomicInteger serialNo, + final OutputStream out, final File file, + final WriterPoolSettings settings) + throws IOException { + this(serialNo, settings, null); + this.countOut = (out instanceof MiserOutputStream) + ? 
(MiserOutputStream)out + : new MiserOutputStream(out, settings.getFrequentFlushes()); + this.out = this.countOut; + this.f = file; + } + + /** + * Constructor. + * + * @param serialNo used to create unique filename sequences + * @param dirs Where to drop files. + * @param prefix File prefix to use. + * @param cmprs Compress the records written. + * @param maxSize Maximum size for ARC files written. + * @param template filenaming template to use + * @param extension Extension to give file. + */ + public WriterPoolMember(AtomicInteger serialNo, + final WriterPoolSettings settings, final String extension) { + this.settings = settings; + this.extension = extension; + this.serialNo = serialNo; + } + + /** + * Call this method just before/after any significant write. + * + * Call at the end of the writing of a record or just before we start + * writing a new record. Will close current file and open a new file + * if file size has passed out maxSize. + * + *

Creates and opens a file if none already open. One use of this method + * then is after construction, call this method to add the metadata, then + * call {@link #getPosition()} to find offset of first record. + * + * TODO: perhaps this should be called checkForNewOpen? because it also + * handles initial open, even when not rolling oversize + * + * @exception IOException + */ + public void checkSize() throws IOException { + if (this.out == null || isOversize()) { + createFile(); + } + } + + /** Check if underlying file has already reached its target size. + * @return boolean true if file has reached target size and due to be closed + */ + public boolean isOversize() { + return settings.getMaxFileSizeBytes() != -1 && (this.getPosition() > settings.getMaxFileSizeBytes()); + } + + /** + * Create a new file. + * Rotates off the current Writer and creates a new in its place + * to take subsequent writes. Usually called from {@link #checkSize()}. + * @return Name of file created. + * @throws IOException + */ + protected String createFile() throws IOException { + generateNewBasename(); + String name = currentBasename + '.' + this.extension + + ((settings.getCompress())? DOT_COMPRESSED_FILE_EXTENSION: "") + + OCCUPIED_SUFFIX; + File dir = getNextDirectory(settings.calcOutputDirs()); + return createFile(new File(dir, name)); + } + + protected String createFile(final File file) throws IOException { + close(); + this.f = file; + FileOutputStream fos = new FileOutputStream(this.f); + if(rebuf==null) { + rebuf = new byte[settings.getWriteBufferSize()]; + } + this.countOut = new MiserOutputStream(new RecyclingFastBufferedOutputStream(fos,rebuf),settings.getFrequentFlushes()); + this.out = this.countOut; + logger.fine("Opened " + this.f.getAbsolutePath()); + return this.f.getName(); + } + + /** + * @param dirs List of File objects that point at directories. + * @return Find next directory to write an arc too. If more + * than one, it tries to round-robin through each in turn. 
+ * @throws IOException + */ + protected File getNextDirectory(List dirs) + throws IOException { + if (WriterPoolMember.roundRobinIndex >= dirs.size()) { + WriterPoolMember.roundRobinIndex = 0; + } + File d = null; + try { + d = checkWriteable((File)dirs. + get(WriterPoolMember.roundRobinIndex)); + } catch (IndexOutOfBoundsException e) { + // Dirs list might be altered underneath us. + // If so, we get this exception -- just keep on going. + } + if (d == null && dirs.size() > 1) { + for (Iterator i = dirs.iterator(); d == null && i.hasNext();) { + d = checkWriteable((File)i.next()); + } + } else { + WriterPoolMember.roundRobinIndex++; + } + if (d == null) { + throw new IOException("Directories unusable."); + } + return d; + } + + protected File checkWriteable(File d) { + if (d == null) { + return d; + } + + try { + FileUtils.ensureWriteableDirectory(d); + } catch(IOException e) { + logger.warning("Directory " + d.getPath() + " is not" + + " writeable or cannot be created: " + e.getMessage()); + d = null; + } + return d; + } + + /** + * Generate a new basename by interpolating values in the configured + * template. Values come from local state, other configured values, and + * global system properties. The recommended default template will + * generate a unique basename under reasonable assumptions. 
+ */ + protected void generateNewBasename() { + Properties localProps = new Properties(); + localProps.setProperty("prefix", settings.getPrefix()); + synchronized(this.getClass()) { + // ensure that serialNo and timestamp are minted together (never inverted sort order) + String paddedSerialNumber = WriterPoolMember.serialNoFormatter.format(serialNo.getAndIncrement()); + String timestamp17 = ArchiveUtils.getUnique17DigitDate(); + String timestamp14 = ArchiveUtils.getUnique14DigitDate(); + currentTimestamp = timestamp17; + localProps.setProperty("serialno", paddedSerialNumber); + localProps.setProperty("timestamp17", timestamp17); + localProps.setProperty("timestamp14", timestamp14); + } + currentBasename = PropertyUtils.interpolateWithProperties(settings.getTemplate(), + localProps, System.getProperties()); + } + + + /** + * Get the file name + * + * @return the filename, as if uncompressed + */ + protected String getBaseFilename() { + String name = this.f.getName(); + if (settings.getCompress() && name.endsWith(DOT_COMPRESSED_FILE_EXTENSION)) { + return name.substring(0,name.length() - 3); + } else if(settings.getCompress() && + name.endsWith(DOT_COMPRESSED_FILE_EXTENSION + + OCCUPIED_SUFFIX)) { + return name.substring(0, name.length() - + (3 + OCCUPIED_SUFFIX.length())); + } else { + return name; + } + } + + /** + * Get this file. + * + * Used by junit test to test for creation and when {@link WriterPool} wants + * to invalidate a file. + * + * @return The current file. + */ + public File getFile() { + return this.f; + } + + /** + * Post write tasks. + * + * Has side effects. Will open new file if we're at the upper bound. + * If we're writing compressed files, it will wrap output stream with a + * GZIP writer with side effect that GZIP header is written out on the + * stream. 
+ * + * @exception IOException + */ + protected void preWriteRecordTasks() + throws IOException { + if (this.out == null) { + createFile(); + } + if (settings.getCompress()) { + // Wrap stream in GZIP Writer. + // The below construction immediately writes the GZIP 'default' + // header out on the underlying stream. + this.out = new CompressedStream(this.out); + } + } + + /** + * Post file write tasks. + * If compressed, finishes up compression and flushes stream so any + * subsequent checks get good reading. + * + * @exception IOException + */ + protected void postWriteRecordTasks() + throws IOException { + if (settings.getCompress()) { + CompressedStream o = (CompressedStream)this.out; + o.finish(); + o.flush(); + o.end(); + this.out = o.getWrappedStream(); + } + } + + /** + * Position in raw output (typically, physical file). + * Used making accounting of bytes written. + * @return Position in final media (assuming all flushing completes) + * @throws IOException + */ + public long getPosition() { + return (countOut==null)? 0L : this.countOut.getCount(); + } + + public boolean isCompressed() { + return settings.getCompress(); + } + + protected void write(final byte [] b) throws IOException { + this.out.write(b); + } + + protected void flush() throws IOException { + this.out.flush(); + } + + protected void write(byte[] b, int off, int len) throws IOException { + this.out.write(b, off, len); + } + + protected void write(int b) throws IOException { + this.out.write(b); + } + + /** + * Copy bytes from the provided InputStream to the target file/stream being + * written. 
+ * + * @return number of bytes written (normally equal to {@code enforceLength}) + * @param is + * InputStream to copy bytes from + * @param recordLength + * expected number of bytes to copy + * @param enforceLength + * whether to throw an exception if too many/too few bytes are + * available from stream + * @throws IOException + */ + protected long copyFrom(final InputStream is, final long recordLength, + boolean enforceLength) throws IOException { + int read = scratchbuffer.length; + long tot = 0; + while ((tot < recordLength) + && (read = is.read(scratchbuffer)) != -1) { + int write = read; + // never write more than enforced length + write = (int) Math.min(write, recordLength - tot); + tot += read; + write(scratchbuffer, 0, write); + } + if (enforceLength && tot != recordLength) { + // throw exception if desired for read vs. declared mismatches + throw new IOException("Read " + tot + " but expected " + + recordLength); + } + + return tot; + } + + public void close() throws IOException { + if (this.out == null) { + return; + } + this.out.close(); + this.out = null; + if (this.f != null && this.f.exists()) { + String path = this.f.getAbsolutePath(); + if (path.endsWith(OCCUPIED_SUFFIX)) { + File f = new File(path.substring(0, + path.length() - OCCUPIED_SUFFIX.length())); + if (f.exists() & !f.delete()) { + logger.warning("Failed delete of " + f); + } + if (!this.f.renameTo(f)) { + logger.warning("Failed rename of " + path); + } + this.f = f; + } + + logger.fine("Closed " + this.f.getAbsolutePath() + + ", size " + this.f.length()); + } + } + + protected OutputStream getOutputStream() { + return this.out; + } + + /** + * An override so we get access to underlying output stream. + * and offer an end() that does not accompany closing underlying + * stream. 
+ * @author stack + */ + private class CompressedStream extends GZIPOutputStream { + public CompressedStream(OutputStream out) + throws IOException { + super(out); + } + + /** + * @return Reference to stream being compressed. + */ + OutputStream getWrappedStream() { + return this.out; + } + + /** + * Release the deflater's native process resources, + * which otherwise would not occur until either + * finalization or DeflaterOutputStream.close() + * (which would also close underlying stream). + */ + public void end() { + def.end(); + } + } +} diff --git a/src/main/java/org/archive/io/WriterPoolSettings.java b/src/main/java/org/archive/io/WriterPoolSettings.java new file mode 100644 index 00000000..d0805cdc --- /dev/null +++ b/src/main/java/org/archive/io/WriterPoolSettings.java @@ -0,0 +1,39 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.File; +import java.util.List; + +/** + * Settings object for a {@link WriterPool}. + * Used creating {@link WriterPoolMember}s. 
+ * @author stack + * @version $Date$, $Revision$ + */ +public interface WriterPoolSettings { + public long getMaxFileSizeBytes(); + public String getPrefix(); + public String getTemplate(); + public List calcOutputDirs(); + public boolean getCompress(); + public List getMetadata(); + public boolean getFrequentFlushes(); + public int getWriteBufferSize(); +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/arc/ARC2WCDX.java b/src/main/java/org/archive/io/arc/ARC2WCDX.java new file mode 100644 index 00000000..19010131 --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARC2WCDX.java @@ -0,0 +1,243 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io.arc; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.Date; +import java.util.Iterator; +import java.util.zip.GZIPOutputStream; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HeaderGroup; +import org.apache.commons.httpclient.util.DateParseException; +import org.apache.commons.httpclient.util.DateUtil; +import org.archive.io.ArchiveRecord; +import org.archive.util.ArchiveUtils; +import org.archive.util.SURT; + +/** + * Create a 'Wide' CDX from an ARC. 
Takes one argument, the path to the ARC. + * Writes .wcdx.gz in same directory. + * + * @author gojomo + */ +public class ARC2WCDX { + final public static String WCDX_VERSION="0.1"; + + public static void main(String[] args) throws IOException { + String arcFilename = args[0]; + createWcdx(arcFilename); + } + + public static Object[] createWcdx(String arcFilename) throws IOException { + ARCReader reader = ARCReaderFactory.get(arcFilename); + Object[] retVal = createWcdx(reader); + reader.close(); + return retVal; + } + + public static Object[] createWcdx(ARCReader reader) { + reader.setDigest(true); + + String wcdxPath = reader.getReaderIdentifier().replaceAll("\\.arc(\\.gz)?$",".wcdx.gz"); + File wcdxFile = new File(wcdxPath+".open"); + PrintStream writer = null; + long count = 0; + try { + writer = new PrintStream(new GZIPOutputStream(new FileOutputStream(wcdxFile))); + + // write header: legend + timestamp + StringBuilder legend = new StringBuilder(); + appendField(legend,"CDX"); + appendField(legend,"surt-uri"); + appendField(legend,"b"); // ARC timestamp + appendField(legend,"http-date"); + appendField(legend,"s"); // status code + appendField(legend,"m"); // media type + appendField(legend,"sha1"); // content sha1 + appendField(legend,"g"); // ARC name + appendField(legend,"V"); // start offset + appendField(legend,"end-offset"); // TODO: implement + appendField(legend,"n"); // ARC record length TODO: verify + appendField(legend,"http-content-length"); + appendField(legend,"http-last-modified"); + appendField(legend,"http-expires"); + appendField(legend,"http-etag"); + appendField(legend,"http-location"); + appendField(legend,"e"); // IP + appendField(legend,"a"); // original URL + // WCDX version+creation time: crude version control + appendField(legend,WCDX_VERSION+"@"+ArchiveUtils.get14DigitDate()); + writer.println(legend.toString()); + + Iterator iter = reader.iterator(); + count = 0; + while(iter.hasNext()) { + ARCRecord record = (ARCRecord) 
iter.next(); + record.close(); + ARCRecordMetaData h = (ARCRecordMetaData) record.getHeader(); + Header[] httpHeaders = record.getHttpHeaders(); + if(httpHeaders==null) { + httpHeaders = new Header[0]; + } + HeaderGroup hg = new HeaderGroup(); + hg.setHeaders(httpHeaders); + StringBuilder builder = new StringBuilder(); + + // SURT-form URI + appendField(builder,SURT.fromURI(h.getUrl())); + // record timestamp ('b') + appendField(builder,h.getDate()); + // http header date + appendTimeField(builder,hg.getFirstHeader("Date")); + // response code ('s') + appendField(builder,h.getStatusCode()); + // media type ('m') + appendField(builder,h.getMimetype()); + // content checksum (like 'c', but here Base32 SHA1) + appendField(builder,record.getDigestStr()); + // arc name ('g') + appendField(builder,reader.getFileName()); + // compressed start offset ('V') + appendField(builder,h.getOffset()); + + // compressed end offset (?) +// appendField(builder, +// reader.getInputStream() instanceof RepositionableStream +// ? ((GzippedInputStream)reader.getInputStream()).vPosition() +// : "-"); + // TODO; leave unavail for now + appendField(builder, "-"); + + // uncompressed (declared in ARC headerline) record length + appendField(builder,h.getLength()); + // http header content-length + appendField(builder,hg.getFirstHeader("Content-Length")); + + // http header mod-date + appendTimeField(builder,hg.getFirstHeader("Last-Modified")); + // http header expires + appendTimeField(builder,hg.getFirstHeader("Expires")); + + // http header etag + appendField(builder,hg.getFirstHeader("ETag")); + // http header redirect ('Location' header?) + appendField(builder,hg.getFirstHeader("Location")); + // ip ('e') + appendField(builder,h.getIp()); + // original URI + appendField(builder,h.getUrl()); + // TODO MAYBE - a title from inside content? 
+ + writer.println(builder.toString()); + count++; + } + wcdxFile.renameTo(new File(wcdxPath)); + } catch (IOException e) { + // soldier on: but leave '.open' wcdx file as indicator of error + if(!wcdxFile.exists()) { + try { + wcdxFile.createNewFile(); + } catch (IOException e1) { + // TODO Auto-generated catch block + throw new RuntimeException(e1); + } + } + } catch (RuntimeException e) { + // soldier on: but leave '.open' wcdx file as indicator of error + if(!wcdxFile.exists()) { + try { + wcdxFile.createNewFile(); + } catch (IOException e1) { + // TODO Auto-generated catch block + throw new RuntimeException(e1); + } + } + } finally { + if(writer!=null) { + writer.close(); + } + } + + return new Object[] {wcdxPath, count}; + } + + protected static void appendField(StringBuilder builder, Object obj) { + if(builder.length()>0) { + // prepend with delimiter + builder.append(' '); + } + if(obj instanceof Header) { + obj = ((Header)obj).getValue().trim(); + } + + builder.append((obj==null||obj.toString().length()==0)?"-":obj); + } + + protected static void appendTimeField(StringBuilder builder, Object obj) { + if(builder.length()>0) { + // prepend with delimiter + builder.append(' '); + } + if(obj==null) { + builder.append("-"); + return; + } + if(obj instanceof Header) { + String s = ((Header)obj).getValue().trim(); + try { + Date date = DateUtil.parseDate(s); + String d = ArchiveUtils.get14DigitDate(date); + if(d.startsWith("209")) { + d = "199"+d.substring(3); + } + obj = d; + } catch (DateParseException e) { + builder.append('e'); + return; + } + + } + builder.append(obj); + } +} + +//'wide' CDX +//a original url +//b timestamp +//s resp code +//m type +//? content md5 (full 'k'? 'c'? +//g arc name +//V compressed start offset +//? compressed length +//n? uncompressed length +//? mod date +//? expires +//? server 'date' hdr +//? etag +//r redirect ('Location'?) +//e ip +//MAYBE: +//? TITLE from HTML or other format? 
+ + diff --git a/src/main/java/org/archive/io/arc/ARCConstants.java b/src/main/java/org/archive/io/arc/ARCConstants.java new file mode 100644 index 00000000..c44cfef7 --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCConstants.java @@ -0,0 +1,29 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io.arc; + + +/** + * Constants used by ARC files and in ARC file processing. + * + * @author stack + * @deprecated + */ +public interface ARCConstants extends org.archive.format.arc.ARCConstants { +} diff --git a/src/main/java/org/archive/io/arc/ARCLocation.java b/src/main/java/org/archive/io/arc/ARCLocation.java new file mode 100644 index 00000000..c6c64437 --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCLocation.java @@ -0,0 +1,37 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io.arc; + +/** + * Datastructure to hold ARC record location. + * Used by wayback machine. + * @author stack + */ +public interface ARCLocation { + /** + * @return Returns the ARC filename. Can be full path to ARC, URL to an + * ARC or just the portion of an ARC name that is unique to a collection. + */ + public String getName(); + + /** + * @return Returns the offset into the ARC. + */ + public long getOffset(); +} diff --git a/src/main/java/org/archive/io/arc/ARCReader.java b/src/main/java/org/archive/io/arc/ARCReader.java new file mode 100644 index 00000000..7f85cc2a --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCReader.java @@ -0,0 +1,553 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.io.arc; + +import java.io.ByteArrayOutputStream; +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.logging.Logger; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.cli.PosixParser; +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveRecord; +import org.archive.io.ArchiveRecordHeader; +import org.archive.io.RecoverableIOException; +import org.archive.io.WriterPoolMember; +import org.archive.util.ArchiveUtils; + + +/** + * Get an iterator on an ARC file or get a record by absolute position. + * + * ARC files are described here: + * Arc + * File Format. + * + *

This class knows how to parse an ARC file. Pass it a file path + * or an URL to an ARC. It can parse ARC Version 1 and 2. + * + *

Iterator returns ARCRecord + * though {@link Iterator#next()} is returning + * java.lang.Object. Cast the return. + * + *

Profiling java.io vs. memory-mapped ByteBufferInputStream shows the + * latter slightly slower -- but not by much. TODO: Test more. Just + * change {@link #getInputStream(File, long)}. + * + * @author stack + * @version $Date$ $Revision$ + */ +public abstract class ARCReader extends ArchiveReader +implements ARCConstants, Closeable { + private final Logger logger = Logger.getLogger(ARCReader.class.getName()); + + /** + * Set to true if we are aligned on first record of Archive file. + * We used depend on offset. If offset was zero, then we were + * aligned on first record. This is no longer necessarily the case when + * Reader is created at an offset into an Archive file: The offset is zero + * but its relative to where we started reading. + */ + private boolean alignedOnFirstRecord = true; + + private boolean parseHttpHeaders = true; + + protected ARCReader() { + super(); + } + + /** + * Skip over any trailing new lines at end of the record so we're lined up + * ready to read the next. + * @param record + * @throws IOException + */ + protected void gotoEOR(ArchiveRecord record) throws IOException { + if (getIn().available() <= 0) { + return; + } + + // Remove any trailing LINE_SEPARATOR + int c = -1; + while (getIn().available() > 0) { + if (getIn().markSupported()) { + getIn().mark(1); + } + c = getIn().read(); + if (c != -1) { + if (c == LINE_SEPARATOR) { + continue; + } + if (getIn().markSupported()) { + // We've overread. We're probably in next record. There is + // no way of telling for sure. It may be dross at end of + // current record. Backup. + getIn().reset(); + break; + } + ArchiveRecordHeader h = (getCurrentRecord() != null)? + record.getHeader(): null; + throw new IOException("Read " + (char)c + + " when only " + LINE_SEPARATOR + " expected. " + + getReaderIdentifier() + ((h != null)? + h.getHeaderFields().toString(): "")); + } + } + } + + /** + * Create new arc record. + * + * Encapsulate housekeeping that has to do w/ creating a new record. + * + *

Call this method at end of constructor to read in the + * arcfile header. Will be problems reading subsequent arc records + * if you don't since arcfile header has the list of metadata fields for + * all records that follow. + * + *

When parsing through ARCs writing out CDX info, we spend about + * 38% of CPU in here -- about 30% of which is in getTokenizedHeaderLine + * -- of which 16% is reading. + * + * @param is InputStream to use. + * @param offset Absolute offset into arc file. + * @return An arc record. + * @throws IOException + */ + protected ARCRecord createArchiveRecord(InputStream is, long offset) + throws IOException { + try { + String version = super.getVersion(); + ARCRecord record = new ARCRecord(is, getReaderIdentifier(), offset, + isDigest(), isStrict(), isParseHttpHeaders(), + isAlignedOnFirstRecord(), version); + if (version != null && super.getVersion() == null) + super.setVersion(version); + currentRecord(record); + } catch (IOException e) { + if (e instanceof RecoverableIOException) { + // Don't mess with RecoverableIOExceptions. Let them out. + throw e; + } + IOException newE = new IOException(e.getMessage() + " (Offset " + + offset + ")."); + newE.setStackTrace(e.getStackTrace()); + throw newE; + } + return (ARCRecord)getCurrentRecord(); + } + + /** + * Returns version of this ARC file. Usually read from first record of ARC. + * If we're reading without having first read the first record -- e.g. + * random access into middle of an ARC -- then version will not have been + * set. For now, we return a default, version 1.1. Later, if more than + * just one version of ARC, we could look at such as the meta line to see + * what version of ARC this is. + * @return Version of this ARC file. + */ + public String getVersion() { + return (super.getVersion() == null)? "1.1": super.getVersion(); + } + + protected boolean isAlignedOnFirstRecord() { + return alignedOnFirstRecord; + } + + protected void setAlignedOnFirstRecord(boolean alignedOnFirstRecord) { + this.alignedOnFirstRecord = alignedOnFirstRecord; + } + + /** + * @return Returns the parseHttpHeaders. 
+ */ + public boolean isParseHttpHeaders() { + return this.parseHttpHeaders; + } + + /** + * @param parse The parseHttpHeaders to set. + */ + public void setParseHttpHeaders(boolean parse) { + this.parseHttpHeaders = parse; + } + + public String getFileExtension() { + return ARC_FILE_EXTENSION; + } + + public String getDotFileExtension() { + return DOT_ARC_FILE_EXTENSION; + } + + protected boolean output(final String format) + throws IOException, java.text.ParseException { + boolean result = super.output(format); + if(!result && (format.equals(NOHEAD) || format.equals(HEADER))) { + throw new IOException(format + + " format only supported for single Records"); + } + return result; + } + + public boolean outputRecord(final String format) throws IOException { + boolean result = super.outputRecord(format); + if (result) { + return result; + } + if (format.equals(NOHEAD)) { + // No point digesting if dumping content. + setDigest(false); + ARCRecord r = (ARCRecord) get(); + r.skipHttpHeader(); + r.dump(); + result = true; + } else if (format.equals(HEADER)) { + // No point digesting if dumping content. + setDigest(false); + ARCRecord r = (ARCRecord) get(); + r.dumpHttpHeader(); + result = true; + } + + return result; + } + + public void dump(final boolean compress) + throws IOException, java.text.ParseException { + // No point digesting if we're doing a dump. + setDigest(false); + boolean firstRecord = true; + ARCWriter writer = null; + for (Iterator ii = iterator(); ii.hasNext();) { + ARCRecord r = (ARCRecord)ii.next(); + // We're to dump the arc on stdout. + // Get the first record's data if any. + ARCRecordMetaData meta = r.getMetaData(); + if (firstRecord) { + firstRecord = false; + // Get an ARCWriter. + ByteArrayOutputStream baos = + new ByteArrayOutputStream(r.available()); + // This is slow but done only once at top of ARC. 
+ while (r.available() > 0) { + baos.write(r.read()); + } + List listOfMetadata = new ArrayList(); + listOfMetadata.add(baos.toString(WriterPoolMember.UTF8)); + // Assume getArc returns full path to file. ARCWriter + // or new File will complain if it is otherwise. + List outDirs = new ArrayList(); + WriterPoolSettingsData settings = + new WriterPoolSettingsData("","",-1L,compress,outDirs,listOfMetadata); + writer = new ARCWriter(new AtomicInteger(), System.out, + new File(meta.getArc()), settings); + continue; + } + + writer.write(meta.getUrl(), meta.getMimetype(), meta.getIp(), + ArchiveUtils.parse14DigitDate(meta.getDate()).getTime(), + (int)meta.getLength(), r); + } + // System.out.println(System.currentTimeMillis() - start); + } + + /** + * @return an ArchiveReader that will delete a local file on close. Used + * when we bring Archive files local and need to clean up afterward. + */ + public ARCReader getDeleteFileOnCloseReader(final File f) { + final ARCReader d = this; + return new ARCReader() { + private final ARCReader delegate = d; + private File archiveFile = f; + + public void close() throws IOException { + this.delegate.close(); + if (this.archiveFile != null) { + if (archiveFile.exists()) { + archiveFile.delete(); + } + this.archiveFile = null; + } + } + + public ArchiveRecord get(long o) throws IOException { + return this.delegate.get(o); + } + + public boolean isDigest() { + return this.delegate.isDigest(); + } + + public boolean isStrict() { + return this.delegate.isStrict(); + } + + public Iterator iterator() { + return this.delegate.iterator(); + } + + public void setDigest(boolean d) { + this.delegate.setDigest(d); + } + + public void setStrict(boolean s) { + this.delegate.setStrict(s); + } + + public List validate() throws IOException { + return this.delegate.validate(); + } + + @Override + public ArchiveRecord get() throws IOException { + return this.delegate.get(); + } + + @Override + public String getVersion() { + return 
this.delegate.getVersion(); + } + + @Override + public List validate(int noRecords) throws IOException { + return this.delegate.validate(noRecords); + } + + @Override + protected ARCRecord createArchiveRecord(InputStream is, + long offset) + throws IOException { + return this.delegate.createArchiveRecord(is, offset); + } + + @Override + protected void gotoEOR(ArchiveRecord record) throws IOException { + this.delegate.gotoEOR(record); + } + + @Override + public void dump(boolean compress) + throws IOException, java.text.ParseException { + this.delegate.dump(compress); + } + + @Override + public String getDotFileExtension() { + return this.delegate.getDotFileExtension(); + } + + @Override + public String getFileExtension() { + return this.delegate.getFileExtension(); + } + }; + } + + // Static methods follow. + + /** + * + * @param formatter Help formatter instance. + * @param options Usage options. + * @param exitCode Exit code. + */ + private static void usage(HelpFormatter formatter, Options options, + int exitCode) { + formatter.printHelp("java org.archive.io.arc.ARCReader" + + " [--digest=true|false] \\\n" + + " [--format=cdx|cdxfile|dump|gzipdump|header|nohead]" + + " [--offset=#] \\\n[--strict] [--parse] ARC_FILE|ARC_URL", + options); + System.exit(exitCode); + } + + /** + * Write out the arcfile. + * + * @param reader + * @param format Format to use outputting. + * @throws IOException + * @throws java.text.ParseException + */ + protected static void output(ARCReader reader, String format) + throws IOException, java.text.ParseException { + if (!reader.output(format)) { + throw new IOException("Unsupported format: " + format); + } + } + + /** + * Generate a CDX index file for an ARC file. 
+ * + * @param urlOrPath The ARC file to generate a CDX index for + * @throws IOException + * @throws java.text.ParseException + */ + public static void createCDXIndexFile(String urlOrPath) + throws IOException, java.text.ParseException { + ARCReader r = ARCReaderFactory.get(urlOrPath); + r.setStrict(false); + r.setParseHttpHeaders(true); + r.setDigest(true); + output(r, CDX_FILE); + } + + /** + * Command-line interface to ARCReader. + * + * Here is the command-line interface: + *

+     * usage: java org.archive.io.arc.ARCReader [--offset=#] ARCFILE
+     *  -h,--help      Prints this message and exits.
+     *  -o,--offset    Outputs record at this offset into arc file.


+     *
+     * See in $HERITRIX_HOME/bin/arcreader for a script that'll
+     * take care of classpaths and the calling of ARCReader.
+     *
+     * 
Outputs using a pseudo-CDX format as described here:
+     * CDX
+     * Legend and here
+     * Example.
+     * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
+     * Hash is hard-coded straight SHA-1 hash of content.
+     *
+     * @param args Command-line arguments.
+     * @throws ParseException Failed parse of the command line.
+     * @throws IOException
+     * @throws java.text.ParseException
+     */
+    @SuppressWarnings("unchecked")
+    public static void main(String [] args)
+    throws ParseException, IOException, java.text.ParseException {
+        Options options = getOptions();
+        options.addOption(new Option("p","parse", false, "Parse headers."));
+        PosixParser parser = new PosixParser();
+        CommandLine cmdline = parser.parse(options, args, false);
+        List cmdlineArgs = cmdline.getArgList();
+        Option [] cmdlineOptions = cmdline.getOptions();
+        HelpFormatter formatter = new HelpFormatter();
+
+        // If no args, print help.
+        if (cmdlineArgs.size() <= 0) {
+            usage(formatter, options, 0);
+        }
+
+        // Now look at options passed.
+        long offset = -1;
+        boolean digest = false;
+        boolean strict = false;
+        boolean parse = false;
+        String format = CDX;
+        for (int i = 0; i < cmdlineOptions.length; i++) {
+            switch(cmdlineOptions[i].getId()) {
+                case 'h':
+                    usage(formatter, options, 0);
+                    break;
+
+                case 'o':
+                    offset =
+                        Long.parseLong(cmdlineOptions[i].getValue());
+                    break;
+                    
+                case 's':
+                    strict = true;
+                    break;
+                    
+                case 'p':
+                        parse = true;
+                    break;
+                    
+                case 'd':
+                        digest = getTrueOrFalse(cmdlineOptions[i].getValue());
+                    break;
+                    
+                case 'f':
+                    format = cmdlineOptions[i].getValue().toLowerCase();
+                    boolean match = false;
+                    // List of supported formats.
+                    final String [] supportedFormats =
+                                {CDX, DUMP, GZIP_DUMP, HEADER, NOHEAD, CDX_FILE};
+                    for (int ii = 0; ii < supportedFormats.length; ii++) {
+                        if (supportedFormats[ii].equals(format)) {
+                            match = true;
+                            break;
+                        }
+                    }
+                    if (!match) {
+                        usage(formatter, options, 1);
+                    }
+                    break;
+
+                default:
+                    throw new RuntimeException("Unexpected option: " +
+                        + cmdlineOptions[i].getId());
+            }
+        }
+        
+        if (offset >= 0) {
+            if (cmdlineArgs.size() != 1) {
+                System.out.println("Error: Pass one arcfile only.");
+                usage(formatter, options, 1);
+            }
+            ARCReader arc = ARCReaderFactory.get((String)cmdlineArgs.get(0),
+                offset);
+            arc.setStrict(strict);
+            // We must parse headers if we need to skip them.
+            if (format.equals(NOHEAD) || format.equals(HEADER)) {
+                parse = true;
+            }
+            arc.setParseHttpHeaders(parse);
+            outputRecord(arc, format);
+        } else {
+            for (String urlOrPath : cmdlineArgs) {
+                try {
+                        ARCReader r = ARCReaderFactory.get(urlOrPath);
+                        r.setStrict(strict);
+                        r.setParseHttpHeaders(parse);
+                        r.setDigest(digest);
+                    output(r, format);
+                } catch (RuntimeException e) {
+                    // Write out name of file we failed on to help with
+                    // debugging.  Then print stack trace and try to keep
+                    // going.  We do this for case where we're being fed
+                    // a bunch of ARCs; just note the bad one and move
+                    // on to the next.
+                    System.err.println("Exception processing " + urlOrPath +
+                        ": " + e.getMessage());
+                    e.printStackTrace(System.err);
+                    System.exit(1);
+                }
+            }
+        }
+    }
+}
diff --git a/src/main/java/org/archive/io/arc/ARCReaderFactory.java b/src/main/java/org/archive/io/arc/ARCReaderFactory.java
new file mode 100644
index 00000000..e7dc1625
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCReaderFactory.java
@@ -0,0 +1,454 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.archive.io.arc;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Iterator;
+import java.util.logging.Level;
+
+import org.archive.io.ArchiveReader;
+import org.archive.io.ArchiveReaderFactory;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.ArchiveRecordHeader;
+import org.archive.util.FileUtils;
+import org.archive.util.zip.GZIPMembersInputStream;
+import org.archive.util.zip.GzipHeader;
+import org.archive.util.zip.NoGzipMagicException;
+
+import com.google.common.io.CountingInputStream;
+
+
+/**
+ * Factory that returns an ARCReader.
+ * 
+ * Can handle compressed and uncompressed ARCs.
+ *
+ * @author stack
+ */
+public class ARCReaderFactory extends ArchiveReaderFactory
+implements ARCConstants {
+    /**
+     * This factory instance.
+     */
+    private static final ARCReaderFactory factory = new ARCReaderFactory();
+
+    /**
+     * Shutdown any access to default constructor.
+     */
+    protected ARCReaderFactory() {
+        super();
+    }
+    
+    /**
+     * Get an ARCReader for the ARC named by the passed string.
+     *
+     * @param arcFileOrUrl Path or URL of an ARC; handed unchanged to the
+     * shared factory's {@code getArchiveReader}.
+     * @return The reader the factory produced, cast to ARCReader.
+     * @throws MalformedURLException
+     * @throws IOException
+     */
+    public static ARCReader get(String arcFileOrUrl)
+    throws MalformedURLException, IOException {
+        return (ARCReader)factory.getArchiveReader(arcFileOrUrl);
+    }
+
+    /**
+     * Get an ARCReader for the ARC named by the passed string, at an offset.
+     *
+     * @param arcFileOrUrl Path or URL of an ARC.
+     * @param offset Offset handed on to the shared factory.
+     * @return The reader the factory produced, cast to ARCReader.
+     * @throws MalformedURLException
+     * @throws IOException
+     */
+    public static ARCReader get(String arcFileOrUrl, final long offset)
+    throws MalformedURLException, IOException {
+        return (ARCReader)factory.getArchiveReader(arcFileOrUrl, offset);
+    }
+
+    /**
+     * Get an ARCReader for the passed file.
+     *
+     * @param f An arcfile to read.
+     * @return The reader the factory produced, cast to ARCReader.
+     * @throws IOException
+     */
+    public static ARCReader get(final File f) throws IOException {
+        return (ARCReader)factory.getArchiveReader(f);
+    }
+
+    /**
+     * Get an ARCReader for the passed file, at an offset.
+     *
+     * @param f An arcfile to read.
+     * @param offset Offset at which the returned reader starts reading.
+     * @return The reader the factory produced, cast to ARCReader.
+     * @throws IOException
+     */
+    public static ARCReader get(final File f, final long offset)
+    throws IOException {
+        return (ARCReader)factory.getArchiveReader(f, offset);
+    }
+    
+    protected ArchiveReader getArchiveReader(final File f, final long offset)
+    throws IOException {
+    	return getArchiveReader(f, true, offset);
+	}
+    
+    /**
+     * Get an ARCReader for a file, optionally bypassing the suffix test.
+     *
+     * @param f An arcfile to read.
+     * @param skipSuffixTest Passed through to the compressed-ARC test:
+     * set to true to skip the check on the file name suffix, for
+     * example to open ARCs with the .open or some other suffix; set to
+     * false to have the suffix tested.
+     * @param offset Have returned ARCReader set to start reading at passed
+     * offset.
+     * @return An ARCReader.
+     * @throws IOException
+     */
+    public static ARCReader get(final File f,
+            final boolean skipSuffixTest, final long offset)
+    throws IOException {
+        return (ARCReader)factory.getArchiveReader(f, skipSuffixTest, offset);
+    }
+    
+    protected ArchiveReader getArchiveReader(final File arcFile,
+            final boolean skipSuffixTest, final long offset)
+    throws IOException {
+        boolean compressed = testCompressedARCFile(arcFile, skipSuffixTest);
+        if (!compressed) {
+            if (!FileUtils.isReadableWithExtensionAndMagic(arcFile,
+                    ARC_FILE_EXTENSION, ARC_MAGIC_NUMBER)) {
+                throw new IOException(arcFile.getAbsolutePath() +
+                    " is not an Internet Archive ARC file.");
+            }
+        }
+        return compressed?
+            (ARCReader)ARCReaderFactory.factory.
+                new CompressedARCReader(arcFile, offset):
+            (ARCReader)ARCReaderFactory.factory.
+                new UncompressedARCReader(arcFile, offset);
+	}
+    
+    /**
+     * Get a reader over an already-open stream.
+     *
+     * @param s Name handed to the reader for the ARC being read.
+     * @param is Stream to read the ARC from.
+     * @param atFirstRecord Whether the stream is positioned at the first
+     * record of the ARC.
+     * @return A reader from the shared factory.
+     * @throws IOException
+     */
+    public static ArchiveReader get(final String s, final InputStream is,
+            final boolean atFirstRecord)
+    throws IOException {
+        return factory.getArchiveReader(s, is, atFirstRecord);
+    }
+    
+    protected ArchiveReader getArchiveReader(final String arc,
+			final InputStream is, final boolean atFirstRecord)
+			throws IOException {
+
+        // We do this mark() reset() stuff, wrapping in a BufferedInputStream if
+        // necessary to make it work, because testCompressedARCStream() consumes
+        // some bytes from the input stream
+        InputStream possiblyWrapped;
+        if (is.markSupported()) {
+            possiblyWrapped = is;
+        } else {
+            possiblyWrapped = new BufferedInputStream(is);
+        }
+
+        possiblyWrapped.mark(100);
+        boolean compressed = testCompressedARCStream(possiblyWrapped);
+        possiblyWrapped.reset();
+
+        if (compressed) {
+            return new CompressedARCReader(arc, possiblyWrapped, atFirstRecord);
+        } else {
+            return new UncompressedARCReader(arc, possiblyWrapped);
+        }
+	}
+    
+    /**
+     * Get an ARCReader aligned at offset.
+     *
+     * Per the original notes on this method, this version of get does not
+     * bring the ARC local but tries to stream it across the net with an
+     * HTTP 1.1 Range request to the remote http server (RFC 2616 Section
+     * 14.35).  The fetching itself is done by the factory's
+     * {@code getArchiveReader(URL, long)}, which is not defined in this
+     * class.
+     *
+     * @param arcUrl HTTP URL for an ARC (All ARCs considered remote).
+     * @param offset Offset into ARC at which to start fetching.
+     * @return An ARCReader aligned at offset.
+     * @throws IOException
+     */
+    public static ARCReader get(final URL arcUrl, final long offset)
+    throws IOException {
+        return (ARCReader)factory.getArchiveReader(arcUrl, offset);
+    }
+    
+    /**
+     * Get an ARCReader for the ARC at the passed URL.
+     *
+     * Per the original notes on this method, the ARC is pulled local into
+     * wherever the System Property <code>java.io.tmpdir</code> points, the
+     * returned ARCReader points at this local copy, and a close on the
+     * reader removes the copy.  That work is done by the factory's
+     * {@code getArchiveReader(URL)}, which is not defined in this class.
+     *
+     * @param arcUrl An URL that points at an ARC.
+     * @return An ARCReader.
+     * @throws IOException
+     */
+    public static ARCReader get(final URL arcUrl)
+    throws IOException {
+        return (ARCReader)factory.getArchiveReader(arcUrl);
+    }
+    
+    /**
+     * Instance-level form of {@link #testCompressedARCFile(File)}.
+     *
+     * @param arcFile File to test.
+     * @return True if <code>arcFile</code> is compressed ARC.
+     * @throws IOException
+     */
+    public boolean isCompressed(File arcFile) throws IOException {
+        final boolean compressed = testCompressedARCFile(arcFile);
+        return compressed;
+    }
+    
+    /**
+     * Check file is compressed and in ARC GZIP format, suffix included.
+     *
+     * @param arcFile File to test if its Internet Archive ARC file
+     * GZIP compressed.
+     *
+     * @return True if this is an Internet Archive GZIP'd ARC file (It begins
+     * w/ the Internet Archive GZIP header and has the
+     * COMPRESSED_ARC_FILE_EXTENSION suffix).
+     *
+     * @exception IOException If file does not exist or is not readable.
+     */
+    public static boolean testCompressedARCFile(File arcFile)
+    throws IOException {
+        // The suffix check is never skipped by this overload.
+        final boolean skipSuffixCheck = false;
+        return testCompressedARCFile(arcFile, skipSuffixCheck);
+    }
+
+    /**
+     * Check file is compressed and in ARC GZIP format.
+     *
+     * The file is first asserted readable.  Unless the suffix check is
+     * skipped, a name that does not end (case-insensitively) with
+     * COMPRESSED_ARC_FILE_EXTENSION is rejected without opening the file.
+     * Otherwise the head of the file is examined by
+     * {@link #testCompressedARCStream(InputStream)}.
+     *
+     * @param arcFile File to test if its Internet Archive ARC file
+     * GZIP compressed.
+     * @param skipSuffixCheck Set to true if we're not to test on the
+     * '.arc.gz' suffix.
+     *
+     * @return True if this is an Internet Archive GZIP'd ARC file (It begins
+     * w/ the Internet Archive GZIP header).
+     *
+     * @exception IOException If file does not exist or is not readable.
+     */
+    public static boolean testCompressedARCFile(File arcFile,
+            boolean skipSuffixCheck)
+    throws IOException {
+        FileUtils.assertReadable(arcFile);
+        if (!skipSuffixCheck) {
+            final String lowerName = arcFile.getName().toLowerCase();
+            if (!lowerName.endsWith(COMPRESSED_ARC_FILE_EXTENSION)) {
+                return false;
+            }
+        }
+
+        final InputStream head = new FileInputStream(arcFile);
+        try {
+            return testCompressedARCStream(head);
+        } finally {
+            // Always release the file handle, also when the test throws.
+            head.close();
+        }
+    }
+    
+    /**
+     * Test whether a name carries one of the ARC file suffixes.
+     *
+     * @param arcName Name to test; may be null.
+     * @return True if the lower-cased name ends with
+     * DOT_COMPRESSED_ARC_FILE_EXTENSION or DOT_ARC_FILE_EXTENSION; false
+     * otherwise, including for a null name.
+     */
+    public static boolean isARCSuffix(final String arcName) {
+        if (arcName == null) {
+            return false;
+        }
+        final String lowered = arcName.toLowerCase();
+        return lowered.endsWith(DOT_COMPRESSED_ARC_FILE_EXTENSION)
+            || lowered.endsWith(DOT_ARC_FILE_EXTENSION);
+    }
+    
+    /**
+     * Tests passed stream is gzip stream by reading in the HEAD.
+     * Does not reposition the stream.  That is left up to the caller.
+     *
+     * A stream without the gzip magic is not compressed.  A gzip stream
+     * with no extra field at all counts as compressed (some old arcs lack
+     * the extra header).  A gzip stream that has an extra field counts as
+     * compressed only if that field is long enough and its first two bytes
+     * match bytes 2 and 3 of ARC_GZIP_EXTRA_FIELD.
+     *
+     * @param is An InputStream.
+     * @return True if compressed stream.
+     * @throws IOException
+     */
+    public static boolean testCompressedARCStream(final InputStream is)
+            throws IOException {
+        final GzipHeader header;
+        try {
+            header = new GzipHeader(is);
+        } catch (NoGzipMagicException e) {
+            return false;
+        }
+
+        final byte[] extra = header.getFextra();
+        if (extra == null) {
+            // Some old arcs don't have an extra header at all, but
+            // they're still compressed.
+            return true;
+        }
+
+        // ARC_GZIP_EXTRA_FIELD includes its own two length bytes, so the
+        // minimum length is two less and the comparison starts at +2.
+        // Some Alexa ARC files' gzip extra fields differ slightly after
+        // the first two bytes, so only those two bytes (the 'LX'
+        // extension, per the original note) are compared.
+        return extra.length >= ARC_GZIP_EXTRA_FIELD.length - 2
+            && extra[0] == ARC_GZIP_EXTRA_FIELD[2]
+            && extra[1] == ARC_GZIP_EXTRA_FIELD[3];
+    }
+
+    /**
+     * Uncompressed arc file reader.
+     * @author stack
+     */
+    public class UncompressedARCReader extends ARCReader {
+        /**
+         * Constructor.
+         * @param f Uncompressed arcfile to read.
+         * @throws IOException
+         */
+        public UncompressedARCReader(final File f)
+        throws IOException {
+            this(f, 0);
+        }
+
+        /**
+         * Constructor.
+         * 
+         * @param f Uncompressed arcfile to read.
+         * @param offset Offset at which to position ARCReader.
+         * @throws IOException
+         */
+        public UncompressedARCReader(final File f, final long offset)
+        throws IOException {
+            // Arc file has been tested for existence by time it has come
+            // to here.
+            setIn(new CountingInputStream(getInputStream(f, offset)));
+            getIn().skip(offset); 
+            initialize(f.getAbsolutePath());
+        }
+        
+        /**
+         * Constructor.
+         * 
+         * @param f Uncompressed arc to read.
+         * @param is InputStream.
+         */
+        public UncompressedARCReader(final String f, final InputStream is) {
+            // Arc file has been tested for existence by time it has come
+            // to here.
+            setIn(new CountingInputStream(is));
+            initialize(f);
+        }
+    }
+    
+    /**
+     * Compressed arc file reader.
+     *
+     * Reads the ARC through a GZIPMembersInputStream.
+     *
+     * @author stack
+     */
+    public class CompressedARCReader extends ARCReader {
+
+        /**
+         * Constructor.  Reads from the start of the file.
+         *
+         * @param f
+         *            Compressed arcfile to read.
+         * @throws IOException
+         */
+        public CompressedARCReader(final File f) throws IOException {
+            this(f, 0);
+        }
+
+        /**
+         * Constructor.
+         *
+         * @param f Compressed arcfile to read.
+         * @param offset Position at where to start reading file; applied
+         * with a compressed seek.
+         * @throws IOException
+         */
+        public CompressedARCReader(final File f, final long offset)
+                throws IOException {
+            // Arc file has been tested for existence by time it has come
+            // to here.
+            setIn(new GZIPMembersInputStream(getInputStream(f, offset)));
+            ((GZIPMembersInputStream)getIn()).compressedSeek(offset);
+            setCompressed((offset == 0)); // TODO: does this make sense???
+            initialize(f.getAbsolutePath());
+        }
+
+        /**
+         * Constructor.
+         *
+         * @param f Name of the compressed arcfile; handed to
+         * <code>initialize</code>.
+         * @param is InputStream to use.
+         * @param atFirstRecord Whether the stream is aligned on the first
+         * record of the ARC.
+         * @throws IOException
+         */
+        public CompressedARCReader(final String f, final InputStream is,
+            final boolean atFirstRecord)
+        throws IOException {
+            // Arc file has been tested for existence by time it has come
+            // to here.
+            setIn(new GZIPMembersInputStream(is));
+            setCompressed(true);
+            setAlignedOnFirstRecord(atFirstRecord);
+            initialize(f);
+        }
+
+        /**
+         * Get record at passed offset.
+         *
+         * Cleans up the current record, seeks the compressed stream to the
+         * offset and creates the record found there.
+         *
+         * @param offset
+         *            Byte index into arcfile at which a record starts.
+         * @return An ARCRecord reference.
+         * @throws IOException
+         */
+        public ARCRecord get(long offset) throws IOException {
+            cleanupCurrentRecord();
+            ((GZIPMembersInputStream)getIn()).compressedSeek(offset);
+            return createArchiveRecord(getIn(), offset);
+        }
+
+        /**
+         * @return Iterator over the records of this ARC, one record per
+         * gzip member of the underlying stream.
+         */
+        public Iterator iterator() {
+            // Override ARCRecordIterator so can base returned iterator on
+            // GzippedInputStream iterator.
+            return new ArchiveRecordIterator() {
+                private GZIPMembersInputStream gis =
+                    (GZIPMembersInputStream)getIn();
+
+                // Typed so that next() can be assigned to an InputStream
+                // in innerNext() without a cast.
+                private Iterator<? extends InputStream> gzipIterator =
+                    this.gis.memberIterator();
+
+                protected boolean innerHasNext() {
+                    return this.gzipIterator.hasNext();
+                }
+
+                protected ArchiveRecord innerNext() throws IOException {
+                    InputStream is = this.gzipIterator.next();
+                    return createArchiveRecord(is, Math.max(gis.getCurrentMemberStart(), gis.getCurrentMemberEnd()));
+                }
+            };
+        }
+
+        /**
+         * Read to the end of the current record.
+         *
+         * Trailing LINE_SEPARATORs are consumed silently.  If any other
+         * byte turns up, the rest of the stream is drained, the number of
+         * unexpected bytes is counted, and the count is reported: with an
+         * IOException when the reader is strict, as a warning otherwise.
+         *
+         * @param rec Record whose header is added to the report; may be
+         * null.
+         * @throws IOException On read failure, or on trailing bytes when
+         * strict.
+         */
+        protected void gotoEOR(ArchiveRecord rec) throws IOException {
+            int c;
+            while ((c = getIn().read())==LINE_SEPARATOR);
+            if(c==-1) {
+                return;
+            }
+            // The byte just read is itself unexpected, so start at one.
+            long skipped = 1;
+            while (getIn().read()>-1) {
+                skipped++;
+            }
+            // Report on system error the number of unexpected characters
+            // at the end of this record.  Guard on the record actually
+            // being dereferenced so a null argument cannot raise a
+            // NullPointerException while the report is being built.
+            ArchiveRecordHeader meta = (rec != null)?
+                rec.getHeader(): null;
+            String message = "Record STARTING at " +
+                ((GZIPMembersInputStream)getIn()).getCurrentMemberStart() +
+                " has " + skipped + " trailing byte(s): " +
+                ((meta != null)? meta.toString(): "");
+            if (isStrict()) {
+                throw new IOException(message);
+            }
+            logStdErr(Level.WARNING, message);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java
new file mode 100644
index 00000000..21bea07c
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCRecord.java
@@ -0,0 +1,835 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.archive.io.arc;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.StatusLine;
+import org.apache.commons.httpclient.util.EncodingUtil;
+import org.apache.commons.lang.StringUtils;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.ArchiveRecordHeader;
+import org.archive.io.RecoverableIOException;
+import org.archive.util.InetAddressUtil;
+import org.archive.util.LaxHttpParser;
+import org.archive.util.TextUtils;
+
+/**
+ * An ARC file record.
+ * Does not encompass the ARCRecord metadata line, just the record content.
+ * @author stack
+ */
+public class ARCRecord extends ArchiveRecord implements ARCConstants {
+    /**
+     * Http status line object.
+     * 
+     * May be null if record is not http.
+     */
+    private StatusLine httpStatus = null;
+
+    /**
+     * Http header bytes.
+     * 
+     * If non-null and bytes available, give out its contents before we
+     * go back to the underlying stream.
+     */
+    private InputStream httpHeaderStream = null;
+    
+    /**
+     * Http headers.
+     * 
+     * Only populated after reading of headers.
+     */
+    private Header [] httpHeaders = null;
+
+    /**
+     * Array of field names.
+     * 
+     * Used to initialize headerFieldNameKeys.
+     */
+    private final String [] headerFieldNameKeysArray = {
+        URL_FIELD_KEY,
+        IP_HEADER_FIELD_KEY,
+        DATE_FIELD_KEY,
+        MIMETYPE_FIELD_KEY,
+        LENGTH_FIELD_KEY
+    };
+    
+    /**
+     * An array of the header field names found in the ARC file header on
+     * the 3rd line.
+     * 
+     * We used to read these in from the arc file first record 3rd line but
+     * now we hardcode them for sake of improved performance.
+     */
+    private final List headerFieldNameKeys =
+        Arrays.asList(this.headerFieldNameKeysArray);
+
+    /**
+     * Http header bytes read while trying to read http header
+     */
+    public long httpHeaderBytesRead = -1;
+    
+    /**
+     * record length from metadata line
+     */
+    public long recordDeclaredLength;
+    
+    /**
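+     * compressed size of the record in bytes; as a primitive long it cannot
+     * be null, so it stays 0 unless assigned (TODO confirm the value used
+     * when the source was not compressed)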
+     */
+    public long compressedBytes; 
+    
+    /**
+     * actual payload data (not including trailing newline), 
+     * should match record-declared-length 
+     */
+    public long uncompressedBytes;
+
+    /**
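+     * content-length header value, iff HTTP and present; as a primitive long
+     * it cannot be null, so it stays 0 unless assigned (TODO confirm the
+     * value used when the header is absent)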
+     */
+    public long httpPayloadDeclaredLength;
+
+    /**
+     * actual http payload length, should match http-payload-declared-length 
+     */
+    public long httpPayloadActualLength;
+    
+    /**
+     * errors encountered reading record
+     */
+    public List errors = new ArrayList();
+
+    /**
+     * verbatim ARC record header string
+     */
+    private String headerString;
+    /**
+     * @return Current value of the headerString field (may be null).
+     */
+    public String getHeaderString() {
+        return headerString;
+    }
+    
+    /**
+     * Constructor.
+     *
+     * @param in Stream cue'd up to be at the start of the record this instance
+     * is to represent.
+     * @param metaData Meta data.
+     * @throws IOException
+     */
+    public ARCRecord(InputStream in, ArchiveRecordHeader metaData)
+                throws IOException {
+        // Delegate with bodyOffset 0, digest on, strict off and HTTP header
+        // parsing on.
+        this(in, metaData, 0, true, false, true);
+    }
+
+    /**
+     * Constructor.
+     *
+     * @param in Stream cue'd up to be at the start of the record this instance
+     * is to represent.
+     * @param metaData Meta data.
+     * @param bodyOffset Offset into the body.  Usually 0.
+     * @param digest True if we're to calculate digest for this record.  Not
+     * digesting saves about ~15% of cpu during an ARC parse.
+     * @param strict Be strict parsing (Parsing stops if ARC inproperly
+     * formatted).
+     * @param parseHttpHeaders True if we are to parse HTTP headers.  Costs
+     * about ~20% of CPU during an ARC parse.
+     * @throws IOException
+     */
+    public ARCRecord(InputStream in, ArchiveRecordHeader metaData,
+            int bodyOffset, boolean digest, boolean strict,
+            final boolean parseHttpHeaders)
+    throws IOException {
+        super(in, metaData, bodyOffset, digest, strict);
+        // When not parsing, the header stream stays null and reads go
+        // straight to the underlying stream.
+        this.httpHeaderStream = parseHttpHeaders ? readHttpHeader() : null;
+    }
+    
+    /**
+     * Constructor.
+     *
+     * @param in Stream cue'd up to be at the start of the records metadata 
+     * this instance is to represent. 
+     * @param identifier Identifier for this the hosting Reader.
+     * @param offset Current offset into in (Used to keep
+     * position properly aligned).  Usually 0.
+     * @param digest True if we're to calculate digest for this record.  Not
+     * digesting saves about ~15% of cpu during an ARC parse.
+     * @param strict Be strict parsing (Parsing stops if ARC inproperly
+     * formatted).
+     * @param parseHttpHeaders True if we are to parse HTTP headers.  Costs
+     * about ~20% of CPU during an ARC parse.
+     * @param isAlignedOnFirstRecord True if this is the first record to be
+     * read from an archive
+     * @param version Version information to be returned to the
+     * ARCReader constructing this record 
+     * 
+     * @throws IOException
+     */
+    public ARCRecord(InputStream in, final String identifier,
+            final long offset, boolean digest, boolean strict,
+            final boolean parseHttpHeaders,
+            final boolean isAlignedOnFirstRecord, String version)
+    throws IOException {
+        super(in, null, 0, digest, strict);
+        // No header was handed in: build it from the metadata line(s) at the
+        // head of the stream.
+        ArchiveRecordHeader parsed = parseHeaders(in, identifier, offset,
+            strict, isAlignedOnFirstRecord, version);
+        setHeader(parsed);
+        this.httpHeaderStream = parseHttpHeaders ? readHttpHeader() : null;
+    }
+    
+    /**
+     * Constructor.
+     *
+     * @param in Stream cue'd up to be at the start of the records metadata 
+     * this instance is to represent.
+     * @param identifier Identifier for this the hosting Reader.
+     * @param offset Current offset into in (Used to keep
+     * position properly aligned).  Usually 0.
+     * @param digest True if we're to calculate digest for this record.  Not
+     * digesting saves about ~15% of cpu during an ARC parse.
+     * @param strict Be strict parsing (Parsing stops if ARC inproperly
+     * formatted).
+     * @param parseHttpHeaders True if we are to parse HTTP headers.  Costs
+     * about ~20% of CPU during an ARC parse.
+     * 
+     * @throws IOException
+     */
+    public ARCRecord(InputStream in, final String identifier, 
+                final long offset, boolean digest,      boolean strict, 
+                final boolean parseHttpHeaders) 
+    throws IOException {
+        // Delegate with isAlignedOnFirstRecord=false and version=null.
+        this(in, identifier, offset, digest, strict, parseHttpHeaders, 
+                false, null);
+    }
+    
+    /**
+     * Tokenize the record's metadata line and build the record header.
+     *
+     * When reading the first record of an archive (offset zero and aligned on
+     * the first record) two further header lines are consumed: the version
+     * is taken from the first two tokens of the second line and the third
+     * line is read and discarded.  The bytes of those two lines become the
+     * body offset.
+     *
+     * @param in Stream positioned at the metadata line.
+     * @param identifier Identifier passed through to the metadata.
+     * @param offset Offset of this record in the archive.
+     * @param strict Not consulted by this method.
+     * @param isAlignedOnFirstRecord True if reading the archive's first record.
+     * @param version Version to record; replaced when the first record is read.
+     * @return Header built from the metadata line.
+     * @throws IOException If a header line cannot be read or parsed.
+     */
+    private ArchiveRecordHeader parseHeaders(final InputStream in,
+        final String identifier, final long offset, final boolean strict,
+        final boolean isAlignedOnFirstRecord, String version)
+    throws IOException {
+        List<String> metaLineTokens = new ArrayList<String>(20);
+        getTokenizedHeaderLine(in, metaLineTokens);
+
+        int extraHeaderBytes = 0;
+        boolean readingFirstRecord = (offset == 0) && isAlignedOnFirstRecord;
+        if (readingFirstRecord) {
+            // The first record of an ARC 1.x file carries three lines of meta
+            // info, e.g.:
+            //
+            // filedesc://testIsBoundary-JunitIAH200401070157520.arc 0.0.0.0 \\
+            //      20040107015752 text/plain 77
+            // 1 0 InternetArchive
+            // URL IP-address Archive-date Content-type Archive-length
+            //
+            // Line one was tokenized above.  Only the first two fields of
+            // line two (the version) are kept.
+            List<String> versionLineTokens = new ArrayList<String>(20);
+            extraHeaderBytes += getTokenizedHeaderLine(in, versionLineTokens);
+            version = versionLineTokens.get(0) + "." + versionLineTokens.get(1);
+            // Line three lists the field names; they are hardcoded in this
+            // class, so the line is only read past.
+            extraHeaderBytes += getTokenizedHeaderLine(in, null);
+        }
+        setBodyOffset(extraHeaderBytes);
+
+        return computeMetaData(this.headerFieldNameKeys, metaLineTokens,
+            version, offset, identifier);
+    }
+    
+    /**
+     * Get a record header line as list of tokens.
+     *
+     * We keep reading till we find a LINE_SEPARATOR or we reach the end
+     * of file w/o finding a LINE_SEPARATOR or the line length is crazy.
+     *
+     * @param stream InputStream to read from.
+     * @param list Empty list that gets filled w/ string tokens.
+     * @return Count of characters read.
+     * @exception IOException If problem reading stream or no line separator
+     * found or EOF before EOL or we didn't get minimum header fields.
+     */
+    private int getTokenizedHeaderLine(final InputStream stream,
+            List<String> list) throws IOException {
+        // Preallocate usual line size.
+        StringBuilder buffer = new StringBuilder(2048 + 20);
+        int read = 0;
+        int previous = -1;
+        for (int c = -1; true;) {
+            previous = c;
+            c = stream.read();
+            if (c == -1) {
+                throw new RecoverableIOException("Hit EOF before header EOL.");
+            }
+            c &= 0xff;
+            read++;
+            if (read > MAX_HEADER_LINE_LENGTH) {
+                throw new IOException("Header line longer than max allowed " +
+                    " -- " + String.valueOf(MAX_HEADER_LINE_LENGTH) +
+                    " -- or passed buffer doesn't contain a line (Read: " +
+                    buffer.length() + ").  Here's" +
+                    " some of what was read: " +
+                    buffer.substring(0, Math.min(buffer.length(), 256)));
+            }
+
+            if (c == LINE_SEPARATOR) {
+                if (buffer.length() == 0) {
+                    // Empty line at start of buffer.  Skip it and try again.
+                    // NOTE(review): the buffer is also empty right after a
+                    // field separator, so a line ending in a separator is
+                    // skipped here too and parsing runs on into the next
+                    // line -- confirm whether that is intended.
+                    continue;
+                }
+
+                if (list != null) {
+                    list.add(buffer.toString());
+                }
+                // LOOP TERMINATION.
+                break;
+            } else if (c == HEADER_FIELD_SEPARATOR) {
+                if (!isStrict() && previous == HEADER_FIELD_SEPARATOR) {
+                    // Early ARCs sometimes had multiple spaces between fields.
+                    continue;
+                }
+                if (list != null) {
+                    list.add(buffer.toString());
+                }
+                // reset to empty
+                buffer.setLength(0);
+            } else {
+                buffer.append((char)c);
+            }
+        }
+
+        // List must have at least 3 elements in it and no more than 100.  If
+        // it has other than this, then bogus parse.
+        if (list != null && (list.size() < 3 || list.size() > 100)) {
+            throw new IOException("Unparseable header line: " + list);
+        }
+
+        // Save the header string from the first tokenized line only.  This
+        // method is called again for the second and third lines of an
+        // archive's first record (the third time with a null list); without
+        // the guard those calls would overwrite the saved line, finally
+        // with null.
+        if (list != null && this.headerString == null) {
+            this.headerString = StringUtils.join(list, " ");
+        }
+
+        return read;
+    }
+    
+    /**
+     * Compute metadata fields.
+     *
+     * Here we check the meta field has right number of items in it.
+     *
+     * @param keys Keys to use composing headerFields map.
+     * @param values Values to set into the headerFields map.
+     * @param v The version of this ARC file.
+     * @param offset Offset into arc file.
+     *
+     * @return Metadata structure for this record.
+     *
+     * @exception IOException  If no. of keys doesn't match no. of values.
+     */
+    private ARCRecordMetaData computeMetaData(List<String> keys,
+                List<String> values, String v, long offset, final String identifier)
+    throws IOException {
+        if (keys.size() != values.size()) {
+            List<String> originalValues = values;
+            if (!isStrict()) {
+                // First repair attempt: a URL that was written with spaces.
+                values = fixSpaceInURL(values, keys.size());
+                // If values still doesn't match key size, try and do
+                // further repair.
+                if (keys.size() != values.size()) {
+                    if (values.size() == (keys.size() + 1) &&
+                            values.get(4).toLowerCase().startsWith("charset=")) {
+                        // Early ARCs had a space in mimetype: glue the
+                        // "charset=..." token back onto the mimetype.
+                        List<String> nuvalues =
+                            new ArrayList<String>(keys.size());
+                        nuvalues.add(values.get(0));
+                        nuvalues.add(values.get(1));
+                        nuvalues.add(values.get(2));
+                        nuvalues.add(values.get(3) + values.get(4));
+                        nuvalues.add(values.get(5));
+                        values = nuvalues;
+                    } else if ((values.size() + 1) == keys.size() &&
+                            isLegitimateIPValue(values.get(1)) &&
+                            isDate(values.get(2)) && isNumber(values.get(3))) {
+                        // Mimetype is empty: insert "-" in its place.
+                        List<String> nuvalues =
+                            new ArrayList<String>(keys.size());
+                        nuvalues.add(values.get(0));
+                        nuvalues.add(values.get(1));
+                        nuvalues.add(values.get(2));
+                        nuvalues.add("-");
+                        nuvalues.add(values.get(3));
+                        values = nuvalues;
+                    }
+                }
+            }
+            if (keys.size() != values.size()) {
+                throw new IOException("Size of field name keys does" +
+                    " not match count of field values: " + values);
+            }
+            // Note that field was fixed on stderr.
+            System.err.println(Level.WARNING.toString() + "Fixed spaces in metadata line at " +
+                "offset " + offset +
+                " Original: " + originalValues + ", New: " + values);
+        }
+
+        Map<String, Object> headerFields =
+            new HashMap<String, Object>(keys.size() + 2);
+        for (int i = 0; i < keys.size(); i++) {
+            headerFields.put(keys.get(i), values.get(i));
+        }
+
+        // Add a check for tabs in URLs.  If any, replace with '%09'.
+        // See https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966,
+        // [ 1010966 ] crawl.log has URIs with spaces in them.
+        String url = (String)headerFields.get(URL_FIELD_KEY);
+        if (url != null && url.indexOf('\t') >= 0) {
+            headerFields.put(URL_FIELD_KEY,
+                TextUtils.replaceAll("\t", url, "%09"));
+        }
+
+        headerFields.put(VERSION_FIELD_KEY, v);
+        // Long.valueOf instead of the deprecated Long constructor.
+        headerFields.put(ABSOLUTE_OFFSET_KEY, Long.valueOf(offset));
+
+        return new ARCRecordMetaData(identifier, headerFields);
+    }
+
+    /**
+     * Fix space in URLs.
+     * The ARCWriter used to write into the ARC URLs with spaces in them.
+     * See [ 1010966 ]
+     * crawl.log has URIs with spaces in them.
+     * This method does fix up on such headers converting all spaces found
+     * to '%20'.
+     * @param values List of metadata values.
+     * @param requiredSize Expected size of resultant values list.
+     * @return New list if we successfully fixed up values or original if
+     * fixup failed.
+     */
+    private List fixSpaceInURL(List values, int requiredSize) {
+        // Do validity check. 3rd from last is a date of 14 numeric
+        // characters. The 4th from last is IP, all before the IP
+        // should be concatenated together with a '%20' joiner.
+        // In the below, '4' is 4th field from end which has the IP.
+        if (!(values.size() > requiredSize) || values.size() < 4) {
+            return values;
+        }
+        // Test 3rd field is valid date.
+        if (!isDate((String) values.get(values.size() - 3))) {
+            return values;
+        }
+
+        // Test 4th field is valid IP.
+        if (!isLegitimateIPValue((String) values.get(values.size() - 4))) {
+            return values;
+        }
+
+        List newValues = new ArrayList(requiredSize);
+        StringBuffer url = new StringBuffer();
+        for (int i = 0; i < (values.size() - 4); i++) {
+            if (i > 0) {
+                url.append("%20");
+            }
+            url.append(values.get(i));
+        }
+        newValues.add(url.toString());
+        for (int i = values.size() - 4; i < values.size(); i++) {
+            newValues.add(values.get(i));
+        }
+        return newValues;
+    }
+    
+    /**
+     * @param date Candidate date string.
+     * @return True if date is exactly 14 characters long and all of them
+     * are digits.
+     */
+    private boolean isDate(final String date) {
+        return date.length() == 14 && isNumber(date);
+    }
+    
+    /**
+     * @param n String to examine.
+     * @return True if every character of n is a digit (an empty string
+     * therefore also yields true).
+     */
+    private boolean isNumber(final String n) {
+        int idx = 0;
+        // Advance past leading digits; stops early at the first non-digit.
+        while (idx < n.length() && Character.isDigit(n.charAt(idx))) {
+            idx++;
+        }
+        return idx == n.length();
+    }
+    
+    /**
+     * @param ip Value from the IP column of a metadata line.
+     * @return True if ip is the "-" placeholder or matches
+     * InetAddressUtil.IPV4_QUADS.
+     */
+    private boolean isLegitimateIPValue(final String ip) {
+        return "-".equals(ip)
+            || InetAddressUtil.IPV4_QUADS.matcher(ip).matches();
+    }
+    
+    /**
+     * Skip over the http header if one present.
+     * 
+     * Subsequent reads will get the body.
+     * 
+     * 
+     * <p>Calling this method in the midst of reading the header
+     * will make for strange results.  Otherwise, safe to call
+     * at any time though before reading any of the arc record
+     * content is only time that it makes sense.
+     * 
+     * 
+     * <p>After calling this method, you can call
+     * {@link #getHttpHeaders()} to get the read http header.
+     * 
+     * @throws IOException
+     */
+    public void skipHttpHeader() throws IOException {
+        if (this.httpHeaderStream == null) {
+            return;
+        }
+        // Empty the httpHeaderStream.  read(...) nulls out the field once
+        // the header is exhausted, hence the null test on every pass.
+        int pending = this.httpHeaderStream.available();
+        while (this.httpHeaderStream != null
+                && (pending = this.httpHeaderStream.available()) > 0) {
+            // Expected to run once: the buffer takes everything available.
+            byte [] sink = new byte[pending];
+            read(sink, 0, pending);
+        }
+    }
+    
+    /**
+     * Write the not-yet-consumed http header bytes to STDOUT.
+     *
+     * Does nothing when there is no http header stream.  The bytes are
+     * consumed via this record's read method.
+     *
+     * @throws IOException If reading the header fails.
+     */
+    public void dumpHttpHeader() throws IOException {
+        if (this.httpHeaderStream != null) {
+            // read(...) nulls out httpHeaderStream once it is exhausted,
+            // hence the null test on every pass.
+            int pending = this.httpHeaderStream.available();
+            while (this.httpHeaderStream != null
+                    && (pending = this.httpHeaderStream.available()) > 0) {
+                // Expected to run once: the buffer takes all that is available.
+                byte[] chunk = new byte[pending];
+                int count = read(chunk, 0, pending);
+                System.out.write(chunk, 0, count);
+            }
+        }
+    }
+    
+    /**
+         * Read http header if present. Technique borrowed from HttpClient HttpParse
+         * class. set errors when found.
+         * 
+         * @return ByteArrayInputStream with the http header in it or null if no
+         *         http header.
+         * @throws IOException
+         */
+    private InputStream readHttpHeader() throws IOException {
+    	
+    	// this can be helpful when simply iterating over records, 
+    	// looking for problems.
+        Logger logger = Logger.getLogger(this.getClass().getName());
+    	ArchiveRecordHeader h = this.getHeader();
+    	
+        // If judged a record that doesn't have an http header, return
+        // immediately.
+        String url = getHeader().getUrl();
+        if(!url.startsWith("http") ||
+            getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
+            return null;
+        }
+        
+        String statusLine;
+        byte[] statusBytes;
+        int eolCharCount = 0;
+        int errOffset = 0;
+        
+        // Read status line, skipping any errant http headers found before it
+        // This allows a larger number of 'corrupt' arcs -- where headers were accidentally
+        // inserted before the status line to be readable
+        while (true) {
+        	statusBytes = LaxHttpParser.readRawLine(getIn());
+        	eolCharCount = getEolCharsCount(statusBytes);
+        	// No EOL means nothing, or only an unterminated line, was read.
+        	if (eolCharCount <= 0) {
+        		throw new RecoverableIOException(
+                "Failed to read http status where one was expected: " 
+                + ((statusBytes == null) ? "" : new String(statusBytes)));
+        	}
+        
+        	// Decode the line without its trailing EOL characters.
+        	statusLine = EncodingUtil.getString(statusBytes, 0,
+        			statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
+        	
+        	// If a null or DELETED break immediately
+        	if ((statusLine == null) || statusLine.startsWith("DELETED")) {
+        		break;
+        	}
+        	
+        	// If it's actually the status line, break, otherwise continue skipping any
+        	// previous header values
+        	// NOTE(review): any line containing ':' is treated as an errant
+        	// header, even if it starts like a status line.
+        	if (!statusLine.contains(":") && StatusLine.startsWithHTTP(statusLine)) {
+        		break;
+        	}
+        	
+        	// Add bytes read to error "offset" to add to position
+        	errOffset += statusBytes.length;
+        }
+        
+        // Account for the skipped lines; they are not copied into the header
+        // bytes saved below.
+        if (errOffset > 0) {
+            this.incrementPosition(errOffset);
+        }
+        
+        // NOTE(review): the loop above only exits with a null line, a
+        // DELETED line, or a line that passed startsWithHTTP.  A null
+        // statusLine would make statusLine.startsWith(...) below throw a
+        // NullPointerException, and the HTTP_STATUS_LINE_INVALID branch
+        // looks unreachable otherwise -- confirm.
+        if ((statusLine == null) ||
+                !StatusLine.startsWithHTTP(statusLine)) {
+            if (statusLine.startsWith("DELETED")) {
+                // Some old ARCs have deleted records like following:
+                // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202
+                // DELETED_TIME=20000425001133_DELETER=Kurt_REASON=alexalist
+                // (follows ~29K spaces)
+                // For now, throw a RecoverableIOException so if iterating over
+                // records, we keep going.  TODO: Later make a legitimate
+                // ARCRecord from the deleted record rather than throw
+                // exception.
+                throw new DeletedARCRecordIOException(statusLine);
+            } else {
+            	this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_INVALID);
+            }
+        }
+
+        // A status line that fails to parse is recorded as an error rather
+        // than aborting; httpStatus then stays null.
+        try {
+        	this.httpStatus = new StatusLine(statusLine);
+        } catch(IOException e) {
+        	logger.warning(e.getMessage() + " at offset: " + h.getOffset());
+        	this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_EXCEPTION);
+        }
+        
+        // Save off all bytes read.  Keep them as bytes rather than
+        // convert to strings so we don't have to worry about encodings
+        // though this should never be a problem doing http headers since
+        // its all supposed to be ascii.
+        ByteArrayOutputStream baos =
+            new ByteArrayOutputStream(statusBytes.length + 4 * 1024);
+        baos.write(statusBytes);
+        
+        // Now read rest of the header lines looking for the separation
+        // between header and body.
+        for (byte [] lineBytes = null; true;) {
+            lineBytes = LaxHttpParser.readRawLine(getIn());
+            eolCharCount = getEolCharsCount(lineBytes);
+            if (eolCharCount <= 0) {
+            	// No EOL: truncated header if the stream has nothing more
+            	// available, otherwise a read failure.
+            	if (getIn().available() == 0) {
+            		// NOTE(review): httpHeaderBytesRead is initialised to -1
+            		// and the status line length is added only on this
+            		// truncated path -- confirm the intended accounting.
+            		httpHeaderBytesRead += statusBytes.length;
+                	logger.warning("HTTP header truncated at offset: " + h.getOffset());
+            		this.errors.add(ArcRecordErrors.HTTP_HEADER_TRUNCATED);
+            		this.setEor(true);
+            		break;
+            	} else {
+            		throw new IOException("Failed reading http headers: " +
+            				((lineBytes != null)? new String(lineBytes): null));
+            	}
+            } else {
+            	httpHeaderBytesRead += lineBytes.length;
+            }
+            // Save the bytes read.
+            baos.write(lineBytes);
+            // A line holding nothing but its EOL ends the header.
+            if ((lineBytes.length - eolCharCount) <= 0) {
+                // We've finished reading the http header.
+                break;
+            }
+        }
+        
+        byte [] headerBytes = baos.toByteArray();
+        // Save off where body starts.
+        this.getMetaData().setContentBegin(headerBytes.length);
+        ByteArrayInputStream bais =
+            new ByteArrayInputStream(headerBytes);
+        if (!bais.markSupported()) {
+            throw new IOException("ByteArrayInputStream does not support mark");
+        }
+        // Mark the start so the stream can be rewound after parsing.
+        bais.mark(headerBytes.length);
+        // Read the status line.  Don't let it into the parseHeaders function.
+        // It doesn't know what to do with it.
+        bais.read(statusBytes, 0, statusBytes.length);
+        this.httpHeaders = LaxHttpParser.parseHeaders(bais,
+            ARCConstants.DEFAULT_ENCODING);
+        this.getMetaData().setStatusCode(Integer.toString(getStatusCode()));
+        // Rewind so the returned stream replays the saved bytes from the
+        // status line onwards.
+        bais.reset();
+        return bais;
+    }
+    
+    /**
+     * Recoverable exception thrown by readHttpHeader when the line read in
+     * place of the http status line starts with "DELETED".
+     */
+    private static class DeletedARCRecordIOException
+    extends RecoverableIOException {
+        private static final long serialVersionUID = 1L;
+
+        /**
+         * @param reason Exception message; readHttpHeader passes the
+         * offending line.
+         */
+        public DeletedARCRecordIOException(final String reason) {
+            super(reason);
+        }
+    }
+    
+    /**
+     * Return status code for this record.
+     * 
+     * This method will return -1 until the http header has been read.
+     * @return Status code.
+     */
+    public int getStatusCode() {
+        if (this.httpStatus == null) {
+            // No status line has been parsed.
+            return -1;
+        }
+        return this.httpStatus.getStatusCode();
+    }
+    
+    /**
+     * @param bytes Array of bytes to examine for an EOL.
+     * @return Count of end-of-line characters or zero if none.
+     */
+    private int getEolCharsCount(byte [] bytes) {
+        // Without a trailing '\n' there is no EOL at all.
+        if (bytes == null || bytes.length < 1
+                || bytes[bytes.length - 1] != '\n') {
+            return 0;
+        }
+        // A '\r' just before the '\n' makes it a two-character EOL.
+        boolean crlf = bytes.length >= 2 && bytes[bytes.length - 2] == '\r';
+        return crlf ? 2 : 1;
+    }
+
+    /**
+     * @return Meta data for this record.
+     */
+    public ARCRecordMetaData getMetaData() {
+        return (ARCRecordMetaData)getHeader();
+    }
+    
+    /**
+     * @return http headers (Only available after header has been read).
+     */
+    public Header [] getHttpHeaders() {
+        // Null until readHttpHeader has parsed the headers.
+        return httpHeaders;
+    }
+    
+    /**
+     * @return ArcRecordErrors encountered when reading 
+     */
+    public List getErrors() {
+        // Hands out the live list, not a copy.
+        return errors;
+    }
+    
+    /**
+     * @return true if ARC record errors found 
+     */
+    public boolean hasErrors() {
+    	return !this.errors.isEmpty();
+    }
+    
+    /**
+     * @return Next character in this ARCRecord's content else -1 if at end of
+     * this record.
+     * @throws IOException
+     */
+    public int read() throws IOException {
+        boolean headerPending = this.httpHeaderStream != null
+            && this.httpHeaderStream.available() > 0;
+        if (!headerPending) {
+            // No buffered header bytes left: go to the underlying stream.
+            return super.read();
+        }
+        // Hand out buffered http header bytes first.
+        int c = this.httpHeaderStream.read();
+        // If done with the header stream, null it out.
+        if (this.httpHeaderStream.available() <= 0) {
+            this.httpHeaderStream = null;
+        }
+        incrementPosition();
+        return c;
+    }
+
+    /**
+     * Read up to length bytes of this record into b.
+     *
+     * Buffered http header bytes, if any remain, are handed out before the
+     * underlying stream is consulted.  A single call never mixes the two.
+     *
+     * @param b Buffer to fill.
+     * @param offset Index in b at which to start storing bytes.
+     * @param length Maximum number of bytes to read.
+     * @return Count of bytes read; 0 if length is zero while header bytes
+     * are pending; otherwise whatever the underlying read returns.
+     * @throws IOException If the read fails.
+     */
+    @Override
+    public int read(byte [] b, int offset, int length) throws IOException {
+        if (this.httpHeaderStream == null ||
+                this.httpHeaderStream.available() <= 0) {
+            return super.read(b, offset, length);
+        }
+        if (length == 0) {
+            // Nothing was asked for.  Previously this reported -1 (a false
+            // end-of-record) and moved the position back by one.
+            return 0;
+        }
+        // Hand out header bytes, but never more than remain buffered.
+        int read = this.httpHeaderStream.read(b, offset,
+            Math.min(length, this.httpHeaderStream.available()));
+        // If done with the header stream, null it out.
+        if (this.httpHeaderStream.available() <= 0) {
+            this.httpHeaderStream = null;
+        }
+        // Only advance the position by bytes actually delivered.
+        if (read > 0) {
+            incrementPosition(read);
+        }
+        return read;
+    }
+
+    /**
+     * @return Offset at which the body begins (Only known after
+     * header has been read) or -1 if none or if we haven't read
+     * headers yet.  Usually length of HTTP headers (does not include ARC
+     * metadata line length).
+     */
+    public int getBodyOffset() {
+        return this.getMetaData().getContentBegin();
+    }
+    
+    /**
+     * @param h Header to take the IP from.
+     * @return IP held by h when it is an ARCRecordMetaData with a non-null
+     * IP, else whatever the superclass returns.
+     */
+    @Override
+    protected String getIp4Cdx(ArchiveRecordHeader h) {
+        if (h instanceof ARCRecordMetaData) {
+            String ip = ((ARCRecordMetaData) h).getIp();
+            if (ip != null) {
+                return ip;
+            }
+        }
+        return super.getIp4Cdx(h);
+    }
+    
+    /**
+     * @param h Header to take the status code from.
+     * @return Status code held by h when it is an ARCRecordMetaData with a
+     * non-null status code, else whatever the superclass returns.
+     */
+    @Override
+    protected String getStatusCode4Cdx(ArchiveRecordHeader h) {
+        if (h instanceof ARCRecordMetaData) {
+            String status = ((ARCRecordMetaData) h).getStatusCode();
+            if (status != null) {
+                return status;
+            }
+        }
+        return super.getStatusCode4Cdx(h);
+    }
+    
+    /**
+     * @param h Header to take the digest from.
+     * @return Digest held by h when it is an ARCRecordMetaData with a
+     * non-null digest, else whatever the superclass returns.
+     */
+    @Override
+    protected String getDigest4Cdx(ArchiveRecordHeader h) {
+        if (h instanceof ARCRecordMetaData) {
+            String digest = ((ARCRecordMetaData) h).getDigest();
+            if (digest != null) {
+                return digest;
+            }
+        }
+        return super.getDigest4Cdx(h);
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/arc/ARCRecordMetaData.java b/src/main/java/org/archive/io/arc/ARCRecordMetaData.java
new file mode 100644
index 00000000..3f617041
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCRecordMetaData.java
@@ -0,0 +1,267 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.archive.io.arc;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+
+import org.archive.io.ArchiveRecordHeader;
+
+
+/**
+ * Holds the meta data of an ARC record.  Not immutable: the digest, status
+ * code and content-begin offset can be set after construction.
+ *
+ * @author stack
+ */
+public class ARCRecordMetaData implements ArchiveRecordHeader, ARCConstants {
+    /**
+     * Map of record header fields.
+     *
+     * We store all in a hashmap.  This way we can hold version 1 or
+     * version 2 record meta data.
+     *
+     * <p>Keys are lowercase.
+     */
+    protected Map headerFields = null;
+
+    /**
+     * Digest for the record.
+     *
+     * Only available after the record has been read in totality.
+     */
+    private String digest = null;
+
+    /**
+     * Status for this request.
+     *
+     * There may be no status.
+     */
+    private String statusCode = null;
+
+    /**
+     * The arc this metadata came out of.
+     * Descriptive String, either path or URL.
+     */
+    private String arc = null;
+
+    /**
+     * Offset at which the record content begins.  Defaults to 0 until
+     * {@link #setContentBegin(int)} is called.
+     */
+    private int contentBegin = 0;
+
+    /**
+     * Shut down the default constructor.
+     */
+    protected ARCRecordMetaData() {
+        super();
+    }
+
+    /**
+     * Constructor.
+     *
+     * @param arc The arc file this metadata came out of.
+     * @param headerFields Hash of meta fields.  The passed map is kept by
+     * reference; it is not copied.
+     *
+     * @throws IOException If one of the required version 1 header fields
+     * is missing from <code>headerFields</code>.
+     */
+    public ARCRecordMetaData(final String arc, Map headerFields)
+        throws IOException {
+        // Make sure the minimum required fields are present.
+        for (Object requiredField : REQUIRED_VERSION_1_HEADER_FIELDS) {
+            testRequiredField(headerFields, (String) requiredField);
+        }
+        this.headerFields = headerFields;
+        this.arc = arc;
+    }
+
+    /**
+     * Test required field is present in hash.
+     *
+     * @param fields Map of fields.
+     * @param requiredField Field to test for.
+     *
+     * @exception IOException If required field is not present.
+     */
+    protected void testRequiredField(Map fields, String requiredField)
+        throws IOException {
+        if (!fields.containsKey(requiredField)) {
+            throw new IOException("Required field " + requiredField +
+            " not in meta data.");
+        }
+    }
+
+    /**
+     * Get the time when the record was harvested.
+     *
+     * <p>Returns the date in Heritrix 14 digit time format (UTC). See the
+     * {@link org.archive.util.ArchiveUtils} class for converting to Java
+     * dates.
+     *
+     * @return Header date in Heritrix 14 digit format.
+     * @see org.archive.util.ArchiveUtils#parse14DigitDate(String)
+     */
+    public String getDate() {
+        return (String) this.headerFields.get(DATE_FIELD_KEY);
+    }
+
+    /**
+     * @return Return length of the record, parsed from the length header
+     * field.
+     */
+    public long getLength() {
+        return Long.parseLong((String) this.headerFields.get(LENGTH_FIELD_KEY));
+    }
+
+    /**
+     * @return Return Content-Length of the contents of the record
+     * Same as record length for arcs? TODO
+     */
+    public long getContentLength() {
+        return getLength();
+    }
+
+    /**
+     * @return Header url.
+     */
+    public String getUrl() {
+        return (String) this.headerFields.get(URL_FIELD_KEY);
+    }
+
+    /**
+     * @return IP.
+     */
+    public String getIp() {
+        return (String) this.headerFields.get(IP_HEADER_FIELD_KEY);
+    }
+
+    /**
+     * @return mimetype The mimetype that is in the ARC metaline -- NOT the http
+     * content-type content.
+     */
+    public String getMimetype() {
+        return (String) this.headerFields.get(MIMETYPE_FIELD_KEY);
+    }
+
+    /**
+     * @return Arcfile version.
+     */
+    public String getVersion() {
+        return (String) this.headerFields.get(VERSION_FIELD_KEY);
+    }
+
+    /**
+     * The offset must have been stored in the header map as a
+     * <code>Long</code>; a missing entry results in a
+     * <code>NullPointerException</code>.
+     *
+     * @return Offset into arcfile at which this record begins.
+     */
+    public long getOffset() {
+        return ((Long) this.headerFields.get(ABSOLUTE_OFFSET_KEY)).longValue();
+    }
+
+    /**
+     * @param key Key to use looking up field value.
+     * @return value for passed key or null if no such entry.
+     */
+    public Object getHeaderValue(String key) {
+        return this.headerFields.get(key);
+    }
+
+    /**
+     * @return Header field name keys.
+     */
+    public Set getHeaderFieldKeys() {
+        return this.headerFields.keySet();
+    }
+
+    /**
+     * @return Map of header fields.  This is the internal map itself, not
+     * a copy.
+     */
+    public Map getHeaderFields() {
+        return this.headerFields;
+    }
+
+    /**
+     * @return Returns identifier for ARC.
+     */
+    public String getArc() {
+        return this.arc;
+    }
+
+    /**
+     * @return Convenience method that does a
+     * <code>return new File(this.arc)</code> (Be aware this.arc is not always
+     * full path to an ARC file -- may be an URL).  Test
+     * returned file for existence.
+     */
+    public File getArcFile() {
+        return new File(this.arc);
+    }
+
+    /**
+     * @return Returns the digest.
+     */
+    public String getDigest() {
+        return this.digest;
+    }
+
+    /**
+     * @param d The digest to set.
+     */
+    public void setDigest(String d) {
+        this.digest = d;
+    }
+
+    /**
+     * @return Returns the statusCode.  May be null.
+     */
+    public String getStatusCode() {
+        return this.statusCode;
+    }
+
+    /**
+     * @param statusCode The statusCode to set.
+     */
+    public void setStatusCode(String statusCode) {
+        this.statusCode = statusCode;
+    }
+
+    /**
+     * @return The arc identifier and the header field map joined by
+     * <code>": "</code>; a null part is rendered as the empty string.
+     */
+    @Override
+    public String toString() {
+        return ((this.arc != null)? this.arc: "") +
+           ": " +
+           ((this.headerFields != null)? this.headerFields.toString():  "");
+    }
+
+    /**
+     * @return Identifier of the reader this record came from; same as
+     * {@link #getArc()}.
+     */
+    public String getReaderIdentifier() {
+        return this.getArc();
+    }
+
+    /**
+     * @return Record identifier built as <code>date + "/" + url</code>.
+     */
+    public String getRecordIdentifier() {
+        return getDate() + "/" + getUrl();
+    }
+
+    /**
+     * @return Offset at which the record content begins.
+     */
+    public int getContentBegin() {
+        return this.contentBegin;
+    }
+
+    /**
+     * @param offset Offset at which the record content begins.
+     */
+    protected void setContentBegin(final int offset) {
+        this.contentBegin = offset;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/arc/ARCUtils.java b/src/main/java/org/archive/io/arc/ARCUtils.java
new file mode 100644
index 00000000..985457e2
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCUtils.java
@@ -0,0 +1,240 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.io.arc;
+
+import it.unimi.dsi.fastutil.io.RepositionableStream;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.net.URISyntaxException;
+
+import org.archive.url.UsableURI;
+import org.archive.util.zip.GzipHeader;
+import org.archive.util.zip.NoGzipMagicException;
+
+public class ARCUtils implements ARCConstants {
+    /**
+     * @param pathOrUri Path or URI to extract arc filename from.
+     * @return Extracted arc file name.
+     * @throws URISyntaxException If <code>pathOrUri</code> has a scheme but
+     * cannot be parsed as a URI.
+     */
+    public static String parseArcFilename(final String pathOrUri)
+    throws URISyntaxException {
+        String path = pathOrUri;
+        if (UsableURI.hasScheme(pathOrUri)) {
+            URI url = new URI(pathOrUri);
+            path = url.getPath();
+        }
+        return (new File(path)).getName();
+    }
+
+    /**
+     * @param arcFile File to test.
+     * @return True if arcFile is compressed ARC.
+     * @throws IOException
+     */
+    public static boolean isCompressed(File arcFile) throws IOException {
+        return testCompressedARCFile(arcFile);
+    }
+
+    /**
+     * Check file is compressed and in ARC GZIP format.
+     *
+     * @param arcFile File to test if its Internet Archive ARC file
+     * GZIP compressed.
+     *
+     * @return True if this is an Internet Archive GZIP'd ARC file (It begins
+     * w/ the Internet Archive GZIP header and has the
+     * COMPRESSED_ARC_FILE_EXTENSION suffix).
+     *
+     * @exception IOException If file does not exist or is not readable.
+     */
+    public static boolean testCompressedARCFile(File arcFile)
+    throws IOException {
+        return testCompressedARCFile(arcFile, false);
+    }
+
+    /**
+     * Check file is compressed and in ARC GZIP format.
+     *
+     * @param arcFile File to test if its Internet Archive ARC file
+     * GZIP compressed.
+     * @param skipSuffixCheck Set to true if we're not to test on the
+     * '.arc.gz' suffix.
+     *
+     * @return True if this is an Internet Archive GZIP'd ARC file (It begins
+     * w/ the Internet Archive GZIP header).
+     *
+     * @exception IOException If file does not exist or is not readable.
+     */
+    public static boolean testCompressedARCFile(File arcFile,
+            boolean skipSuffixCheck)
+    throws IOException {
+        isReadable(arcFile);
+        if (!skipSuffixCheck && !arcFile.getName().toLowerCase()
+                .endsWith(COMPRESSED_ARC_FILE_EXTENSION)) {
+            return false;
+        }
+
+        final InputStream is = new FileInputStream(arcFile);
+        try {
+            return testCompressedARCStream(is);
+        } finally {
+            is.close();
+        }
+    }
+
+    /**
+     * Tests passed stream is gzip stream by reading in the HEAD.
+     * Does not reposition the stream.  That is left up to the caller.
+     * @param is An InputStream.
+     * @return True if the stream starts with a gzip header whose extra
+     * field matches the Internet Archive one.
+     * @throws IOException
+     */
+    public static boolean testCompressedARCStream(final InputStream is)
+            throws IOException {
+        GzipHeader gh;
+        try {
+            gh = new GzipHeader(is);
+        } catch (NoGzipMagicException e) {
+            // Not a gzip stream at all, so certainly not a compressed ARC.
+            return false;
+        }
+
+        byte[] fextra = gh.getFextra();
+        // Now make sure following bytes are IA GZIP comment.
+        // First check length. ARC_GZIP_EXTRA_FIELD includes length
+        // so subtract two and start compare to ARC_GZIP_EXTRA_FIELD
+        // at +2.
+        if (fextra == null
+                || ARC_GZIP_EXTRA_FIELD.length - 2 != fextra.length) {
+            return false;
+        }
+        for (int i = 0; i < fextra.length; i++) {
+            if (fextra[i] != ARC_GZIP_EXTRA_FIELD[i + 2]) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Tests passed stream is gzip stream by reading in the HEAD.
+     * Does reposition of stream when done.
+     * @param rs An InputStream that is Repositionable.
+     * @return True if compressed stream.
+     * @throws IOException
+     */
+    public static boolean testCompressedRepositionalStream(
+            final RepositionableStream rs)
+    throws IOException {
+        // Remember where we started so the probe leaves no trace.
+        final long p = rs.position();
+        try {
+            return testCompressedStream((InputStream)rs);
+        } finally {
+            rs.position(p);
+        }
+    }
+
+    /**
+     * Tests passed stream is gzip stream by reading in the HEAD.
+     * Does not reposition the stream.  That is left up to the caller.
+     * @param is An InputStream.
+     * @return True if a gzip header could be read from the stream.
+     * @throws IOException
+     */
+    public static boolean testCompressedStream(final InputStream is)
+    throws IOException {
+        try {
+            new GzipHeader(is);
+            return true;
+        } catch (NoGzipMagicException e) {
+            // No gzip magic: the stream is not compressed.
+            return false;
+        }
+    }
+
+    /**
+     * Check file is uncompressed ARC file.
+     *
+     * @param arcFile
+     *            File to test if its Internet Archive ARC file uncompressed.
+     *
+     * @return True if this is an Internet Archive ARC file: it has the
+     * ARC_FILE_EXTENSION suffix and begins with ARC_MAGIC_NUMBER
+     * (compared ignoring case).
+     *
+     * @exception IOException
+     *                If file does not exist or is not readable.
+     */
+    public static boolean testUncompressedARCFile(File arcFile)
+    throws IOException {
+        isReadable(arcFile);
+        if (!arcFile.getName().toLowerCase().endsWith(ARC_FILE_EXTENSION)) {
+            return false;
+        }
+
+        final int magicLength = ARC_MAGIC_NUMBER.length();
+        final byte [] b = new byte[magicLength];
+        int read = 0;
+        final FileInputStream fis = new FileInputStream(arcFile);
+        try {
+            // A single read() may legally return fewer bytes than asked
+            // for, so keep reading until the buffer is full or EOF is hit.
+            while (read < magicLength) {
+                int n = fis.read(b, read, magicLength - read);
+                if (n < 0) {
+                    break;
+                }
+                read += n;
+            }
+        } finally {
+            // Single close point; the stream is no longer needed.
+            fis.close();
+        }
+        if (read != magicLength) {
+            // File is shorter than the magic number.
+            return false;
+        }
+
+        StringBuilder beginStr = new StringBuilder(magicLength);
+        for (int i = 0; i < magicLength; i++) {
+            beginStr.append((char)b[i]);
+        }
+        return beginStr.toString().equalsIgnoreCase(ARC_MAGIC_NUMBER);
+    }
+
+
+    /**
+     * @param arcFile File to test.
+     * @exception IOException If file does not exist or is not readable.
+     */
+    private static void isReadable(File arcFile) throws IOException {
+        if (!arcFile.exists()) {
+            throw new FileNotFoundException(arcFile.getAbsolutePath() +
+                " does not exist.");
+        }
+
+        if (!arcFile.canRead()) {
+            throw new FileNotFoundException(arcFile.getAbsolutePath() +
+                " is not readable.");
+        }
+    }
+}
diff --git a/src/main/java/org/archive/io/arc/ARCWriter.java b/src/main/java/org/archive/io/arc/ARCWriter.java
new file mode 100644
index 00000000..b5825d50
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCWriter.java
@@ -0,0 +1,459 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.archive.io.arc;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.Closeable;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Iterator;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.archive.io.ReplayInputStream;
+import org.archive.io.WriterPoolMember;
+import org.archive.io.WriterPoolSettings;
+import org.archive.util.ArchiveUtils;
+import org.archive.util.DevUtils;
+import org.archive.util.MimetypeUtils;
+
+
+/**
+ * Write ARC files.
+ *
+ * Assumption is that the caller is managing access to this ARCWriter ensuring
+ * only one thread of control accessing this ARC file instance at any one time.
+ *
+ * <p>ARC files are described here:
+ * Arc
+ * File Format.  This class does version 1 of the ARC file format.  It also
+ * writes version 1.1 which is version 1 with data stuffed into the body of the
+ * first arc record in the file, the arc file meta record itself.
+ *
+ * <p>An ARC file is three lines of meta data followed by an optional 'body' and
+ * then a couple of '\n' and then: record, '\n', record, '\n', record, etc.
+ * If we are writing compressed ARC files, then each of the ARC file records is
+ * individually gzipped and concatenated together to make up a single ARC file.
+ * In GZIP terms, each ARC record is a GZIP member of a total gzip'd
+ * file.
+ *
+ * <p>The GZIPping of the ARC file meta data is exceptional.  It is GZIPped
+ * w/ an extra GZIP header, a special Internet Archive (IA) extra header field
+ * (e.g. FEXTRA is set in the GZIP header FLG field and an extra field is
+ * appended to the GZIP header).  The extra field has little in it but its
+ * presence denotes this GZIP as an Internet Archive gzipped ARC.  See RFC1952
+ * to learn about the GZIP header structure.
+ *
+ * <p>This class then does its GZIPping in the following fashion.  Each GZIP
+ * member is written w/ a new instance of GZIPOutputStream -- actually
+ * ARCWriterGZIPOututStream so we can get access to the underlying stream.
+ * The underlying stream stays open across GZIPoutputStream instantiations.
+ * For the 'special' GZIPing of the ARC file meta data, we cheat by catching the
+ * GZIPOutputStream output into a byte array, manipulating it adding the
+ * IA GZIP header, before writing to the stream.
+ *
+ * <p>I tried writing a resettable GZIPOutputStream and could make it work w/
+ * the SUN JDK but the IBM JDK threw NPE inside in the deflate.reset -- its zlib
+ * native call doesn't seem to like the notion of resetting -- so I gave up on
+ * it.
+ *
+ * <p>Because of the above and troubles with GZIPInputStream, we should
+ * write our own GZIP*Streams, ones that are resettable and conscious of gzip
+ * members.
+ *
+ * <p>This class will write until we hit >= maxSize.  The check is done at
+ * record boundary.  Records do not span ARC files.  We will then close current
+ * file and open another and then continue writing.
+ *
+ * <p>TESTING: Here is how to test that produced ARC files are good
+ * using the
+ * alexa
+ * ARC c-tools:
+ * 
+ * % av_procarc hx20040109230030-0.arc.gz | av_ziparc > \
+ *     /tmp/hx20040109230030-0.dat.gz
+ * % av_ripdat /tmp/hx20040109230030-0.dat.gz > /tmp/hx20040109230030-0.cdx
+ * 
+ * Examine the produced cdx file to make sure it makes sense.  Search
+ * for 'no-type 0'.  If found, then we're opening a gzip record w/o data to
+ * write.  This is bad.
+ *
+ * You can also do gzip -t FILENAME and it will tell you if the
+ * ARC makes sense to GZIP.
+ * 
+ * <p>While being written, ARCs have a '.open' suffix appended.
+ *
+ * @author stack
+ */
+public class ARCWriter extends WriterPoolMember implements ARCConstants, Closeable {
+    private static final Logger logger =
+        Logger.getLogger(ARCWriter.class.getName());
+
+    /**
+     * Metadata line pattern: five whitespace-free fields separated by single
+     * spaces, optionally followed by a line separator.
+     */
+    private static final Pattern METADATA_LINE_PATTERN =
+        Pattern.compile("^\\S+ \\S+ \\S+ \\S+ \\S+(" + LINE_SEPARATOR + "?)$");
+
+
+    /**
+     * Constructor.
+     * Takes a stream. Use with caution. There is no upperbound check on size.
+     * Will just keep writing.
+     *
+     * <p>The ARC file meta record is written immediately, stamped with the
+     * current time.
+     *
+     * @param serialNo  used to generate unique file name sequences
+     * @param out Where to write.
+     * @param arc File the out is connected to.
+     * @param settings creation parameters, handed to the superclass
+     * @throws IOException
+     */
+    public ARCWriter(final AtomicInteger serialNo, final PrintStream out,
+        final File arc, final WriterPoolSettings settings)
+    throws IOException {
+        super(serialNo, out, arc, settings);
+        writeFirstRecord(ArchiveUtils.get14DigitDate());
+    }
+
+    /**
+     * Constructor.
+     *
+     * @param serialNo  used to generate unique file name sequences
+     * @param settings all creation parameters
+     */
+    public ARCWriter(final AtomicInteger serialNo, final WriterPoolSettings settings) {
+        super(serialNo, settings, ARC_FILE_EXTENSION);
+    }
+
+    /**
+     * Creates the file via the superclass and then writes the ARC file meta
+     * record, stamped with <code>currentTimestamp</code>, as its first record.
+     *
+     * @return Name returned by the superclass <code>createFile()</code>.
+     * @throws IOException
+     */
+    @Override
+    protected String createFile()
+    throws IOException {
+        String name = super.createFile();
+        writeFirstRecord(currentTimestamp);
+        return name;
+    }
+
+    /**
+     * Writes the ARC file meta record.
+     *
+     * @param ts Timestamp to put into the meta record.
+     * @throws IOException
+     */
+    private void writeFirstRecord(final String ts)
+    throws IOException {
+        write(generateARCFileMetaData(ts));
+    }
+
+    /**
+     * Write out the ARCMetaData.
+     *
+     * <p>Generate ARC file meta data.  Currently we only do version 1 of the
+     * ARC file formats or version 1.1 when metadata has been supplied (We
+     * write it into the body of the first record in the arc file).
+     *
+     * <p>Version 1 metadata looks roughly like this:
+     *
+     * <pre>filedesc://testWriteRecord-JunitIAH20040110013326-2.arc 0.0.0.0 \\
+     *  20040110013326 text/plain 77
+     * 1 0 InternetArchive
+     * URL IP-address Archive-date Content-type Archive-length
+     * </pre>
+     *
+     * <p>If compress is set, then we generate a header that has been gzipped
+     * in the Internet Archive manner.   Such a gzipping enables the FEXTRA
+     * flag in the FLG field of the gzip header.  It then appends an extra
+     * header field: '8', '0', 'L', 'X', '0', '0', '0', '0'.  The first two
+     * bytes are the length of the field and the last 6 bytes the Internet
+     * Archive header.  To learn about GZIP format, see RFC1952.  To learn
+     * about the Internet Archive extra header field, read the source for
+     * av_ziparc which can be found at
+     * <code>alexa/vista/alexa-tools-1.2/src/av_ziparc.cc</code>.
+     *
+     * <p>We do things in this roundabout manner because the java
+     * GZIPOutputStream does not give access to GZIP header fields.
+     *
+     * @param date Date to put into the ARC metadata; if 17-digit will be
+     * truncated to traditional 14-digits
+     *
+     * @return Byte array filled w/ the arc header.
+     * @throws IOException
+     */
+    private byte [] generateARCFileMetaData(String date)
+    throws IOException {
+        if(date!=null && date.length()>14) {
+            date = date.substring(0,14);
+        }
+        int metadataBodyLength = getMetadataLength();
+        // If metadata body, then the minor part of the version is '1' rather
+        // than '0'.
+        String metadataHeaderLinesTwoAndThree =
+            getMetadataHeaderLinesTwoAndThree("1 " +
+                ((metadataBodyLength > 0)? "1": "0"));
+        int recordLength = metadataBodyLength +
+            metadataHeaderLinesTwoAndThree.getBytes(DEFAULT_ENCODING).length;
+        String metadataHeaderStr = ARC_MAGIC_NUMBER + getBaseFilename() +
+            " 0.0.0.0 " + date + " text/plain " + recordLength +
+            metadataHeaderLinesTwoAndThree;
+        ByteArrayOutputStream metabaos =
+            new ByteArrayOutputStream(recordLength);
+        // Write the metadata header.
+        metabaos.write(metadataHeaderStr.getBytes(DEFAULT_ENCODING));
+        // Write the metadata body, if anything to write.
+        if (metadataBodyLength > 0) {
+            writeMetaData(metabaos);
+        }
+
+        // Write out a LINE_SEPARATOR to end this record.
+        metabaos.write(LINE_SEPARATOR);
+
+        // Now get bytes of all just written and compress if flag set.
+        byte [] bytes = metabaos.toByteArray();
+
+        if(isCompressed()) {
+            // GZIP the header but catch the gzipping into a byte array so we
+            // can add the special IA GZIP header to the product.  After
+            // manipulations, write to the output stream (The JAVA GZIP
+            // implementation does not give access to GZIP header. It
+            // produces a 'default' header only).  We can get away w/ these
+            // manipulations because the GZIP 'default' header doesn't
+            // do the 'optional' CRC'ing of the header.
+            byte [] gzippedMetaData = ArchiveUtils.gzip(bytes);
+            if (gzippedMetaData[3] != 0) {
+                throw new IOException("The GZIP FLG header is unexpectedly " +
+                    " non-zero.  Need to add smarter code that can deal " +
+                    " when already extant extra GZIP header fields.");
+            }
+            // Set the GZIP FLG header to '4' which says that the GZIP header
+            // has extra fields.  Then insert the alexa {'L', 'X', '0', '0',
+            // '0', '0'} 'extra' field.  The IA GZIP header will also set byte
+            // 9 (zero-based), the OS byte, to 3 (Unix).  We'll do the same.
+            gzippedMetaData[3] = 4;
+            gzippedMetaData[9] = 3;
+            byte [] assemblyBuffer = new byte[gzippedMetaData.length +
+                ARC_GZIP_EXTRA_FIELD.length];
+            // '10' in the below is a pointer past the following bytes of the
+            // GZIP header: ID1 ID2 CM FLG + MTIME(4-bytes) XFL OS.  See
+            // RFC1952 for explanation of the abbreviations just used.
+            System.arraycopy(gzippedMetaData, 0, assemblyBuffer, 0, 10);
+            System.arraycopy(ARC_GZIP_EXTRA_FIELD, 0, assemblyBuffer, 10,
+                ARC_GZIP_EXTRA_FIELD.length);
+            System.arraycopy(gzippedMetaData, 10, assemblyBuffer,
+                10 + ARC_GZIP_EXTRA_FIELD.length, gzippedMetaData.length - 10);
+            bytes = assemblyBuffer;
+        }
+        return bytes;
+    }
+
+    /**
+     * Builds the tail of the ARC file meta record header: a line separator
+     * (ending line one), the version line and the field-names line.
+     *
+     * @param version Version string to put at the start of line two.
+     * @return Lines two and three of the header, each terminated by a
+     * line separator and preceded by one.
+     */
+    public String getMetadataHeaderLinesTwoAndThree(String version) {
+        StringBuilder buffer = new StringBuilder();
+        buffer.append(LINE_SEPARATOR);
+        buffer.append(version);
+        buffer.append(" InternetArchive");
+        buffer.append(LINE_SEPARATOR);
+        buffer.append("URL IP-address Archive-date Content-type Archive-length");
+        buffer.append(LINE_SEPARATOR);
+        return buffer.toString();
+    }
+
+    /**
+     * Write all metadata to passed baos.
+     *
+     * <p>String entries are written in DEFAULT_ENCODING, File entries are
+     * copied byte for byte, null entries are skipped and any other type is
+     * logged as unsupported.
+     *
+     * @param baos Byte array to write to.
+     * @throws UnsupportedEncodingException
+     * @throws IOException
+     */
+    private void writeMetaData(ByteArrayOutputStream baos)
+            throws UnsupportedEncodingException, IOException {
+        if (settings.getMetadata() == null) {
+            return;
+        }
+
+        for (Object obj : settings.getMetadata()) {
+            if (obj instanceof String) {
+                baos.write(((String)obj).getBytes(DEFAULT_ENCODING));
+            } else if (obj instanceof File) {
+                InputStream is = null;
+                try {
+                    is = new BufferedInputStream(
+                        new FileInputStream((File)obj));
+                    byte [] buffer = new byte[4096];
+                    for (int read = -1; (read = is.read(buffer)) != -1;) {
+                        baos.write(buffer, 0, read);
+                    }
+                } finally {
+                    if (is != null) {
+                        is.close();
+                    }
+                }
+            } else if (obj != null) {
+                logger.severe("Unsupported metadata type: " + obj);
+            }
+        }
+    }
+
+    /**
+     * Sums the byte lengths of the String and File metadata entries.
+     *
+     * @return 0 when there is no metadata; otherwise the summed length of
+     * the entries minus one (see the review note in the body).
+     * @throws UnsupportedEncodingException
+     */
+    private int getMetadataLength()
+    throws UnsupportedEncodingException {
+        if (settings.getMetadata() == null) {
+            return 0;
+        }
+        // NOTE(review): the sum deliberately keeps the original starting
+        // value of -1, so a non-null metadata list yields (total - 1).
+        // This value goes into the ARC header length, so it is left
+        // unchanged here -- confirm whether the offset is intentional.
+        int result = -1;
+        for (Object obj : settings.getMetadata()) {
+            if (obj instanceof String) {
+                result += ((String)obj).getBytes(DEFAULT_ENCODING).length;
+            } else if (obj instanceof File) {
+                // Compound assignment narrows the long file length to int.
+                result += ((File)obj).length();
+            } else {
+                logger.severe("Unsupported metadata type: " + obj);
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Write a record whose content is held in a byte array stream.  The
+     * declared length is not enforced.
+     *
+     * @deprecated use input-stream version directly instead
+     */
+    @Deprecated
+    public void write(String uri, String contentType, String hostIP,
+            long fetchBeginTimeStamp, long recordLength,
+            ByteArrayOutputStream baos)
+    throws IOException {
+        write(uri, contentType, hostIP, fetchBeginTimeStamp, recordLength,
+                new ByteArrayInputStream(baos.toByteArray()), false);
+    }
+
+    /**
+     * Write a record with the given metadata/content, enforcing the
+     * declared length.
+     *
+     * @param uri URI for metadata-line
+     * @param contentType MIME content-type for metadata-line
+     * @param hostIP IP for metadata-line
+     * @param fetchBeginTimeStamp timestamp for metadata-line
+     * @param recordLength length for metadata-line
+     * @param in source InputStream for record content
+     * @throws IOException
+     */
+    public void write(String uri, String contentType, String hostIP,
+            long fetchBeginTimeStamp, long recordLength, InputStream in)
+    throws IOException {
+        write(uri,contentType,hostIP,fetchBeginTimeStamp,recordLength,in,true);
+    }
+
+    /**
+     * Write a record with the given metadata/content.
+     *
+     * @param uri
+     *            URI for metadata-line
+     * @param contentType
+     *            MIME content-type for metadata-line
+     * @param hostIP
+     *            IP for metadata-line
+     * @param fetchBeginTimeStamp
+     *            timestamp for metadata-line
+     * @param recordLength
+     *            length for metadata-line; also may be enforced
+     * @param in
+     *            source InputStream for record content
+     * @param enforceLength
+     *            whether to enforce the declared length; should be true
+     *            unless intentionally writing bad records for testing
+     * @throws IOException
+     */
+    public void write(String uri, String contentType, String hostIP,
+            long fetchBeginTimeStamp, long recordLength, InputStream in,
+            boolean enforceLength) throws IOException {
+        preWriteRecordTasks();
+        try {
+            write(getMetaLine(uri, contentType, hostIP, fetchBeginTimeStamp,
+                    recordLength).getBytes(UTF8));
+            copyFrom(in, recordLength, enforceLength);
+            if (in instanceof ReplayInputStream) {
+                // check for consumption of entire recorded material
+                long remaining = ((ReplayInputStream) in).remaining();
+                // Should be zero at this stage. If not, something is
+                // wrong.
+                if (remaining != 0) {
+                    String message = "Gap between expected and actual: "
+                            + remaining + LINE_SEPARATOR + DevUtils.extraInfo()
+                            + " writing arc "
+                            + this.getFile().getAbsolutePath();
+                    DevUtils.warnHandle(new Throwable(message), message);
+                    throw new IOException(message);
+                }
+            }
+            write(LINE_SEPARATOR);
+        } finally {
+            // Always run the post-write tasks, even when the write failed.
+            postWriteRecordTasks();
+        }
+    }
+
+    /**
+     * @param uri
+     * @param contentType
+     * @param hostIP
+     * @param fetchBeginTimeStamp
+     * @param recordLength
+     * @return Metadata line for an ARCRecord made of passed components.
+     * @exception IOException If the timestamp is not positive or the
+     * resulting line fails validation.
+     */
+    protected String getMetaLine(String uri, String contentType, String hostIP,
+        long fetchBeginTimeStamp, long recordLength)
+    throws IOException {
+        if (fetchBeginTimeStamp <= 0) {
+            throw new IOException("Bogus fetchBeginTimestamp: " +
+                Long.toString(fetchBeginTimeStamp));
+        }
+
+        return validateMetaLine(createMetaline(uri, hostIP,
+            ArchiveUtils.get14DigitDate(fetchBeginTimeStamp),
+            MimetypeUtils.truncate(contentType),
+            Long.toString(recordLength)));
+    }
+
+    /**
+     * Joins the passed components with HEADER_FIELD_SEPARATOR and appends a
+     * LINE_SEPARATOR.  No validation is done here.
+     *
+     * @param uri
+     * @param hostIP
+     * @param timeStamp
+     * @param mimetype
+     * @param recordLength
+     * @return The assembled metadata line.
+     */
+    public String createMetaline(String uri, String hostIP,
+            String timeStamp, String mimetype, String recordLength) {
+        return uri + HEADER_FIELD_SEPARATOR + hostIP +
+            HEADER_FIELD_SEPARATOR + timeStamp +
+            HEADER_FIELD_SEPARATOR + mimetype +
+            HEADER_FIELD_SEPARATOR + recordLength + LINE_SEPARATOR;
+    }
+
+    /**
+     * Test that the metadata line is valid before writing.
+     * @param metaLineStr
+     * @throws IOException If the line is longer than
+     * MAX_METADATA_LINE_LENGTH or does not match the expected pattern.
+     * @return The passed in metaline.
+     */
+    protected String validateMetaLine(String metaLineStr)
+    throws IOException {
+        if (metaLineStr.length() > MAX_METADATA_LINE_LENGTH) {
+            throw new IOException("Metadata line too long ("
+                + metaLineStr.length() + ">" + MAX_METADATA_LINE_LENGTH
+                + "): " + metaLineStr);
+        }
+        Matcher m = METADATA_LINE_PATTERN.matcher(metaLineStr);
+        if (!m.matches()) {
+            throw new IOException("Metadata line doesn't match expected" +
+                " pattern: " + metaLineStr);
+        }
+        return metaLineStr;
+    }
+}
diff --git a/src/main/java/org/archive/io/arc/ARCWriterPool.java b/src/main/java/org/archive/io/arc/ARCWriterPool.java
new file mode 100644
index 00000000..b55b3ed4
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCWriterPool.java
@@ -0,0 +1,69 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.io.arc;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.archive.io.WriterPool;
+import org.archive.io.WriterPoolMember;
+import org.archive.io.WriterPoolSettings;
+
+
+/**
+ * A pool of ARCWriters.
+ *
+ * <p>All pooling behaviour comes from {@link WriterPool}; this class only
+ * supplies the factory method that creates {@link ARCWriter} members.
+ *
+ * @author stack
+ */
+public class ARCWriterPool extends WriterPool {
+    /**
+     * Constructor that starts filename serial numbering from a fresh
+     * {@link AtomicInteger}.
+     *
+     * @param settings Settings for this pool.
+     * @param poolMaximumActive Passed through unchanged to the serial-taking
+     * constructor.
+     * @param poolMaximumWait Passed through unchanged to the serial-taking
+     * constructor.
+     */
+    public ARCWriterPool(final WriterPoolSettings settings,
+            final int poolMaximumActive, final int poolMaximumWait) {
+        this(new AtomicInteger(), settings, poolMaximumActive, poolMaximumWait);
+    }
+
+    /**
+     * Constructor.
+     *
+     * @param serial  Used to generate unique filename sequences
+     * @param settings Settings for this pool.
+     * @param poolMaximumActive Passed through unchanged to {@link WriterPool}.
+     * @param poolMaximumWait Passed through unchanged to {@link WriterPool}.
+     */
+    public ARCWriterPool(final AtomicInteger serial,
+            final WriterPoolSettings settings,
+            final int poolMaximumActive, final int poolMaximumWait) {
+        super(serial, settings, poolMaximumActive, poolMaximumWait);
+    }
+
+    /**
+     * Creates a new pool member.
+     *
+     * @return A new {@link ARCWriter} built from this pool's serial number
+     * source and settings.
+     * @see org.archive.io.WriterPool#makeWriter()
+     */
+    @Override
+    protected WriterPoolMember makeWriter() {
+        return new ARCWriter(serialNo, settings);
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/arc/WriterPoolSettingsData.java b/src/main/java/org/archive/io/arc/WriterPoolSettingsData.java
new file mode 100644
index 00000000..7396f2d8
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/WriterPoolSettingsData.java
@@ -0,0 +1,80 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.io.arc;
+
+import java.io.File;
+import java.util.List;
+
+import org.archive.io.WriterPoolSettings;
+
+/**
+ * Plain value holder implementing {@link WriterPoolSettings}.
+ *
+ * <p>Six values are supplied through the constructor; frequent flushing
+ * defaults to {@code true} and the write buffer size to 16 KiB. Fields are
+ * protected and non-final, so subclasses may change them.
+ */
+public class WriterPoolSettingsData implements WriterPoolSettings {
+    protected long maxFileSizeBytes;
+    protected String prefix;
+    protected String template;
+    protected List<File> outputDirs;
+    protected boolean compress;
+    protected List<String> metadata;
+    protected boolean frequentFlushes = true;
+    protected int writeBufferSize = 16*1024;
+
+    /**
+     * Constructor.
+     *
+     * @param prefix Value returned by {@link #getPrefix()}.
+     * @param template Value returned by {@link #getTemplate()}.
+     * @param maxFileSizeBytes Value returned by
+     * {@link #getMaxFileSizeBytes()}.
+     * @param compress Value returned by {@link #getCompress()}.
+     * @param outputDirs Value returned by {@link #calcOutputDirs()}; the
+     * list is stored as passed, not copied.
+     * @param metadata Value returned by {@link #getMetadata()}; the list is
+     * stored as passed, not copied.
+     */
+    public WriterPoolSettingsData(String prefix, String template,
+            long maxFileSizeBytes, boolean compress, List<File> outputDirs,
+            List<String> metadata) {
+        super();
+        this.maxFileSizeBytes = maxFileSizeBytes;
+        this.prefix = prefix;
+        this.template = template;
+        this.outputDirs = outputDirs;
+        this.compress = compress;
+        this.metadata = metadata;
+    }
+
+    @Override
+    public boolean getCompress() {
+        return compress;
+    }
+
+    @Override
+    public long getMaxFileSizeBytes() {
+        return maxFileSizeBytes;
+    }
+
+    @Override
+    public List<String> getMetadata() {
+        return metadata;
+    }
+
+    /** @return The output directory list given at construction time. */
+    @Override
+    public List<File> calcOutputDirs() {
+        return outputDirs;
+    }
+
+    @Override
+    public String getPrefix() {
+        return prefix;
+    }
+
+    @Override
+    public String getTemplate() {
+        return template;
+    }
+
+    @Override
+    public boolean getFrequentFlushes() {
+        return frequentFlushes;
+    }
+
+    @Override
+    public int getWriteBufferSize() {
+        return writeBufferSize;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/package.html b/src/main/java/org/archive/io/package.html
new file mode 100644
index 00000000..d1798b80
--- /dev/null
+++ b/src/main/java/org/archive/io/package.html
@@ -0,0 +1,9 @@
+
+<html>
+<head>
+<title>org.archive.io.arc package</title>
+</head>
+<body>
+ARC file reading and writing.
+</body>
+</html>
diff --git a/src/main/java/org/archive/io/warc/WARCConstants.java b/src/main/java/org/archive/io/warc/WARCConstants.java
new file mode 100644
index 00000000..83cc8a6d
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/WARCConstants.java
@@ -0,0 +1,24 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.archive.io.warc;
+
+/**
+ * Compatibility alias that adds no members of its own; everything is
+ * inherited from {@link org.archive.format.warc.WARCConstants}.
+ *
+ * @deprecated Use {@link org.archive.format.warc.WARCConstants} directly.
+ */
+@Deprecated
+public interface WARCConstants extends org.archive.format.warc.WARCConstants {
+}
diff --git a/src/main/java/org/archive/io/warc/WARCReader.java b/src/main/java/org/archive/io/warc/WARCReader.java
new file mode 100644
index 00000000..a34854ef
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/WARCReader.java
@@ -0,0 +1,287 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.archive.io.warc;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.cli.PosixParser;
+import org.apache.commons.lang.NotImplementedException;
+import org.archive.io.ArchiveReader;
+import org.archive.io.ArchiveRecord;
+
+/**
+ * WARCReader.
+ * Go via {@link WARCReaderFactory} to get instance.
+ * @author stack
+ * @version $Date: 2006-11-27 18:03:03 -0800 (Mon, 27 Nov 2006) $ $Version$
+ */
+public class WARCReader extends ArchiveReader implements WARCConstants {
+    protected WARCReader() {
+        super();
+    }
+    
+    /**
+     * Runs the superclass initialization and then records
+     * {@code WARC_VERSION} as this reader's version.
+     *
+     * @param i Passed unchanged to the superclass; the factory's readers
+     * pass a file path or a caller-supplied name here.
+     */
+    @Override
+    protected void initialize(String i) {
+        super.initialize(i);
+        setVersion(WARC_VERSION);
+    }
+    
+    /**
+     * Skip over any trailing new lines at end of the record so we're lined up
+     * ready to read the next.
+     * @param record
+     * @throws IOException
+     */
+    protected void gotoEOR(ArchiveRecord record) throws IOException {
+        if (record.available() != 0) {
+            throw new IOException("Record should be exhausted before coming " +
+                "in here");
+        }
+
+        // Records end in 2*CRLF.  Suck it up.
+        readExpectedChar(getIn(), CRLF.charAt(0));
+        readExpectedChar(getIn(), CRLF.charAt(1));
+        readExpectedChar(getIn(), CRLF.charAt(0));
+        readExpectedChar(getIn(), CRLF.charAt(1));
+    }
+    
+    protected void readExpectedChar(final InputStream is, final int expected)
+    throws IOException {
+        int c = is.read();
+        if (c != expected) {
+            throw new IOException("Unexpected character " +
+                Integer.toHexString(c) + "(Expecting " +
+                Integer.toHexString(expected) + ")");
+        }
+    }
+    
+    /**
+     * Create new WARC record.
+     * Encapsulate housekeeping that has to do w/ creating new Record.
+     * @param is InputStream to use.
+     * @param offset Absolute offset into WARC file.
+     * @return A WARCRecord.
+     * @throws IOException
+     */
+    protected WARCRecord createArchiveRecord(InputStream is, long offset)
+    throws IOException {
+        return (WARCRecord)currentRecord(new WARCRecord(is,
+        	getReaderIdentifier(), offset, isDigest(), isStrict()));
+    }
+    
+    /**
+     * Prints every record to standard output: the record header's string
+     * form, then the record's own dump, then a blank line.
+     *
+     * @param compress Not used by this implementation.
+     * @throws IOException If reading a record fails.
+     * @throws java.text.ParseException Declared by the overridden method.
+     */
+    @Override
+    public void dump(boolean compress)
+    throws IOException, java.text.ParseException {
+        // The iterator must be typed: a raw Iterator yields Object from
+        // next(), which cannot be assigned to ArchiveRecord.
+        for (final Iterator<ArchiveRecord> i = iterator(); i.hasNext();) {
+            final ArchiveRecord r = i.next();
+            System.out.println(r.getHeader().toString());
+            r.dump();
+            System.out.println();
+        }
+    }
+    
+
+    /**
+     * Not supported for WARC readers.
+     *
+     * @param f Ignored.
+     * @return Never returns normally.
+     * @throws NotImplementedException Always.
+     */
+    @Override
+    public ArchiveReader getDeleteFileOnCloseReader(final File f) {
+        throw new NotImplementedException("TODO");
+    }  
+
+	/**
+	 * @return The {@code DOT_WARC_FILE_EXTENSION} constant.
+	 */
+	@Override
+	public String getDotFileExtension() {
+		return DOT_WARC_FILE_EXTENSION;
+	}
+
+	/**
+	 * @return The {@code WARC_FILE_EXTENSION} constant.
+	 */
+	@Override
+	public String getFileExtension() {
+		return WARC_FILE_EXTENSION;
+	} 
+    
+    // Static methods follow.  Mostly for command-line processing.
+
+    /**
+     * Prints the command-line help and terminates the JVM.
+     *
+     * @param formatter Help formatter instance.
+     * @param options Usage options.
+     * @param exitCode Exit code passed to {@link System#exit(int)}.
+     */
+    private static void usage(HelpFormatter formatter, Options options,
+            int exitCode) {
+        // This class lives in org.archive.io.warc; the help text used to
+        // name the org.archive.io.arc package.
+        formatter.printHelp("java org.archive.io.warc.WARCReader" +
+            " [--digest=true|false] \\\n" +
+            " [--format=cdx|cdxfile|dump|gzipdump]" +
+            " [--offset=#] \\\n[--strict] [--parse] WARC_FILE|WARC_URL",
+                options);
+        System.exit(exitCode);
+    }
+
+    /**
+     * Write out the arcfile.
+     * 
+     * @param reader
+     * @param format Format to use outputting.
+     * @throws IOException
+     * @throws java.text.ParseException
+     */
+    protected static void output(WARCReader reader, String format)
+    throws IOException, java.text.ParseException {
+    	if (!reader.output(format)) {
+            throw new IOException("Unsupported format: " + format);
+    	}
+    }
+
+    /**
+     * Generate a CDX index file for a WARC file.
+     * The reader is obtained from {@link WARCReaderFactory}, switched to
+     * non-strict parsing with digesting enabled, and output in the
+     * {@code CDX_FILE} format.
+     *
+     * @param urlOrPath The WARC file to generate a CDX index for
+     * @throws IOException
+     * @throws java.text.ParseException
+     */
+    public static void createCDXIndexFile(String urlOrPath)
+    throws IOException, java.text.ParseException {
+        final WARCReader reader = WARCReaderFactory.get(urlOrPath);
+        reader.setStrict(false);
+        reader.setDigest(true);
+        output(reader, CDX_FILE);
+    }
+
+    /**
+     * Command-line interface to WARCReader.
+     *
+     * <p>Here is the command-line interface:
+     * <pre>
+     * usage: java org.archive.io.warc.WARCReader [--offset=#] WARCFILE
+     *  -h,--help      Prints this message and exits.
+     *  -o,--offset    Outputs record at this offset into WARC file.
+     * </pre>
+     *
+     * <p>Outputs using a pseudo-CDX format by default (see the CDX legend
+     * and example documentation).
+     * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
+     * Hash is hard-coded straight SHA-1 hash of content.
+     *
+     * <p>Option ids handled here are 'h' (help), 'o' (offset), 's' (strict),
+     * 'd' (digest) and 'f' (format). With an offset, exactly one file
+     * argument is accepted and only the record at that offset is output;
+     * otherwise every argument is processed in turn. A RuntimeException
+     * while processing an argument is reported on standard error and the
+     * JVM exits with status 1.
+     *
+     * @param args Command-line arguments.
+     * @throws ParseException Failed parse of the command line.
+     * @throws IOException
+     * @throws java.text.ParseException
+     */
+    public static void main(String [] args)
+    throws ParseException, IOException, java.text.ParseException {
+        Options options = getOptions();
+        PosixParser parser = new PosixParser();
+        CommandLine cmdline = parser.parse(options, args, false);
+        @SuppressWarnings("unchecked")
+        List<String> cmdlineArgs = cmdline.getArgList();
+        Option [] cmdlineOptions = cmdline.getOptions();
+        HelpFormatter formatter = new HelpFormatter();
+
+        // If no args, print help (usage() exits the JVM).
+        if (cmdlineArgs.isEmpty()) {
+            usage(formatter, options, 0);
+        }
+
+        // Now look at options passed.
+        long offset = -1;
+        boolean digest = false;
+        boolean strict = false;
+        String format = CDX;
+        for (int i = 0; i < cmdlineOptions.length; i++) {
+            switch(cmdlineOptions[i].getId()) {
+                case 'h':
+                    usage(formatter, options, 0);
+                    break;
+
+                case 'o':
+                    offset =
+                        Long.parseLong(cmdlineOptions[i].getValue());
+                    break;
+
+                case 's':
+                    strict = true;
+                    break;
+
+                case 'd':
+                    digest = getTrueOrFalse(cmdlineOptions[i].getValue());
+                    break;
+
+                case 'f':
+                    // Lower-case with a fixed locale so that e.g. "CDXFILE"
+                    // still matches under a Turkish default locale.
+                    format = cmdlineOptions[i].getValue()
+                        .toLowerCase(java.util.Locale.ROOT);
+                    boolean match = false;
+                    // List of supported formats.
+                    final String [] supportedFormats =
+                        {CDX, DUMP, GZIP_DUMP, CDX_FILE};
+                    for (int ii = 0; ii < supportedFormats.length; ii++) {
+                        if (supportedFormats[ii].equals(format)) {
+                            match = true;
+                            break;
+                        }
+                    }
+                    if (!match) {
+                        usage(formatter, options, 1);
+                    }
+                    break;
+
+                default:
+                    throw new RuntimeException("Unexpected option: " +
+                        cmdlineOptions[i].getId());
+            }
+        }
+
+        if (offset >= 0) {
+            if (cmdlineArgs.size() != 1) {
+                System.out.println("Error: Pass one arcfile only.");
+                usage(formatter, options, 1);
+            }
+            WARCReader r = WARCReaderFactory.get(
+                new File(cmdlineArgs.get(0)), offset);
+            r.setStrict(strict);
+            outputRecord(r, format);
+        } else {
+            for (String urlOrPath : cmdlineArgs) {
+                try {
+                    WARCReader r = WARCReaderFactory.get(urlOrPath);
+                    r.setStrict(strict);
+                    r.setDigest(digest);
+                    output(r, format);
+                } catch (RuntimeException e) {
+                    // Write out name of file we failed on to help with
+                    // debugging, print the stack trace, then exit with
+                    // status 1.  Note: remaining arguments are NOT
+                    // processed after a failure.
+                    System.err.println("Exception processing " + urlOrPath +
+                        ": " + e.getMessage());
+                    e.printStackTrace(System.err);
+                    System.exit(1);
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/warc/WARCReaderFactory.java b/src/main/java/org/archive/io/warc/WARCReaderFactory.java
new file mode 100644
index 00000000..9c6c7e77
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/WARCReaderFactory.java
@@ -0,0 +1,307 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.archive.io.warc;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Iterator;
+
+import org.archive.io.ArchiveReader;
+import org.archive.io.ArchiveReaderFactory;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.warc.WARCConstants;
+import org.archive.util.ArchiveUtils;
+import org.archive.util.FileUtils;
+import org.archive.util.zip.GZIPMembersInputStream;
+
+import com.google.common.io.CountingInputStream;
+
+/**
+ * Factory for WARC Readers.
+ * Figures whether to give out a compressed file Reader or an uncompressed
+ * Reader.
+ * @author stack
+ * @version $Date: 2006-08-23 17:59:04 -0700 (Wed, 23 Aug 2006) $ $Version$
+ */
+public class WARCReaderFactory extends ArchiveReaderFactory
+implements WARCConstants {
+    private static final WARCReaderFactory factory = new WARCReaderFactory();
+
+    /**
+     * Shutdown any access to default constructor.
+     * This factory is Singleton.
+     */
+    private WARCReaderFactory() {
+        super();
+    }
+    
+    /**
+     * Gets a reader for a WARC named by file path or URL, via the singleton
+     * factory.
+     *
+     * @param arcFileOrUrl Path or URL of the WARC.
+     * @return A WARCReader.
+     * @throws MalformedURLException
+     * @throws IOException
+     */
+    public static WARCReader get(String arcFileOrUrl)
+    throws MalformedURLException, IOException {
+        return (WARCReader)factory.getArchiveReader(arcFileOrUrl);
+    }
+    
+    /**
+     * Gets a reader for the passed WARC file, via the singleton factory.
+     *
+     * @param f A WARC file to read.
+     * @return A WARCReader.
+     * @throws IOException
+     */
+    public static WARCReader get(final File f) throws IOException {
+        return (WARCReader)factory.getArchiveReader(f);
+    }
+    
+    /**
+     * Gets a reader for the passed WARC file, positioned at an offset.
+     *
+     * @param f A WARC file to read.
+     * @param offset Have returned Reader set to start reading at this offset.
+     * @return A WARCReader.
+     * @throws IOException
+     */
+    public static WARCReader get(final File f, final long offset)
+    throws IOException {
+        return (WARCReader)factory.getArchiveReader(f, offset);
+    }
+    
+    protected ArchiveReader getArchiveReader(final File f, final long offset)
+    throws IOException {
+		boolean compressed = testCompressedWARCFile(f);
+		if (!compressed) {
+			if (!FileUtils.isReadableWithExtensionAndMagic(f,
+					DOT_WARC_FILE_EXTENSION, WARC_MAGIC)) {
+				throw new IOException(f.getAbsolutePath()
+						+ " is not a WARC file.");
+			}
+		}
+		return (WARCReader)(compressed?
+			WARCReaderFactory.factory.new CompressedWARCReader(f, offset):
+			WARCReaderFactory.factory.new UncompressedWARCReader(f, offset));
+	}
+    
+    /**
+     * Gets a reader over an already-open stream, via the singleton factory.
+     *
+     * @param s Name passed through to the factory together with the stream.
+     * @param is Stream to read from.
+     * @param atFirstRecord Passed through to the factory.
+     * @return An ArchiveReader over {@code is}.
+     * @throws IOException
+     */
+    public static ArchiveReader get(final String s, final InputStream is,
+            final boolean atFirstRecord)
+    throws IOException {
+        return factory.getArchiveReader(s, is, atFirstRecord);
+    }
+    
+    /**
+     * Builds a reader over an already-open stream.
+     * The stream is always wrapped in a CompressedWARCReader; no check is
+     * made that it really is gzipped.
+     *
+     * @param f Name handed to the reader.
+     * @param is Stream to read from.
+     * @param atFirstRecord Passed to the reader's constructor.
+     * @return A CompressedWARCReader over {@code is}.
+     * @throws IOException
+     */
+    protected ArchiveReader getArchiveReader(final String f,
+			final InputStream is, final boolean atFirstRecord)
+			throws IOException {
+		// For now, assume stream is compressed. Later add test of input
+		// stream or handle exception thrown when figure not compressed stream.
+		return new CompressedWARCReader(f, is, atFirstRecord);
+	}
+    
+    /**
+     * Gets a reader for the WARC at a URL, positioned at an offset, via the
+     * singleton factory.
+     *
+     * @param arcUrl An URL that points at a WARC.
+     * @param offset Offset passed through to the factory.
+     * @return A WARCReader.
+     * @throws IOException
+     */
+    public static WARCReader get(final URL arcUrl, final long offset)
+    throws IOException {
+        return (WARCReader)factory.getArchiveReader(arcUrl, offset);
+    }
+    
+    /**
+     * Get a WARCReader for the WARC at a URL.
+     * As originally documented, the WARC is pulled local into wherever the
+     * System Property <code>java.io.tmpdir</code> points, the returned
+     * reader points at that local copy, and closing the reader removes the
+     * copy. That work happens in the inherited factory code, not in this
+     * method.
+     *
+     * @param arcUrl An URL that points at a WARC.
+     * @return A WARCReader.
+     * @throws IOException
+     */
+    public static WARCReader get(final URL arcUrl)
+    throws IOException {
+        return (WARCReader)factory.getArchiveReader(arcUrl);
+    }
+    
+    /**
+     * Check file is compressed WARC.
+     *
+     * @param f File to test.
+     *
+     * @return True if this is compressed WARC (TODO: Just tests if file is
+     * GZIP'd file (It begins w/ GZIP MAGIC)).
+     *
+     * @exception IOException If file does not exist or is not readable.
+     */
+    public static boolean testCompressedWARCFile(final File f)
+    throws IOException {
+        FileUtils.assertReadable(f);
+        final InputStream probe = new FileInputStream(f);
+        try {
+            return ArchiveUtils.isGzipped(probe);
+        } finally {
+            // Close on every path, as before.
+            probe.close();
+        }
+    }
+
+    /**
+     * Uncompressed WARC file reader.
+     * @author stack
+     */
+    public class UncompressedWARCReader extends WARCReader {
+        /**
+         * Constructor.
+         * @param f Uncompressed arcfile to read.
+         * @throws IOException
+         */
+        public UncompressedWARCReader(final File f)
+        throws IOException {
+            this(f, 0);
+        }
+
+        /**
+         * Constructor.
+         * 
+         * @param f Uncompressed file to read.
+         * @param offset Offset at which to position Reader.
+         * @throws IOException
+         */
+        public UncompressedWARCReader(final File f, final long offset)
+        throws IOException {
+            // File has been tested for existence by time it has come to here.
+            setIn(new CountingInputStream(getInputStream(f, offset)));
+            getIn().skip(offset);
+            initialize(f.getAbsolutePath());
+        }
+        
+        /**
+         * Constructor.
+         * 
+         * @param f Uncompressed file to read.
+         * @param is InputStream.
+         */
+        public UncompressedWARCReader(final String f, final InputStream is) {
+            // Arc file has been tested for existence by time it has come
+            // to here.
+            setIn(new CountingInputStream(is));
+            initialize(f);
+        }
+    }
+    
+    /**
+     * Compressed WARC file reader.
+     * 
+     * @author stack
+     */
+    public class CompressedWARCReader extends WARCReader {
+        /**
+         * Constructor. Delegates to the (File, long) constructor with an
+         * offset of 0.
+         * 
+         * @param f Compressed file to read.
+         * @throws IOException
+         */
+        public CompressedWARCReader(final File f) throws IOException {
+            this(f, 0);
+        }
+
+        /**
+         * Constructor.
+         * 
+         * @param f Compressed arcfile to read.
+         * @param offset Position at where to start reading file.
+         * @throws IOException
+         */
+        public CompressedWARCReader(final File f, final long offset)
+                throws IOException {
+            // File has been tested for existence by time it has come to here.
+            setIn(new GZIPMembersInputStream(getInputStream(f, offset)));
+            ((GZIPMembersInputStream)getIn()).compressedSeek(offset); 
+            setCompressed((offset == 0)); // TODO: does this make sense?!?!
+            initialize(f.getAbsolutePath());
+        }
+        
+        /**
+         * Constructor.
+         * Wraps the passed stream in a GZIPMembersInputStream, flags the
+         * reader as compressed and initializes it with the passed name.
+         * 
+         * @param f Name to initialize this reader with.
+         * @param is InputStream to use.
+         * @param atFirstRecord Ignored by this constructor.
+         * @throws IOException
+         */
+        public CompressedWARCReader(final String f, final InputStream is,
+            final boolean atFirstRecord)
+        throws IOException {
+            // WARC file has been tested for existence by time it has come
+            // to here.
+            setIn(new GZIPMembersInputStream(is));
+            setCompressed(true);
+            initialize(f);
+            // TODO: Ignore atFirstRecord. Probably doesn't apply in WARC world.
+        }
+        
+        /**
+         * Get record at passed offset.
+         * Cleans up the current record, seeks the gzip stream to the
+         * offset, and creates a record there.
+         *
+         * @param offset Byte index into file at which a record starts.
+         * @return A WARCRecord reference.
+         * @throws IOException
+         */
+        public WARCRecord get(long offset) throws IOException {
+            cleanupCurrentRecord();
+            final GZIPMembersInputStream gzipIn =
+                (GZIPMembersInputStream)getIn();
+            gzipIn.compressedSeek(offset);
+            return createArchiveRecord(getIn(), offset);
+        }
+        
+        public Iterator iterator() {
+            // Override ArchiveRecordIterator so the returned iterator is
+            // driven by the gzip member iterator of the underlying stream.
+            return new ArchiveRecordIterator() {
+                private GZIPMembersInputStream gis =
+                    (GZIPMembersInputStream)getIn();
+
+                private Iterator gzipIterator = this.gis.memberIterator();
+
+                protected boolean innerHasNext() {
+                    return this.gzipIterator.hasNext();
+                }
+
+                protected ArchiveRecord innerNext() throws IOException {
+                    // Advance to the next gzip member first; the member
+                    // start and end positions are read only afterwards, and
+                    // the larger of the two is used as the record offset.
+                    final InputStream member =
+                        (InputStream) this.gzipIterator.next();
+                    final long memberStart = gis.getCurrentMemberStart();
+                    final long memberEnd = gis.getCurrentMemberEnd();
+                    return createArchiveRecord(member,
+                        Math.max(memberStart, memberEnd));
+                }
+            };
+        }
+        
+        /**
+         * Drains the current stream to its end, counting the bytes read,
+         * and warns on standard error when more than four were left over.
+         *
+         * @param rec Record just finished; used only in the warning text.
+         * @throws IOException If reading fails.
+         */
+        protected void gotoEOR(ArchiveRecord rec) throws IOException {
+            long trailing = 0;
+            for (int b = getIn().read(); b > -1; b = getIn().read()) {
+                trailing++;
+            }
+            if (trailing > 4) {
+                System.err.println("unexpected extra data after record "+rec);
+            }
+        }
+    }
+    
+    /**
+     * Tests whether a name ends with a WARC file extension, ignoring case.
+     *
+     * @param f Name to test; may be null.
+     * @return True if {@code f} is non-null and its lower-cased form ends
+     * with DOT_COMPRESSED_WARC_FILE_EXTENSION or DOT_WARC_FILE_EXTENSION.
+     */
+    public static boolean isWARCSuffix(final String f) {
+        if (f == null) {
+            return false;
+        }
+        final String lowered = f.toLowerCase();
+        return lowered.endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION)
+            || lowered.endsWith(DOT_WARC_FILE_EXTENSION);
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/warc/WARCRecord.java b/src/main/java/org/archive/io/warc/WARCRecord.java
new file mode 100644
index 00000000..635d1c3b
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/WARCRecord.java
@@ -0,0 +1,233 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.archive.io.warc;
+
+import it.unimi.dsi.fastutil.io.RepositionableStream;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpParser;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.ArchiveRecordHeader;
+import org.archive.util.LaxHttpParser;
+
+
+/**
+ * A WARC file Record.
+ *
+ * @author stack
+ */
+public class WARCRecord extends ArchiveRecord implements WARCConstants {
+    private Pattern WHITESPACE = Pattern.compile("\\s");
+    
+    /**
+     * Constructor.
+     * Delegates to the five-argument constructor with digest set to true
+     * and strict set to false.
+     *
+     * @param in Stream cue'd up to be at the start of the record this instance
+     * is to represent.
+     * @param identifier Identifier for the hosting Reader.
+     * @param offset Offset passed on to header parsing.
+     * @throws IOException
+     */
+    public WARCRecord(InputStream in, final String identifier,
+    	final long offset)
+    throws IOException {
+        this(in, identifier, offset, true, false);
+    }
+    
+    /**
+     * Constructor for a record whose headers have already been read.
+     * Passes position 0, digest true and strict false to the superclass.
+     *
+     * @param in Stream cue'd up just past Header Line and Named Fields.
+     * @param headers Header Line and ANVL Named fields.
+     * @throws IOException
+     */
+    public WARCRecord(InputStream in, ArchiveRecordHeader headers)
+    		throws IOException {
+        super(in, headers, 0, true, false);
+    }
+
+    /**
+     * Constructor.
+     * Constructs the superclass with no header, then parses the headers
+     * from {@code in} and installs them with setHeader.
+     *
+     * @param in Stream cue'd up to be at the start of the record this instance
+     * is to represent.
+     * @param identifier Identifier for this the hosting Reader.
+     * @param offset Current offset into <code>in</code> (Used to keep
+     * <code>position</code> properly aligned).  Usually 0.
+     * @param digest True if we're to calculate digest for this record.  Not
+     * digesting saves about ~15% of cpu during parse.
+     * @param strict Be strict parsing (Parsing stops if file improperly
+     * formatted).
+     * @throws IOException
+     */
+    public WARCRecord(final InputStream in, final String identifier,
+    	final long offset, boolean digest, boolean strict) 
+    throws IOException {
+        super(in, null, 0, digest, strict);
+        setHeader(parseHeaders(in, identifier, offset, strict));
+    }
+    
+    /**
+     * Parse WARC Header Line and Named Fields.
+     * @param in Stream to read.
+     * @param identifier Identifier for the hosting Reader.
+     * @param offset Absolute offset into Reader.
+     * @param strict Whether to be loose parsing or not.
+     * @return An ArchiveRecordHeader.
+     * @throws IOException 
+     */
+    protected ArchiveRecordHeader parseHeaders(final InputStream in,
+        final String identifier, final long offset, final boolean strict)
+    throws IOException {
+    	final Map m = new HashMap();
+    	m.put(ABSOLUTE_OFFSET_KEY, new Long(offset));
+    	m.put(READER_IDENTIFIER_FIELD_KEY, identifier);
+        
+        long startPosition = -1;
+        if (in instanceof RepositionableStream) {
+            startPosition = ((RepositionableStream)in).position();
+        }
+        String firstLine =
+            new String(LaxHttpParser.readLine(in, WARC_HEADER_ENCODING));
+        if (firstLine == null || firstLine.length() <=0) {
+            throw new IOException("Failed to read WARC_MAGIC");
+        }
+        if (!firstLine.startsWith(WARC_MAGIC)) {
+            throw new IOException("Failed to find WARC MAGIC: " + firstLine);
+        }
+        // Here we start reading off the inputstream but we're reading the
+        // stream direct rather than going via WARCRecord#read.  The latter will
+        // keep count of bytes read, digest and fail properly if EOR too soon...
+        // We don't want digesting while reading Headers.
+        // 
+        Header [] h = LaxHttpParser.parseHeaders(in, WARC_HEADER_ENCODING);
+        for (int i = 0; i < h.length; i++) {
+            m.put(h[i].getName(), h[i].getValue());
+        }
+        int headerLength = -1;
+        if (in instanceof RepositionableStream) {
+            headerLength =
+                (int)(((RepositionableStream)in).position() - startPosition);
+        }
+        final int contentOffset = headerLength;
+        incrementPosition(contentOffset);
+   
+    	return new ArchiveRecordHeader() {
+    		private Map headers = m;
+            private int contentBegin = contentOffset;
+
+			public String getDate() {
+				return (String)this.headers.get(HEADER_KEY_DATE);
+			}
+
+			public String getDigest() {
+                return null;
+                // TODO: perhaps return block-digest? 
+                // superclass def implies this is calculated ("only after
+                // read in totality"), not pulled from header, so
+                // below prior implementation was misleading
+//              return (String)this.headers.get(HEADER_KEY_CHECKSUM);
+			}
+
+			public String getReaderIdentifier() {
+				return (String)this.headers.get(READER_IDENTIFIER_FIELD_KEY);
+			}
+
+			public Set getHeaderFieldKeys() {
+				return this.headers.keySet();
+			}
+
+			public Map getHeaderFields() {
+				return this.headers;
+			}
+
+			public Object getHeaderValue(String key) {
+				return this.headers.get(key);
+			}
+
+			// Returns just the Content-Length of the warc record
+			public long getContentLength() {
+				Object o = this.headers.get(CONTENT_LENGTH);
+				if (o == null) {
+					return -1;
+				}
+				long contentLength = (o instanceof Long)?
+                    ((Long)o).longValue(): Long.parseLong((String)o);
+                return contentLength;
+			}
+			
+			// Returns the full record length
+			public long getLength()
+			{
+				return getContentLength() + contentOffset;
+			}
+
+			public String getMimetype() {
+				return (String)this.headers.get(CONTENT_TYPE);
+			}
+
+			public long getOffset() {
+				Object o = this.headers.get(ABSOLUTE_OFFSET_KEY);
+				if (o == null) {
+					return -1;
+				}
+				return (o instanceof Long)?
+                    ((Long)o).longValue(): Long.parseLong((String)o);
+			}
+
+			public String getRecordIdentifier() {
+				return (String)this.headers.get(RECORD_IDENTIFIER_FIELD_KEY);
+			}
+
+			public String getUrl() {
+				return (String)this.headers.get(HEADER_KEY_URI);
+			}
+
+			public String getVersion() {
+				return (String)this.headers.get(VERSION_FIELD_KEY);
+			}
+            
+            public int getContentBegin() {
+                return this.contentBegin;
+            }
+            
+            @Override
+            public String toString() {
+                return this.headers.toString();
+            }
+    	};
+    }
+    
+    /**
+     * Returns the superclass's CDX mimetype for the given header with every
+     * match of the WHITESPACE pattern removed.
+     *
+     * Mimetypes can have spaces in WARCs.  Emitting for CDX, just squash
+     * them for now.  Later, quote them since squashing spaces won't work
+     * for params that have quoted-string values.
+     *
+     * @param h Header of the record whose mimetype is wanted.
+     * @return The squashed mimetype.
+     */
+    @Override
+    protected String getMimetype4Cdx(ArchiveRecordHeader h) {
+        return WHITESPACE.matcher(super.getMimetype4Cdx(h)).replaceAll("");
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/warc/WARCRecordInfo.java b/src/main/java/org/archive/io/warc/WARCRecordInfo.java
new file mode 100644
index 00000000..a6198c44
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/WARCRecordInfo.java
@@ -0,0 +1,139 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.io.warc;
+
+import java.io.InputStream;
+import java.net.URI;
+
+import org.archive.format.warc.WARCConstants.WARCRecordType;
+import org.archive.util.anvl.ANVLRecord;
+
+/**
+ * Mutable holder for the values that describe a single WARC record: type,
+ * URL, creation date, mimetype, record id, extra headers and content, plus
+ * the WARC filename and file offset associated with the record.
+ */
+public class WARCRecordInfo {
+
+    protected WARCRecordType type;
+    protected String url;
+    protected String create14DigitDate;
+    protected String mimetype;
+    protected URI recordId;
+    protected ANVLRecord extraHeaders;
+    protected InputStream contentStream;
+    protected long contentLength;
+    protected boolean enforceLength;
+    protected String warcFilename;
+    protected Long warcFileOffset;
+
+    /** @return the record type, or null if none has been set. */
+    public WARCRecordType getType() {
+        return this.type;
+    }
+
+    /** @param type the record type to store. */
+    public void setType(WARCRecordType type) {
+        this.type = type;
+    }
+
+    /** @return the record URL, or null if none has been set. */
+    public String getUrl() {
+        return this.url;
+    }
+
+    /** @param url the record URL to store. */
+    public void setUrl(String url) {
+        this.url = url;
+    }
+
+    /** @return the stored creation date string. */
+    public String getCreate14DigitDate() {
+        return this.create14DigitDate;
+    }
+
+    /** @param create14DigitDate the creation date string to store. */
+    public void setCreate14DigitDate(String create14DigitDate) {
+        this.create14DigitDate = create14DigitDate;
+    }
+
+    /** @return the stored mimetype. */
+    public String getMimetype() {
+        return this.mimetype;
+    }
+
+    /** @param mimetype the mimetype to store. */
+    public void setMimetype(String mimetype) {
+        this.mimetype = mimetype;
+    }
+
+    /** @return the stored record id. */
+    public URI getRecordId() {
+        return this.recordId;
+    }
+
+    /** @param recordId the record id to store. */
+    public void setRecordId(URI recordId) {
+        this.recordId = recordId;
+    }
+
+    /** @return the extra headers, or null if none have been set or added. */
+    public ANVLRecord getExtraHeaders() {
+        return this.extraHeaders;
+    }
+
+    /** @param extraHeaders the extra headers to store, replacing any held. */
+    public void setExtraHeaders(ANVLRecord extraHeaders) {
+        this.extraHeaders = extraHeaders;
+    }
+
+    /**
+     * Adds one label/value pair to the extra headers, creating the
+     * underlying ANVLRecord first if none is held yet.
+     *
+     * @param label header label.
+     * @param value header value.
+     */
+    public void addExtraHeader(String label, String value) {
+        if (this.extraHeaders == null) {
+            this.extraHeaders = new ANVLRecord();
+        }
+        this.extraHeaders.addLabelValue(label, value);
+    }
+
+    /** @return the stored content stream. */
+    public InputStream getContentStream() {
+        return this.contentStream;
+    }
+
+    /** @param contentStream the content stream to store. */
+    public void setContentStream(InputStream contentStream) {
+        this.contentStream = contentStream;
+    }
+
+    /** @return the stored content length. */
+    public long getContentLength() {
+        return this.contentLength;
+    }
+
+    /** @param contentLength the content length to store. */
+    public void setContentLength(long contentLength) {
+        this.contentLength = contentLength;
+    }
+
+    /** @return the enforce-length flag. */
+    public boolean isEnforceLength() {
+        return this.enforceLength;
+    }
+
+    /** @return the enforce-length flag; same value as isEnforceLength(). */
+    public boolean getEnforceLength() {
+        return this.enforceLength;
+    }
+
+    /** @param enforceLength the enforce-length flag to store. */
+    public void setEnforceLength(boolean enforceLength) {
+        this.enforceLength = enforceLength;
+    }
+
+    /** @return the stored WARC filename. */
+    public String getWARCFilename() {
+        return this.warcFilename;
+    }
+
+    /**
+     * @param warcFilenameWithoutOccupiedSuffix the WARC filename to store.
+     */
+    public void setWARCFilename(String warcFilenameWithoutOccupiedSuffix) {
+        this.warcFilename = warcFilenameWithoutOccupiedSuffix;
+    }
+
+    /** @return the stored WARC file offset, or null if none has been set. */
+    public Long getWARCFileOffset() {
+        return this.warcFileOffset;
+    }
+
+    /** @param startPosition the WARC file offset to store. */
+    public void setWARCFileOffset(Long startPosition) {
+        this.warcFileOffset = startPosition;
+    }
+}
diff --git a/src/main/java/org/archive/io/warc/WARCWriter.java b/src/main/java/org/archive/io/warc/WARCWriter.java
new file mode 100644
index 00000000..b9558263
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/WARCWriter.java
@@ -0,0 +1,436 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.archive.io.warc;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.net.URI;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.commons.lang.StringUtils;
+import org.archive.io.ArchiveFileConstants;
+import org.archive.io.UTF8Bytes;
+import org.archive.io.WriterPoolMember;
+import org.archive.util.ArchiveUtils;
+import org.archive.util.anvl.Element;
+
+
+/**
+ * WARC implementation.
+ *
+ * Assumption is that the caller is managing access to this
+ * WARCWriter ensuring only one thread accessing this WARC instance
+ * at any one time.
+ *
+ * While being written, WARCs have a '.open' suffix appended.
+ *
+ * @contributor stack
+ * @version $Revision: 4604 $ $Date: 2006-09-05 22:38:18 -0700 (Tue, 05 Sep 2006) $
+ */
+public class WARCWriter extends WriterPoolMember
+implements WARCConstants {
+    public static final String TOTALS = "totals";
+    public static final String SIZE_ON_DISK = "sizeOnDisk";
+    public static final String TOTAL_BYTES = "totalBytes";
+    public static final String CONTENT_BYTES = "contentBytes";
+    public static final String NUM_RECORDS = "numRecords";
+
+    private static final Logger logger =
+        Logger.getLogger(WARCWriter.class.getName());
+
+    /**
+     * NEWLINE as bytes.
+     */
+    public static byte [] CRLF_BYTES;
+    static {
+        try {
+            CRLF_BYTES = CRLF.getBytes(DEFAULT_ENCODING);
+        } catch(Exception e) {
+            // Still best effort (the field stays null on failure), but the
+            // failure is reported through the class logger.
+            logger.log(Level.SEVERE,
+                "could not encode CRLF as " + DEFAULT_ENCODING, e);
+        }
+    }
+
+    /**
+     * Temporarily accumulates stats managed externally by
+     * {@link WARCWriterProcessor}. WARCWriterProcessor will call
+     * {@link #resetTmpStats()}, write some records, then add
+     * {@link #getTmpStats()} into its long-term running totals.
+     *
+     * Outer key is a record type name or {@link #TOTALS}; inner key is one
+     * of {@link #NUM_RECORDS}, {@link #CONTENT_BYTES}, {@link #TOTAL_BYTES}
+     * or {@link #SIZE_ON_DISK}.  Created lazily by the first tally.
+     */
+    private Map<String, Map<String, Long>> tmpStats;
+
+    /** Temporarily accumulates info on written warc records for use externally. */
+    private LinkedList<WARCRecordInfo> tmpRecordLog =
+        new LinkedList<WARCRecordInfo>();
+
+    /**
+     * Constructor.
+     * Takes a stream. Use with caution. There is no upperbound check on size.
+     * Will just keep writing.  Only pass Streams that are bounded.
+     * @param serialNo  used to generate unique file name sequences
+     * @param out Where to write.
+     * @param f File the out is connected to.
+     * @param settings Settings for this writer; handed to the superclass.
+     * @throws IOException
+     */
+    public WARCWriter(final AtomicInteger serialNo,
+            final OutputStream out, final File f,
+            final WARCWriterPoolSettings settings)
+    throws IOException {
+        super(serialNo, out, f, settings);
+    }
+
+    /**
+     * Constructor.
+     *
+     * @param serialNo  used to generate unique file name sequences
+     * @param settings Settings for this writer; handed to the superclass
+     * together with WARC_FILE_EXTENSION.
+     */
+    public WARCWriter(final AtomicInteger serialNo,
+            final WARCWriterPoolSettings settings) {
+        super(serialNo, settings, WARC_FILE_EXTENSION);
+    }
+
+    /**
+     * Creates the file via the superclass and then writes a warcinfo record
+     * for it.
+     *
+     * @param file File to create.
+     * @return The filename returned by the superclass.
+     * @throws IOException
+     */
+    @Override
+    protected String createFile(File file) throws IOException {
+        String filename = super.createFile(file);
+        writeWarcinfoRecord(filename);
+        return filename;
+    }
+
+    /**
+     * Rejects ISO control characters.
+     *
+     * @param c Character to check.
+     * @param parameter Full value the character came from; only used in the
+     * exception message.
+     * @throws IllegalArgumentException If the character is not acceptable.
+     */
+    protected void baseCharacterCheck(final char c, final String parameter)
+    throws IllegalArgumentException {
+        // TODO: Too strict?  UNICODE control characters?
+        if (Character.isISOControl(c) || !Character.isValidCodePoint(c)) {
+            throw new IllegalArgumentException("Contains illegal character 0x" +
+                Integer.toHexString(c) + ": " + parameter);
+        }
+    }
+
+    /**
+     * Checks that a header value contains neither control characters nor
+     * any white space.
+     *
+     * @param value Value to check.
+     * @return The value, unchanged.
+     * @throws IllegalArgumentException If a disallowed character is found.
+     */
+    protected String checkHeaderValue(final String value)
+    throws IllegalArgumentException {
+        for (int i = 0; i < value.length(); i++) {
+            final char c = value.charAt(i);
+            baseCharacterCheck(c, value);
+            if (Character.isWhitespace(c)) {
+                throw new IllegalArgumentException("Contains disallowed white space 0x" +
+                    Integer.toHexString(c) + ": " + value);
+            }
+        }
+        return value;
+    }
+
+    /**
+     * Normalizes a mimetype for use in a header line: every run of white
+     * space becomes a single space and all other characters must pass
+     * {@link #baseCharacterCheck(char, String)}.
+     *
+     * @param parameter Mimetype (possibly with parameters) to normalize.
+     * @return The normalized value.
+     * @throws IllegalArgumentException If a disallowed character is found.
+     */
+    protected String checkHeaderLineMimetypeParameter(final String parameter)
+    throws IllegalArgumentException {
+        StringBuilder sb = new StringBuilder(parameter.length());
+        boolean wasWhitespace = false;
+        for (int i = 0; i < parameter.length(); i++) {
+            char c = parameter.charAt(i);
+            if (Character.isWhitespace(c)) {
+                // Map all to ' ' and collapse multiples into one.
+                // TODO: Make sure white space occurs in legal location --
+                // before parameter or inside quoted-string.
+                if (wasWhitespace) {
+                    continue;
+                }
+                wasWhitespace = true;
+                c = ' ';
+            } else {
+                wasWhitespace = false;
+                baseCharacterCheck(c, parameter);
+            }
+            sb.append(c);
+        }
+
+        return sb.toString();
+    }
+
+    /**
+     * Builds the header block for a record.  Lines are emitted in this
+     * order: WARC_ID, type, URL (only when not empty), date, any extra
+     * headers, record id, content type (only when the content length is
+     * greater than zero) and content length.  Each line ends with CRLF.
+     *
+     * @param metaRecord Description of the record to write.
+     * @return The header block as a string.
+     * @throws IllegalArgumentException If the URL or mimetype contains a
+     * disallowed character.
+     */
+    protected String createRecordHeader(WARCRecordInfo metaRecord)
+    throws IllegalArgumentException {
+        final StringBuilder sb =
+            new StringBuilder(2048/*A SWAG: TODO: Do analysis.*/);
+        sb.append(WARC_ID).append(CRLF);
+        sb.append(HEADER_KEY_TYPE).append(COLON_SPACE).append(metaRecord.getType()).
+            append(CRLF);
+        // Do not write a subject-uri if not one present.
+        if (!StringUtils.isEmpty(metaRecord.getUrl())) {
+            sb.append(HEADER_KEY_URI).append(COLON_SPACE).
+                append(checkHeaderValue(metaRecord.getUrl())).append(CRLF);
+        }
+        sb.append(HEADER_KEY_DATE).append(COLON_SPACE).
+            append(metaRecord.getCreate14DigitDate()).append(CRLF);
+        if (metaRecord.getExtraHeaders() != null) {
+            for (final Iterator<?> i = metaRecord.getExtraHeaders().iterator(); i.hasNext();) {
+                sb.append(i.next()).append(CRLF);
+            }
+        }
+
+        sb.append(HEADER_KEY_ID).append(COLON_SPACE).append('<').
+            append(metaRecord.getRecordId().toString()).append('>').append(CRLF);
+        if (metaRecord.getContentLength() > 0) {
+            sb.append(CONTENT_TYPE).append(COLON_SPACE).append(
+                checkHeaderLineMimetypeParameter(metaRecord.getMimetype())).append(CRLF);
+        }
+        sb.append(CONTENT_LENGTH).append(COLON_SPACE).
+            append(Long.toString(metaRecord.getContentLength())).append(CRLF);
+
+        return sb.toString();
+    }
+
+    /**
+     * Writes one record: header block, then (when there is a content stream
+     * and a positive content length) a separator line and the content, then
+     * two terminating blank lines.  Afterwards the write is tallied into the
+     * temporary stats, and the record info is stamped with the WARC filename
+     * and start offset and appended to the temporary record log.
+     *
+     * If the header cannot be built because of an illegal character, the
+     * problem is logged and nothing is written.
+     *
+     * @param recordInfo Description of the record to write.
+     * @throws IOException
+     * @throws IllegalArgumentException If the record has a content length
+     * of zero and no extra headers.
+     */
+    public void writeRecord(WARCRecordInfo recordInfo)
+    throws IOException {
+
+        if (recordInfo.getContentLength() == 0 &&
+                (recordInfo.getExtraHeaders() == null || recordInfo.getExtraHeaders().size() <= 0)) {
+            throw new IllegalArgumentException("Cannot write record " +
+            "of content-length zero and base headers only.");
+        }
+
+        String header;
+        try {
+            header = createRecordHeader(recordInfo);
+
+        } catch (IllegalArgumentException e) {
+            logger.log(Level.SEVERE,"could not write record type: " + recordInfo.getType()
+                    + " for URL: " + recordInfo.getUrl(), e);
+            return;
+        }
+
+        long contentBytes = 0;
+        long totalBytes = 0;
+        long startPosition;
+
+        try {
+            startPosition = getPosition();
+            preWriteRecordTasks();
+
+            // TODO: Revisit encoding of header.
+            byte[] bytes = header.getBytes(WARC_HEADER_ENCODING);
+            write(bytes);
+            totalBytes += bytes.length;
+
+            if (recordInfo.getContentStream() != null && recordInfo.getContentLength() > 0) {
+                // Write out the header/body separator.
+                write(CRLF_BYTES); // TODO: should this be written even for zero-length?
+                totalBytes += CRLF_BYTES.length;
+                contentBytes += copyFrom(recordInfo.getContentStream(),
+                        recordInfo.getContentLength(),
+                        recordInfo.getEnforceLength());
+                totalBytes += contentBytes;
+            }
+
+            // Write out the two blank lines at end of all records.
+            write(CRLF_BYTES);
+            write(CRLF_BYTES);
+            totalBytes += 2 * CRLF_BYTES.length;
+
+            tally(recordInfo.getType(), contentBytes, totalBytes, getPosition() - startPosition);
+
+            recordInfo.setWARCFilename(getFilenameWithoutOccupiedSuffix());
+            recordInfo.setWARCFileOffset(startPosition);
+            tmpRecordLog.add(recordInfo);
+        } finally {
+            postWriteRecordTasks();
+        }
+    }
+
+    /**
+     * @return Name of the current file with a trailing
+     * ArchiveFileConstants.OCCUPIED_SUFFIX removed, if present.
+     */
+    public String getFilenameWithoutOccupiedSuffix() {
+        String name = getFile().getName();
+        if (name.endsWith(ArchiveFileConstants.OCCUPIED_SUFFIX)) {
+            name = name.substring(0, name.length() - ArchiveFileConstants.OCCUPIED_SUFFIX.length());
+        }
+        return name;
+    }
+
+    /**
+     * Adds one record's numbers to the temporary stats, both under the
+     * record type's name and under {@link #TOTALS}.
+     *
+     * If compression is enabled, sizeOnDisk means compressed bytes; if not,
+     * it should be the same as totalBytes (right?)
+     *
+     * @param warcRecordType Type of the record just written.
+     * @param contentBytes Content bytes written.
+     * @param totalBytes Header, separator, content and trailer bytes written.
+     * @param sizeOnDisk Change in writer position caused by the record.
+     */
+    protected void tally(WARCRecordType warcRecordType, long contentBytes, long totalBytes, long sizeOnDisk) {
+        if (tmpStats == null) {
+            tmpStats = new HashMap<String, Map<String, Long>>();
+        }
+
+        // add to stats for this record type
+        subtally(substatsFor(warcRecordType.toString()), contentBytes, totalBytes, sizeOnDisk);
+
+        // add to totals
+        subtally(substatsFor(TOTALS), contentBytes, totalBytes, sizeOnDisk);
+    }
+
+    /** Returns the tmpStats entry for the key, creating it if absent. */
+    private Map<String, Long> substatsFor(final String key) {
+        Map<String, Long> substats = tmpStats.get(key);
+        if (substats == null) {
+            substats = new HashMap<String, Long>();
+            tmpStats.put(key, substats);
+        }
+        return substats;
+    }
+
+    /**
+     * Adds one record's numbers to a single stats map: the record count
+     * goes up by one and the three byte counters by the given amounts.
+     *
+     * @param substats Map to update.
+     * @param contentBytes Added to {@link #CONTENT_BYTES}.
+     * @param totalBytes Added to {@link #TOTAL_BYTES}.
+     * @param sizeOnDisk Added to {@link #SIZE_ON_DISK}.
+     */
+    protected void subtally(Map<String, Long> substats, long contentBytes,
+            long totalBytes, long sizeOnDisk) {
+        addToStat(substats, NUM_RECORDS, 1L);
+        addToStat(substats, CONTENT_BYTES, contentBytes);
+        addToStat(substats, TOTAL_BYTES, totalBytes);
+        addToStat(substats, SIZE_ON_DISK, sizeOnDisk);
+    }
+
+    /** Adds delta to the value under key; a missing value counts as zero. */
+    private static void addToStat(final Map<String, Long> substats,
+            final String key, final long delta) {
+        final Long current = substats.get(key);
+        if (current == null) {
+            substats.put(key, delta);
+        } else {
+            substats.put(key, current.longValue() + delta);
+        }
+    }
+
+    // NOTE(review): 'qualifiers' is a raw Map in this listing; its type
+    // arguments look to have been lost.  Confirm against RecordIDGenerator
+    // before parameterizing it.
+    protected URI generateRecordId(final Map qualifiers)
+    throws IOException {
+        return ((WARCWriterPoolSettings)settings).getRecordIDGenerator().getQualifiedRecordID(qualifiers);
+    }
+
+    protected URI generateRecordId(final String key, final String value)
+    throws IOException {
+        return ((WARCWriterPoolSettings)settings).getRecordIDGenerator().getQualifiedRecordID(key, value);
+    }
+
+    /**
+     * Writes a warcinfo record without a description.
+     *
+     * @param filename Name of the file the record describes.
+     * @return Id of the record written.
+     * @throws IOException
+     */
+    public URI writeWarcinfoRecord(String filename)
+    throws IOException {
+        return writeWarcinfoRecord(filename, null);
+    }
+
+    /**
+     * Writes a warcinfo record.  The body is the concatenation of the
+     * settings' metadata entries, or a placeholder text when the settings
+     * carry no metadata.
+     *
+     * @param filename Name of the file the record describes; a trailing
+     * '.open' style suffix (WriterPoolMember.OCCUPIED_SUFFIX) is stripped.
+     * @param description Optional description header value; skipped when
+     * null or empty.
+     * @return Id of the record written.
+     * @throws IOException
+     */
+    public URI writeWarcinfoRecord(String filename, final String description)
+    throws IOException {
+        WARCRecordInfo recordInfo = new WARCRecordInfo();
+        recordInfo.setType(WARCRecordType.warcinfo);
+        recordInfo.setCreate14DigitDate(ArchiveUtils.getLog14Date());
+        recordInfo.setMimetype("application/warc-fields");
+
+        // Strip .open suffix if present.
+        if (filename.endsWith(WriterPoolMember.OCCUPIED_SUFFIX)) {
+            filename = filename.substring(0,
+                filename.length() - WriterPoolMember.OCCUPIED_SUFFIX.length());
+        }
+        recordInfo.addExtraHeader(HEADER_KEY_FILENAME, filename);
+        if (description != null && description.length() > 0) {
+            recordInfo.addExtraHeader(CONTENT_DESCRIPTION, description);
+        }
+
+        // Add warcinfo body.
+        byte [] warcinfoBody = null;
+        if (settings.getMetadata() == null) {
+            // TODO: What to write into a warcinfo?  What to associate?
+            // Encoded like the metadata branch below rather than with the
+            // platform default charset.
+            warcinfoBody = "TODO: Unimplemented".getBytes(UTF8Bytes.UTF8);
+        } else {
+            ByteArrayOutputStream baos = new ByteArrayOutputStream();
+            for (final Iterator<?> i = settings.getMetadata().iterator();
+                    i.hasNext();) {
+                baos.write(i.next().toString().getBytes(UTF8Bytes.UTF8));
+            }
+            warcinfoBody = baos.toByteArray();
+        }
+        recordInfo.setContentStream(new ByteArrayInputStream(warcinfoBody));
+        recordInfo.setContentLength((long) warcinfoBody.length);
+        recordInfo.setEnforceLength(true);
+
+        recordInfo.setRecordId(generateRecordId(TYPE, WARCRecordType.warcinfo.toString()));
+
+        writeRecord(recordInfo);
+
+        // TODO: If at start of file, and we're writing compressed,
+        // write out our distinctive GZIP extensions.
+        return recordInfo.getRecordId();
+    }
+
+    /**
+     * Sets every value already present in the temporary stats to zero.
+     *
+     * @see WARCWriter#tmpStats for usage model
+     */
+    public void resetTmpStats() {
+        if (tmpStats != null) {
+            for (Map<String, Long> substats : tmpStats.values()) {
+                for (Entry<String, Long> entry : substats.entrySet()) {
+                    entry.setValue(0L);
+                }
+            }
+        }
+    }
+
+    /**
+     * @return The temporary stats map itself (not a copy); null until the
+     * first record has been tallied.
+     */
+    public Map<String, Map<String, Long>> getTmpStats() {
+        return tmpStats;
+    }
+
+    /**
+     * Null-safe lookup in a two-level stats map.
+     *
+     * @param map Stats map; may be null.
+     * @param key Outer key.
+     * @param subkey Inner key.
+     * @return The stored value, or 0 if any level is missing.
+     */
+    public static long getStat(Map<String, Map<String, Long>> map, String key,
+            String subkey) {
+        if (map == null) {
+            return 0L;
+        }
+        final Map<String, Long> substats = map.get(key);
+        if (substats == null) {
+            return 0L;
+        }
+        final Long value = substats.get(subkey);
+        if (value == null) {
+            return 0L;
+        }
+        return value.longValue();
+    }
+
+    /**
+     * Null-safe lookup in a two-level concurrent stats map.
+     *
+     * @param map Stats map; may be null.
+     * @param key Outer key.
+     * @param subkey Inner key.
+     * @return The current value of the counter, or 0 if any level is missing.
+     */
+    public static long getStat(
+            ConcurrentMap<String, ConcurrentMap<String, AtomicLong>> map,
+            String key, String subkey) {
+        if (map == null) {
+            return 0L;
+        }
+        final ConcurrentMap<String, AtomicLong> substats = map.get(key);
+        if (substats == null) {
+            return 0L;
+        }
+        final AtomicLong value = substats.get(subkey);
+        if (value == null) {
+            return 0L;
+        }
+        return value.get();
+    }
+
+    /** Empties the temporary record log. */
+    public void resetTmpRecordLog() {
+        tmpRecordLog.clear();
+    }
+
+    /**
+     * @return The temporary record log itself (not a copy), in the order
+     * the records were written.
+     */
+    public Iterable<WARCRecordInfo> getTmpRecordLog() {
+        return tmpRecordLog;
+    }
+}
diff --git a/src/main/java/org/archive/io/warc/WARCWriterPool.java b/src/main/java/org/archive/io/warc/WARCWriterPool.java
new file mode 100644
index 00000000..fdc97162
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/WARCWriterPool.java
@@ -0,0 +1,64 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.io.warc;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.archive.io.WriterPool;
+import org.archive.io.WriterPoolMember;
+
+
+/**
+ * A pool of WARCWriters.
+ * @contributor stack
+ * @contributor gojomo
+ * @version $Revision: 4566 $ $Date: 2006-08-31 09:51:41 -0700 (Thu, 31 Aug 2006) $
+ */
+public class WARCWriterPool extends WriterPool {
+    /**
+     * Creates a pool whose serial numbers come from a new AtomicInteger.
+     *
+     * @param settings Settings for this pool.
+     * @param poolMaximumActive Handed through to the superclass.
+     * @param poolMaximumWait Handed through to the superclass.
+     */
+    public WARCWriterPool(final WARCWriterPoolSettings settings,
+            final int poolMaximumActive, final int poolMaximumWait) {
+        this(new AtomicInteger(), settings, poolMaximumActive, poolMaximumWait);
+    }
+
+    /**
+     * Creates a pool that uses the supplied serial-number source.
+     *
+     * @param serial  Used to generate unique filename sequences
+     * @param settings Settings for this pool.
+     * @param poolMaximumActive Handed through to the superclass.
+     * @param poolMaximumWait Handed through to the superclass.
+     */
+    public WARCWriterPool(final AtomicInteger serial,
+            final WARCWriterPoolSettings settings,
+            final int poolMaximumActive, final int poolMaximumWait) {
+        super(serial, settings, poolMaximumActive, poolMaximumWait);
+    }
+
+    /**
+     * Makes a new WARCWriter from this pool's serial number source and
+     * settings.
+     *
+     * @see org.archive.io.WriterPool#makeWriter()
+     */
+    protected WriterPoolMember makeWriter() {
+        final WARCWriterPoolSettings warcSettings =
+            (WARCWriterPoolSettings) settings;
+        return new WARCWriter(serialNo, warcSettings);
+    }
+}
diff --git a/src/main/java/org/archive/io/warc/WARCWriterPoolSettings.java b/src/main/java/org/archive/io/warc/WARCWriterPoolSettings.java
new file mode 100644
index 00000000..b028a8b7
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/WARCWriterPoolSettings.java
@@ -0,0 +1,32 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.io.warc;
+
+import org.archive.io.WriterPoolSettings;
+import org.archive.uid.RecordIDGenerator;
+
+/**
+ * Settings object for a {@link WARCWriterPool}.
+ * Used creating {@link WARCWriter}s.
+ *
+ * @version $Date: 2010-08-19 17:21:43 -0700 (Thu, 19 Aug 2010) $, $Revision: 6927 $
+ */
+public interface WARCWriterPoolSettings extends WriterPoolSettings {
+    /**
+     * @return The generator to use for record ids.
+     */
+    RecordIDGenerator getRecordIDGenerator();
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/warc/WARCWriterPoolSettingsData.java b/src/main/java/org/archive/io/warc/WARCWriterPoolSettingsData.java
new file mode 100644
index 00000000..d56c9971
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/WARCWriterPoolSettingsData.java
@@ -0,0 +1,40 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.io.warc;
+
+import java.io.File;
+import java.util.List;
+
+import org.archive.io.arc.WriterPoolSettingsData;
+import org.archive.uid.RecordIDGenerator;
+
+/**
+ * {@link WARCWriterPoolSettings} implementation that holds a
+ * {@link RecordIDGenerator} in addition to the values kept by
+ * {@link WriterPoolSettingsData}.
+ */
+public class WARCWriterPoolSettingsData extends WriterPoolSettingsData implements WARCWriterPoolSettings {
+    RecordIDGenerator generator;
+
+    /**
+     * Stores the generator; every other argument is handed to the
+     * superclass constructor unchanged.
+     *
+     * @param prefix Passed to the superclass.
+     * @param template Passed to the superclass.
+     * @param maxFileSizeBytes Passed to the superclass.
+     * @param compress Passed to the superclass.
+     * @param outputDirs Passed to the superclass.
+     * @param metadata Passed to the superclass.
+     * @param generator Generator returned by {@link #getRecordIDGenerator()}.
+     */
+    public WARCWriterPoolSettingsData(String prefix, String template,
+            long maxFileSizeBytes, boolean compress, List outputDirs,
+            List metadata, RecordIDGenerator generator) {
+        super(prefix, template, maxFileSizeBytes, compress, outputDirs, metadata);
+        this.generator = generator;
+    }
+
+    /** @return The generator given to the constructor. */
+    @Override
+    public RecordIDGenerator getRecordIDGenerator() {
+        return this.generator;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/warc/package.html b/src/main/java/org/archive/io/warc/package.html
new file mode 100644
index 00000000..f52aa95b
--- /dev/null
+++ b/src/main/java/org/archive/io/warc/package.html
@@ -0,0 +1,38 @@
+
+
+
+org.archive.io.warc package
+
+
+Experimental WARC Writer and Readers.  Code and specification subject to change
+with no guarantees of backward compatibility: i.e. newer readers
+may not be able to parse WARCs written with older writers. This package
+contains prototyping code for revision 0.12 of the WARC specification.
+See latest revision
+for current state (Version 0.10 code and its documentation have been moved into the
+v10 subpackage).
+
+
+
+Implementation Notes
+Tools
+Initial implementations of Arc2Warc and Warc2Arc
+tools can be found in the package above this one, at
+{@link org.archive.io.Arc2Warc} and {@link org.archive.io.Warc2Arc}
+respectively.  Pass --help to learn how to use each tool.
+
+
+TODO
+
+Is MIME-Version header needed?  MIME Parsers seem fine without (python email
+lib and java mail).
+Should we write out a Content-Transfer-Encoding
+header (Currently we do not). Need section in spec. explicit about our
+interpretation of MIME and deviations (e.g. content-transfer-encoding should
+be assumed binary in case of WARCs, multipart is not disallowed but not
+encouraged, etc.)
+Minor: Do WARC-Version: 0.12 like MIME-Version: 1.0 rather than 
+WARC/0.12 for lead in to an ARCRecord?
+
+
+
+
diff --git a/src/main/java/org/archive/net/DownloadURLConnection.java b/src/main/java/org/archive/net/DownloadURLConnection.java
new file mode 100644
index 00000000..fbcee421
--- /dev/null
+++ b/src/main/java/org/archive/net/DownloadURLConnection.java
@@ -0,0 +1,131 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.net;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.net.URLConnection;
+import java.util.Arrays;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.archive.util.ProcessUtils;
+import org.archive.util.ProcessUtils.ProcessResult;
+
+/**
+ * An URL Connection that pre-downloads URL reference before passing back a
+ * Stream reference.  When closed, it removes the local download file.
+ * @author stack
+ * @version $Date$, $Revision$
+ */
+public abstract class DownloadURLConnection extends URLConnection {
+    private static final String CLASSNAME = DownloadURLConnection.class.getName();
+    private static final Logger LOGGER = Logger.getLogger(CLASSNAME);
+    private static final File TMPDIR =
+        new File(System.getProperty("java.io.tmpdir", "/tmp"));
+    private File downloadFile = null;
+
+    protected DownloadURLConnection(URL u) {
+        super(u);
+    }
+
+    /** @return Script to run, from system property CLASS_NAME + ".path". */
+    protected String getScript() {
+        return System.getProperty(this.getClass().getName() + ".path", "UNDEFINED");
+    }
+
+    /** @return Command to run: the script, the URL path, the local target. */
+    protected String [] getCommand(final URL thisUrl, final File downloadFile) {
+        return new String[] {getScript(), thisUrl.getPath(),
+            downloadFile.getAbsolutePath()};
+    }
+
+    /**
+     * Runs the command from {@link #getCommand(URL, File)} to copy the URL
+     * content into a new temp file, available via {@link #getFile()}.
+     * @throws IOException If making the temp file or running the command fails.
+     */
+    public void connect() throws IOException {
+        if (this.connected) {
+            return;
+        }
+
+        this.downloadFile = File.createTempFile(CLASSNAME, null, TMPDIR);
+        try {
+            String [] cmd = getCommand(this.url, this.downloadFile);
+            if (LOGGER.isLoggable(Level.FINE)) {
+                StringBuilder buffer = new StringBuilder();
+                for (int i = 0; i < cmd.length; i++) {
+                    if (i > 0) {
+                        buffer.append(" ");
+                    }
+                    buffer.append(cmd[i]);
+                }
+                LOGGER.fine("Command: " + buffer.toString());
+            }
+            ProcessResult pr = ProcessUtils.exec(cmd);
+            if (pr.getResult() != 0) {
+                LOGGER.info(Arrays.toString(cmd) + " returned non-zero " + pr.getResult());
+            }
+            // A non-zero exit is only logged; the download is assumed usable.
+            this.connected = true;
+        } catch (IOException ioe) {
+            // Clean up my tmp file.
+            this.downloadFile.delete();
+            this.downloadFile = null;
+            // Rethrow.
+            throw ioe;
+        }
+    }
+
+    public File getFile() {
+        return this.downloadFile;
+    }
+
+    protected void setFile(final File f) {
+        this.downloadFile = f;
+    }
+
+    public InputStream getInputStream() throws IOException {
+        if (!this.connected) {
+            connect();
+        }
+
+        // Subclass BufferedInputStream so every stream method is inherited
+        // and only close() is overridden, to remove the local download.
+        return new BufferedInputStream(new FileInputStream(this.downloadFile)) {
+            public void close() throws IOException {
+                try {
+                    super.close();
+                } finally {
+                    // Remove the temp file even when closing the stream fails.
+                    final File f = DownloadURLConnection.this.getFile();
+                    if (f != null && f.exists()) {
+                        f.delete();
+                        DownloadURLConnection.this.setFile(null);
+                    }
+                }
+            }
+        };
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/net/FTPException.java b/src/main/java/org/archive/net/FTPException.java
new file mode 100644
index 00000000..2d104390
--- /dev/null
+++ b/src/main/java/org/archive/net/FTPException.java
@@ -0,0 +1,56 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.net;
+
+import java.io.IOException;
+
+/**
+ * Indicates that a FTP operation failed due to a protocol violation.
+ * For instance, if authentication fails.
+ * 
+ * @author pjack
+ */
+public class FTPException extends IOException {
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * The reply code from the FTP server; set once by the constructor.
+     */
+    private final int code;
+
+    /**
+     * Constructs a new FTPException whose message is "FTP error code: "
+     * followed by the given code.
+     *
+     * @param code  the error code from the FTP server
+     */
+    public FTPException(int code) {
+        super("FTP error code: " + code);
+        this.code = code;
+    }
+
+    /**
+     * Returns the code that was passed to the constructor.
+     *
+     * @return  the error code from the FTP server
+     */
+    public int getReplyCode() {
+        return this.code;
+    }
+}
diff --git a/src/main/java/org/archive/url/PublicSuffixes.java b/src/main/java/org/archive/net/PublicSuffixes.java
similarity index 99%
rename from src/main/java/org/archive/url/PublicSuffixes.java
rename to src/main/java/org/archive/net/PublicSuffixes.java
index 7c3df6b8..eab8081a 100644
--- a/src/main/java/org/archive/url/PublicSuffixes.java
+++ b/src/main/java/org/archive/net/PublicSuffixes.java
@@ -17,7 +17,7 @@
  *  limitations under the License.
  */
 
-package org.archive.url;
+package org.archive.net;
 
 import java.io.BufferedReader;
 import java.io.BufferedWriter;
diff --git a/src/main/java/org/archive/net/md5/Handler.java b/src/main/java/org/archive/net/md5/Handler.java
new file mode 100644
index 00000000..8afcdebb
--- /dev/null
+++ b/src/main/java/org/archive/net/md5/Handler.java
@@ -0,0 +1,87 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.net.md5;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLStreamHandler;
+
+/**
+ * A protocol handler for an 'md5' URI scheme.
+ * Md5 URLs look like this: md5:deadbeefdeadbeefdeadbeefdeadbeef
+ * When this handler is invoked against an md5 URL, it passes the raw md5 to 
+ * the configured script as an argument.  The configured script then does the
+ * work to bring the item pointed to by the md5 local so we can open a Stream
+ * on the local copy.  Local file is deleted when we finish. Do
+ * {@link org.archive.net.DownloadURLConnection#getFile()} to get name of
+ * temporary file.
+ * 
+ * You need to define the system property
+ * -Djava.protocol.handler.pkgs=org.archive.net to add this handler
+ * to the java.net.URL set. Also define system properties
+ * -Dorg.archive.net.md5.Md5URLConnection.path=PATH_TO_SCRIPT to
+ * pass path of script to run as well as
+ * -Dorg.archive.net.md5.Md5URLConnection.options=OPTIONS for
+ * any options you'd like to include.  The pointed-to PATH_TO_SCRIPT
+ * will be invoked as follows: PATH_TO_SCRIPT OPTIONS MD5
+ * LOCAL_TMP_FILE.  The LOCAL_TMP_FILE file is made in
+ * java.io.tmpdir using java tmp name code.
+ * @author stack
+ */
+public class Handler extends URLStreamHandler {
+    protected URLConnection openConnection(URL u) {
+        return new Md5URLConnection(u);
+    }
+
+    /**
+     * Main dumps the item named by an md5 URL to STDOUT.
+     * @param args Exactly one argument: the md5 URL to fetch.
+     * @throws IOException If the URL cannot be opened or read.
+     */
+    public static void main(String[] args)
+    throws IOException {
+        if (args.length != 1) {
+            System.out.println("Usage: java " +
+                "-Djava.protocol.handler.pkgs=org.archive.net " +
+                "org.archive.net.md5.Handler " +
+                "md5:deadbeefdeadbeefdeadbeefdeadbeef");
+            System.exit(1);
+        }
+        final String pathKey = "org.archive.net.md5.Md5URLConnection.path";
+        System.setProperty(pathKey, System.getProperty(pathKey, "/tmp/manifest"));
+        System.setProperty("java.protocol.handler.pkgs", "org.archive.net");
+        URL u = new URL(args[0]);
+        URLConnection connect = u.openConnection();
+        // Write download to stdout.
+        final int bufferlength = 4096;
+        byte [] buffer = new byte [bufferlength];
+        InputStream is = connect.getInputStream();
+        try {
+            // Read only in the loop test so the first chunk is not discarded.
+            for (int count; (count = is.read(buffer, 0, bufferlength)) != -1;) {
+                System.out.write(buffer, 0, count);
+            }
+            System.out.flush();
+        } finally {
+            is.close();
+        }
+    }
+}
diff --git a/src/main/java/org/archive/net/md5/Md5URLConnection.java b/src/main/java/org/archive/net/md5/Md5URLConnection.java
new file mode 100644
index 00000000..e4fe98e3
--- /dev/null
+++ b/src/main/java/org/archive/net/md5/Md5URLConnection.java
@@ -0,0 +1,34 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.net.md5;
+
+import java.net.URL;
+
+import org.archive.net.DownloadURLConnection;
+
+/**
+ * Md5 URL connection.
+ * @author stack
+ * @version $Date$, $Revision$
+ */
+public class Md5URLConnection extends DownloadURLConnection { // overrides nothing: inherits the default script and command
+    protected Md5URLConnection(URL u) { // not public; the md5 Handler in this package creates instances
+        super(u); // hands the URL straight to DownloadURLConnection
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/net/rsync/Handler.java b/src/main/java/org/archive/net/rsync/Handler.java
new file mode 100644
index 00000000..9eb35f5d
--- /dev/null
+++ b/src/main/java/org/archive/net/rsync/Handler.java
@@ -0,0 +1,71 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.net.rsync;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLStreamHandler;
+
+/**
+ * A protocol handler that uses native rsync client to do copy.
+ * You need to define the system property
+ * -Djava.protocol.handler.pkgs=org.archive.net to add this handler
+ * to the java.net.URL set.  Assumes rsync is in path.  Define
+ * system property
+ * -Dorg.archive.net.rsync.RsyncURLConnection.path=PATH_TO_RSYNC to
+ * pass path to rsync. Downloads to java.io.tmpdir.
+ * @author stack
+ */
+public class Handler extends URLStreamHandler {
+    protected URLConnection openConnection(URL u) {
+        return new RsyncURLConnection(u);
+    }
+
+    /**
+     * Main dumps rsync file to STDOUT.
+     * @param args Exactly one argument: the rsync URL to fetch.
+     * @throws IOException If the URL cannot be opened or read.
+     */
+    public static void main(String[] args)
+    throws IOException {
+        if (args.length != 1) {
+            System.out.println("Usage: java " +
+                "-Djava.protocol.handler.pkgs=org.archive.net " +
+                "org.archive.net.rsync.Handler RSYNC_URL");
+            System.exit(1);
+        }
+        URL u = new URL(args[0]);
+        URLConnection connect = u.openConnection();
+        // Write download to stdout.
+        final int bufferlength = 4096;
+        byte [] buffer = new byte [bufferlength];
+        InputStream is = connect.getInputStream();
+        try {
+            // Read only in the loop test so the first chunk is not discarded.
+            for (int count; (count = is.read(buffer, 0, bufferlength)) != -1;) {
+                System.out.write(buffer, 0, count);
+            }
+            System.out.flush();
+        } finally {
+            is.close();
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/net/rsync/RsyncURLConnection.java b/src/main/java/org/archive/net/rsync/RsyncURLConnection.java
new file mode 100644
index 00000000..c6097e96
--- /dev/null
+++ b/src/main/java/org/archive/net/rsync/RsyncURLConnection.java
@@ -0,0 +1,51 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.net.rsync;
+
+import java.io.File;
+import java.net.URL;
+
+import org.archive.net.DownloadURLConnection;
+
+/**
+ * Rsync URL connection.
+ * @author stack
+ * @version $Date$, $Revision$
+ */
+public class RsyncURLConnection extends DownloadURLConnection {
+    private final String RSYNC_TIMEOUT =
+        System.getProperty(RsyncURLConnection.class.getName() + ".timeout", "300");
+
+    protected RsyncURLConnection(URL u) {
+        super(u);
+    }
+
+    /** @return rsync executable from property CLASS_NAME + ".path", else "rsync". */
+    @Override
+    protected String getScript() {
+        return System.getProperty(this.getClass().getName() + ".path", "rsync");
+    }
+
+    /** @return rsync invocation copying the path of thisUrl to downloadFile. */
+    @Override
+    protected String[] getCommand(final URL thisUrl, final File downloadFile) {
+        // Use the URL that was passed in rather than reaching for this.url.
+        return new String[] {getScript(), "--timeout=" + RSYNC_TIMEOUT,
+            thisUrl.getPath(), downloadFile.getAbsolutePath()};
+    }
+}
diff --git a/src/main/java/org/archive/uid/RecordIDGenerator.java b/src/main/java/org/archive/uid/RecordIDGenerator.java
new file mode 100644
index 00000000..97f1a022
--- /dev/null
+++ b/src/main/java/org/archive/uid/RecordIDGenerator.java
@@ -0,0 +1,72 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.uid;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.Map;
+
+/**
+ * A record-id generator.
+ *
+ * @contributor stack
+ * @contributor gojomo
+ * @version $Revision$ $Date$
+ */
+public interface RecordIDGenerator {
+	/**
+	 * Supplies a record-id.
+	 * @return A URI that can serve as a record-id.
+	 */
+	public URI getRecordID();
+	
+	/**
+	 * @param qualifiers Qualifiers to add.
+	 * @return A URI qualified with passed qualifiers that can
+	 * serve as a record-id, or, a new, unique record-id without qualifiers
+	 * (if qualifiers not easily implemented using passed URI scheme).
+	 */
+	public URI getQualifiedRecordID(final Map qualifiers);
+	
+	/**
+	 * @param key Name of qualifier
+	 * @param value Value of qualifier
+	 * @return A URI qualified with the passed qualifier that can
+	 * serve as a record-id, or, a new, unique record-id without qualifiers
+	 * (if qualifiers not easily implemented using passed URI scheme).
+	 */
+	public URI getQualifiedRecordID(final String key, final String value);
+	
+	/**
+	 * Append (or if already present, update) qualifiers to passed
+	 * recordId.  Use with caution. Guard against turning up a
+	 * result that already exists.  Use when writing a group of records inside
+	 * a single transaction.
+	 *
+	 * How qualifiers are appended/updated varies with URI scheme. It is allowed
+	 * that an invocation of this method does nought but call
+	 * {@link #getRecordID()}, returning a new URI unrelated to the passed
+	 * recordId and passed qualifiers.
+	 * @param recordId URI to append qualifiers to.
+	 * @param qualifiers Map of qualifier values keyed by qualifier name.
+	 * @return New URI based off passed uri and passed qualifiers.
+	 */
+	public URI qualifyRecordID(final URI recordId,
+	    final Map  qualifiers);
+}
diff --git a/src/main/java/org/archive/uid/UUIDGenerator.java b/src/main/java/org/archive/uid/UUIDGenerator.java
new file mode 100644
index 00000000..26d29e60
--- /dev/null
+++ b/src/main/java/org/archive/uid/UUIDGenerator.java
@@ -0,0 +1,72 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.uid;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.Map;
+import java.util.UUID;
+
+/**
+ * Generates UUIDs, using
+ * {@link java.util.UUID java.util.UUID}, formatted as URNs from the UUID
+ * namespace [See RFC4122].
+ * Here is an example of the type of ID it makes:
+ * urn:uuid:0161811f-5da6-4c6e-9808-a2fab97114cf. Always makes a
+ * new identifier even when passed qualifiers.
+ *
+ * @author stack
+ * @version $Revision$ $Date$
+ * @see RFC4122
+ */
+public class UUIDGenerator implements RecordIDGenerator {
+    private static final String SCHEME = "urn:uuid";
+    private static final String SCHEME_COLON = SCHEME + ":";
+
+    public UUIDGenerator() {
+        super();
+    }
+
+    /** @return A new "urn:uuid:" URI built from a random UUID. */
+    public URI getRecordID() {
+        final String candidate = SCHEME_COLON + UUID.randomUUID().toString();
+        try {
+            return new URI(candidate);
+        } catch (URISyntaxException e) {
+            // should be impossible
+            throw new RuntimeException(e);
+        }
+    }
+
+    /** Qualifiers are ignored: the result comes from getRecordID(). */
+    public URI getQualifiedRecordID(Map qualifiers) {
+        return getRecordID();
+    }
+
+    /** Key and value are ignored: the result comes from getRecordID(). */
+    public URI getQualifiedRecordID(
+            final String key, final String value) {
+        return getRecordID();
+    }
+
+    /** Both arguments are ignored: the result comes from getRecordID(). */
+    public URI qualifyRecordID(URI recordId, final Map qualifiers) {
+        return getRecordID();
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/uid/package.html b/src/main/java/org/archive/uid/package.html
new file mode 100644
index 00000000..dc49f07b
--- /dev/null
+++ b/src/main/java/org/archive/uid/package.html
@@ -0,0 +1,28 @@
+
+
+
+org.archive.uid package
+
+
+A unique ID generator.
+Default is {@link org.archive.uid.UUIDGenerator}.  
+To use another ID Generator, set the System Property
+org.archive.uid.GeneratorFactory.generator to point
+at an alternate implementation of {@link org.archive.uid.RecordIDGenerator}.
+
+
TODO
+
+    MIME boundaries have upper-bound of 70 characters total including
+    'blank line' (CRLFCRLF) and two leading hyphens. Add to
+    {@link org.archive.uid.RecordIDGenerator}
+    interface an upper-bound on generated ID length.
+Add example of an actionable uid generator:
+e.g. http://archive.org/UID-SCHEME/ID
+where scheme might be UUID and an ID might be
+f9472055-fbb6-4810-90e8-68fd39e145a6;type=metadata or,
+using ARK: 
+http://archive.org/ark:/13030/f9472055-fbb6-4810-90e8-68fd39e145a6;type=metadata.
+
+
+
+
diff --git a/src/main/java/org/archive/url/ExtractRule.java b/src/main/java/org/archive/url/ExtractRule.java
new file mode 100644
index 00000000..bcfb3b2f
--- /dev/null
+++ b/src/main/java/org/archive/url/ExtractRule.java
@@ -0,0 +1,45 @@
+package org.archive.url;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class ExtractRule
+{
+    protected String startsWith;
+    protected String regex;
+
+    protected Pattern regexPattern;
+
+    public String getStartsWith() {
+        return this.startsWith;
+    }
+    public void setStartsWith(String startsWith) {
+        this.startsWith = startsWith;
+    }
+    public String getRegex() {
+        return this.regex;
+    }
+    /** Compiles first, so an invalid expression leaves the rule unchanged. */
+    public void setRegex(String regex) {
+        this.regexPattern = Pattern.compile(regex);
+        this.regex = regex;
+    }
+
+    /**
+     * Searches url for the configured regex, honoring the optional prefix.
+     *
+     * @param url Text to examine.
+     * @return Matcher positioned on the first match; null when a non-empty
+     * prefix is set but absent, no regex is configured, or nothing matches.
+     */
+    public Matcher extract(String url)
+    {
+        final boolean prefixSet = (startsWith != null) && !startsWith.isEmpty();
+        if ((prefixSet && !url.startsWith(startsWith)) || regexPattern == null) {
+            return null;
+        }
+
+        final Matcher candidate = regexPattern.matcher(url);
+        return candidate.find() ? candidate : null;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/url/RewriteRule.java b/src/main/java/org/archive/url/RewriteRule.java
new file mode 100644
index 00000000..47292686
--- /dev/null
+++ b/src/main/java/org/archive/url/RewriteRule.java
@@ -0,0 +1,55 @@
+package org.archive.url;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class RewriteRule
+{
+    protected String startsWith;
+    protected String regex;
+    protected String replace;
+
+    protected Pattern regexPattern;
+
+    public String getStartsWith() {
+        return this.startsWith;
+    }
+    public void setStartsWith(String startsWith) {
+        this.startsWith = startsWith;
+    }
+    public String getRegex() {
+        return this.regex;
+    }
+    public void setRegex(String regex) {
+        this.regexPattern = Pattern.compile(regex);
+        this.regex = regex;
+    }
+    public String getReplace() {
+        return this.replace;
+    }
+    public void setReplace(String replace) {
+        this.replace = replace;
+    }
+
+    /**
+     * Rewrites the buffer in place when its whole content matches the regex.
+     * @return true if sb was rewritten; false when the prefix is set but
+     * absent, regex or replacement is unset, or the content does not match.
+     */
+    public boolean rewrite(StringBuilder sb)
+    {
+        final String current = sb.toString();
+        final boolean prefixMissing =
+            (startsWith != null) && !current.startsWith(startsWith);
+        if (prefixMissing || regexPattern == null || replace == null) {
+            return false;
+        }
+
+        final Matcher whole = regexPattern.matcher(current);
+        if (!whole.matches()) {
+            return false;
+        }
+        sb.replace(0, sb.length(), whole.replaceAll(replace));
+        return true;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/url/UrlSurtRangeComputer.java b/src/main/java/org/archive/url/UrlSurtRangeComputer.java
index 74057117..2b960e16 100644
--- a/src/main/java/org/archive/url/UrlSurtRangeComputer.java
+++ b/src/main/java/org/archive/url/UrlSurtRangeComputer.java
@@ -112,7 +112,7 @@ public String[] determineRange(String url, MatchType match, String from, String
 		return new String[]{startKey, endKey, host};
 	}
 	
-	protected String incLastChar(String input)
+	public static String incLastChar(String input)
 	{
         StringBuilder sb = new StringBuilder(input);
         sb.setCharAt(sb.length() - 1, (char)(sb.charAt(sb.length() - 1) + 1));
diff --git a/src/main/java/org/archive/url/WaybackURLKeyMaker.java b/src/main/java/org/archive/url/WaybackURLKeyMaker.java
index 23c67d06..99fb92e9 100644
--- a/src/main/java/org/archive/url/WaybackURLKeyMaker.java
+++ b/src/main/java/org/archive/url/WaybackURLKeyMaker.java
@@ -2,8 +2,6 @@
 
 import java.net.URISyntaxException;
 import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 public class WaybackURLKeyMaker implements URLKeyMaker {
 //	URLCanonicalizer canonicalizer = new NonMassagingIAURLCanonicalizer();
@@ -21,34 +19,6 @@ public void setCanonicalizer(URLCanonicalizer canonicalizer) {
 	
 	protected List customRules;
 	
-	public static class RewriteRule
-	{
-		String startsWith;
-		String regex;
-		String replace;
-		Pattern regexPattern;
-		
-		public String getStartsWith() {
-			return startsWith;
-		}
-		public void setStartsWith(String startsWith) {
-			this.startsWith = startsWith;
-		}
-		public String getRegex() {
-			return regex;
-		}
-		public void setRegex(String regex) {
-			regexPattern = Pattern.compile(regex);
-			this.regex = regex;
-		}
-		public String getReplace() {
-			return replace;
-		}
-		public void setReplace(String replace) {
-			this.replace = replace;
-		}
-	}
-	
 	public WaybackURLKeyMaker()
 	{
 
@@ -117,22 +87,12 @@ public void setCustomRules(List customRules) {
 	
 	protected String applyCustomRules(String urlkey)
 	{
+		StringBuilder sb = new StringBuilder(urlkey);
+		
 		for (RewriteRule rule : customRules) {
-			if ((rule.startsWith != null) && !urlkey.startsWith(rule.startsWith)) {
-				continue;
-			}
-			
-			if (rule.regexPattern == null || rule.replace == null) {
-				continue;
-			}
-			
-			Matcher match = rule.regexPattern.matcher(urlkey);
-			
-			if (match.matches()) {
-				urlkey = match.replaceAll(rule.replace);
-			}
+			rule.rewrite(sb);
 		}
 		
-		return urlkey;
+		return sb.toString();
 	}
 }
diff --git a/src/main/java/org/archive/util/DevUtils.java b/src/main/java/org/archive/util/DevUtils.java
new file mode 100644
index 00000000..d630a0b1
--- /dev/null
+++ b/src/main/java/org/archive/util/DevUtils.java
@@ -0,0 +1,116 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.util;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.logging.Logger;
+
+
+/**
+ * Write a message and stack trace to the 'org.archive.util.DevUtils' logger.
+ *
+ * @author gojomo
+ * @version $Revision$ $Date$
+ */
+public class DevUtils {
+    public static Logger logger =
+        Logger.getLogger(DevUtils.class.getName());
+
+    /**
+     * Log a warning message to the logger 'org.archive.util.DevUtils' made of
+     * the passed 'note' and a stack trace based off passed exception.
+     *
+     * @param ex Exception we print a stacktrace on.
+     * @param note Message to print ahead of the stacktrace.
+     */
+    public static void warnHandle(Throwable ex, String note) {
+        logger.warning(TextUtils.exceptionToString(note, ex));
+    }
+
+    /**
+     * @return Extra information gotten from current Thread.  May not
+     * always be available in which case we return empty string.
+     */
+    public static String extraInfo() {
+        StringWriter sw = new StringWriter();
+        PrintWriter pw = new PrintWriter(sw);
+        final Thread current = Thread.currentThread();
+        if (current instanceof Reporter) {
+            Reporter tt = (Reporter)current;
+            try {
+                tt.reportTo(pw);
+            } catch (IOException e) {
+                // Not really possible w/ a StringWriter
+                e.printStackTrace();
+            }
+        }
+        if (current instanceof ProgressStatisticsReporter) {
+            ProgressStatisticsReporter tt = (ProgressStatisticsReporter)current;
+            try {
+                tt.progressStatisticsLegend(pw);
+                tt.progressStatisticsLine(pw);
+            } catch (IOException e) {
+                // Not really possible w/ a StringWriter
+                e.printStackTrace();
+            }
+        }
+        pw.flush();
+        return sw.toString();
+    }
+
+    /**
+     * Nothing to see here, move along.
+     * @deprecated  This method was never used.
+     */
+    @Deprecated
+    public static void betterPrintStack(RuntimeException re) {
+        re.printStackTrace(System.err);
+    }
+
+    /**
+     * Send this JVM process a SIGQUIT; giving a thread dump and possibly
+     * a heap histogram (if using -XX:+PrintClassHistogram).
+     * Failures are printed, never thrown; an interrupt is re-asserted.
+     * Used to automatically dump info, for example when a serious error
+     * is encountered. Would use 'jmap'/'jstack', but have seen JVM
+     * lockups -- perhaps due to lost thread wake signals -- when using
+     * those against Sun 1.5.0+03 64bit JVM.
+     */
+    public static void sigquitSelf() {
+        try {
+            Process p = Runtime.getRuntime().exec(
+                    new String[] {"perl", "-e", "print getppid(). \"\n\";"});
+            BufferedReader br =
+                new BufferedReader(new InputStreamReader(p.getInputStream()));
+            String ppid = br.readLine();
+            Runtime.getRuntime().exec(
+                    new String[] {"sh", "-c", "kill -3 "+ppid}).waitFor();
+        } catch (IOException e) {
+            // Best-effort diagnostic aid: report the failure and carry on.
+            e.printStackTrace();
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt(); // keep the interrupt visible to callers
+            e.printStackTrace();
+        }
+    }
+}
diff --git a/src/main/java/org/archive/util/FileUtils.java b/src/main/java/org/archive/util/FileUtils.java
new file mode 100644
index 00000000..3de276a9
--- /dev/null
+++ b/src/main/java/org/archive/util/FileUtils.java
@@ -0,0 +1,712 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.util;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.channels.ClosedByInterruptException;
+import java.nio.channels.FileChannel;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Properties;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.filefilter.IOFileFilter;
+import org.apache.commons.lang.math.LongRange;
+
+
+/** Utility methods for manipulating files and directories.
+ *
+ * @contributor John Erik Halse
+ * @contributor gojomo
+ */
+public class FileUtils {
+    /** Logger used by the static utility methods of this class. */
+    private static final Logger LOGGER =
+        Logger.getLogger(FileUtils.class.getName());
+            
+    /**
+     * Constructor made private because all methods of this class are static.
+     */
+    private FileUtils() {
+        super();
+    }
+    
+    /**
+     * Copy the whole src file to the destination, removing any file that
+     * is already present at the destination.
+     *
+     * Delegates to the four-argument copyFile with an extent of -1 and
+     * overwrite enabled.
+     *
+     * @param src File to copy from.
+     * @param dest File to copy to.
+     * @return The value returned by the four-argument copyFile.
+     * @throws FileNotFoundException
+     * @throws IOException
+     */
+    public static boolean copyFile(final File src, final File dest)
+    throws FileNotFoundException, IOException {
+        final long wholeFile = -1;
+        final boolean replaceExisting = true;
+        return copyFile(src, dest, wholeFile, replaceExisting);
+    }
+    
+    /**
+     * Copy at most extent bytes of the source file to the destination,
+     * removing any file that is already present at the destination.
+     *
+     * Delegates to the four-argument copyFile with overwrite enabled.
+     *
+     * @param src File to copy from.
+     * @param dest File to copy to.
+     * @param extent Maximum number of bytes to copy
+     * @return The value returned by the four-argument copyFile.
+     * @throws FileNotFoundException
+     * @throws IOException
+     */
+    public static boolean copyFile(final File src, final File dest,
+        long extent)
+    throws FileNotFoundException, IOException {
+        final boolean replaceExisting = true;
+        return copyFile(src, dest, extent, replaceExisting);
+    }
+
+	/**
+     * Copy up to extent bytes of the source file to the destination.
+     *
+     * @param src File to copy from.
+     * @param dest File to copy to.
+     * @param extent Maximum number of bytes to copy; a negative value means
+     * the full size of the source file.
+     * @param overwrite If target file already exists, and this parameter is
+     * true, overwrite target file (We do this by first deleting the target
+     * file before we begin the copy).
+     * @return True if the channel-based copy ran to completion without an
+     * exception. False if dest already existed and overwrite was false (no
+     * copy is made), or if the copy had to fall back to the stream-based
+     * workaround.
+     * @throws FileNotFoundException
+     * @throws IOException
+     */
+    public static boolean copyFile(final File src, final File dest,
+        long extent, final boolean overwrite)
+    throws FileNotFoundException, IOException {
+        boolean result = false;
+        if (LOGGER.isLoggable(Level.FINE)) {
+            LOGGER.fine("Copying file " + src + " to " + dest + " extent " +
+                extent + " exists " + dest.exists());
+        }
+        if (dest.exists()) {
+            if (overwrite) {
+                if (dest.delete()) {
+                    LOGGER.finer(dest.getAbsolutePath() +
+                        " removed before copy.");
+                } else {
+                    // Opening the FileOutputStream below still truncates it.
+                    LOGGER.warning(dest.getAbsolutePath() +
+                        " could not be removed before copy.");
+                }
+            } else {
+                // Already in place and we're not to overwrite.  Return.
+                return result;
+            }
+        }
+        FileInputStream fis = null;
+        FileOutputStream fos = null;
+        FileChannel fcin = null;
+        FileChannel fcout = null;
+        try {
+            // Get channels
+            fis = new FileInputStream(src);
+            fos = new FileOutputStream(dest);
+            fcin = fis.getChannel();
+            fcout = fos.getChannel();
+            if (extent < 0) {
+                extent = fcin.size();
+            }
+
+            // Do the file copy. A single transferTo call may move fewer
+            // bytes than requested, so repeat until extent bytes have been
+            // copied or the source has nothing more to give.
+            long copied = 0;
+            while (copied < extent) {
+                long trans = fcin.transferTo(copied, extent - copied, fcout);
+                if (trans <= 0) {
+                    break;
+                }
+                copied += trans;
+            }
+            result = true; 
+        } catch (IOException e) {
+            // Add more info to the exception. Preserve old stacktrace.
+            // We get 'Invalid argument' on some file copies. See
+            // http://intellij.net/forums/thread.jsp?forum=13&thread=63027&message=853123
+            // for related issue.
+            String message = "Copying " + src.getAbsolutePath() + " to " +
+                dest.getAbsolutePath() + " with extent " + extent +
+                " got IOE: " + e.getMessage();
+            if ((e instanceof ClosedByInterruptException) ||
+                    ((e.getMessage()!=null)
+                            &&e.getMessage().equals("Invalid argument"))) {
+                LOGGER.severe("Failed copy, trying workaround: " + message);
+                workaroundCopyFile(src, dest);
+            } else {
+                IOException newE = new IOException(message);
+                newE.initCause(e);
+                throw newE;
+            }
+        } finally {
+            // finish up; nested so that a failing close cannot prevent the
+            // remaining channels/streams from being closed
+            try {
+                if (fcin != null) {
+                    fcin.close();
+                }
+            } finally {
+                try {
+                    if (fcout != null) {
+                        fcout.close();
+                    }
+                } finally {
+                    try {
+                        if (fis != null) {
+                            fis.close();
+                        }
+                    } finally {
+                        if (fos != null) {
+                            fos.close();
+                        }
+                    }
+                }
+            }
+        }
+        return result;
+    }
+    
+    protected static void workaroundCopyFile(final File src,
+            final File dest)
+    throws IOException {
+        FileInputStream from = null;
+        FileOutputStream to = null;
+        try {
+            from = new FileInputStream(src);
+            to = new FileOutputStream(dest);
+            byte[] buffer = new byte[4096];
+            int bytesRead;
+            while ((bytesRead = from.read(buffer)) != -1) {
+                to.write(buffer, 0, bytesRead);
+            }
+        } finally {
+            if (from != null) {
+                try {
+                    from.close();
+                } catch (IOException e) {
+                    e.printStackTrace();
+                }
+            }
+            if (to != null) {
+                try {
+                    to.close();
+                } catch (IOException e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+    }
+
+    /**
+     * List the entries of a directory whose names begin with the given
+     * prefix, ignoring case.
+     *
+     * @param dir Dir to look in.
+     * @param prefix Start of the file names to look for. Compare is case
+     * insensitive.
+     *
+     * @return Whatever dir.listFiles yields for entries whose name starts
+     * w/ the passed prefix.
+     */
+    public static File [] getFilesWithPrefix(File dir, final String prefix) {
+        return dir.listFiles(new FileFilter() {
+            public boolean accept(File candidate) {
+                final String lowerName = candidate.getName().toLowerCase();
+                final String lowerPrefix = prefix.toLowerCase();
+                return lowerName.startsWith(lowerPrefix);
+            }
+        });
+    }
+
+    /** Get a @link java.io.FileFilter that filters files based on a regular
+     * expression.
+     *
+     * @param regex the regular expression the files must match.
+     * @return the newly created filter.
+     */
+    public static IOFileFilter getRegexFileFilter(String regex) {
+        // Inner class defining the RegexFileFilter
+        class RegexFileFilter implements IOFileFilter {
+            Pattern pattern;
+
+            protected RegexFileFilter(String re) {
+                pattern = Pattern.compile(re);
+            }
+
+            public boolean accept(File pathname) {
+                return pattern.matcher(pathname.getName()).matches();
+            }
+
+            public boolean accept(File dir, String name) {
+                return accept(new File(dir,name));
+            }
+        }
+
+        return new RegexFileFilter(regex);
+    }
+    
+    /**
+     * Check that a file exists and can be read.
+     * @param f File to test.
+     * @return The passed file, when it exists and is readable.
+     * @exception FileNotFoundException If file does not exist or is not
+     * readable.
+     */
+    public static File assertReadable(final File f) throws FileNotFoundException {
+        final String problem;
+        if (!f.exists()) {
+            problem = " does not exist.";
+        } else if (!f.canRead()) {
+            problem = " is not readable.";
+        } else {
+            return f;
+        }
+        throw new FileNotFoundException(f.getAbsolutePath() + problem);
+    }
+    
+    /**
+     * Test whether a file is readable, has the given extension, and starts
+     * with the given magic string.
+     *
+     * @param f File to test.
+     * @param uncompressedExtension Suffix the lower-cased file name must end
+     * with.
+     * @param magic String expected at the start of the file. Each leading
+     * byte of the file is treated as one char and the comparison ignores
+     * case.
+     * @return True if file is readable, has uncompressed extension,
+     * and magic string at file start.
+     * @exception IOException If file not readable or other problem.
+     */
+    public static boolean isReadableWithExtensionAndMagic(final File f, 
+            final String uncompressedExtension, final String magic)
+    throws IOException {
+        FileUtils.assertReadable(f);
+        if (!f.getName().toLowerCase().endsWith(uncompressedExtension)) {
+            return false;
+        }
+        final int magicLength = magic.length();
+        final byte [] b = new byte[magicLength];
+        FileInputStream fis = new FileInputStream(f);
+        try {
+            // One read() call may deliver fewer bytes than asked for, so
+            // keep reading until the buffer is full or the file ends.
+            int filled = 0;
+            while (filled < magicLength) {
+                int read = fis.read(b, filled, magicLength - filled);
+                if (read < 0) {
+                    break;
+                }
+                filled += read;
+            }
+            if (filled < magicLength) {
+                // file is shorter than the magic string
+                return false;
+            }
+        } finally {
+            fis.close();
+        }
+        StringBuilder beginStr = new StringBuilder(magicLength);
+        for (int i = 0; i < magicLength; i++) {
+            beginStr.append((char)b[i]);
+        }
+        return beginStr.toString().equalsIgnoreCase(magic);
+    }
+    
+    /**
+     * Build a File from a path string, resolving it against a context
+     * directory unless the path is already absolute.
+     * 
+     * @param context File used as parent when path is not absolute
+     * @param path String path to make into a File
+     * @return File for path itself when absolute, otherwise for path
+     * under context
+     */
+    public static File maybeRelative(File context, String path) {
+        final File asGiven = new File(path);
+        return asGiven.isAbsolute() ? asGiven : new File(context, path);
+    }
+    
+    /**
+     * Read a Properties instance from a File. The input stream is handed
+     * to ArchiveUtils.closeQuietly whether or not loading succeeds.
+     * 
+     * @param file File to read properties from
+     * @return Properties loaded from the file
+     * @throws IOException If the file cannot be opened or read
+     */
+    public static Properties loadProperties(File file) throws IOException {
+        final Properties loaded = new Properties();
+        final FileInputStream source = new FileInputStream(file);
+        try {
+            loaded.load(source);
+        } finally {
+            ArchiveUtils.closeQuietly(source);
+        }
+        return loaded;
+    }
+    
+    /**
+     * Write a Properties instance to a File, with an empty comments
+     * string. The output stream is handed to ArchiveUtils.closeQuietly
+     * whether or not storing succeeds.
+     *
+     * @param p Properties to write
+     * @param file destination File
+     * @throws IOException If the file cannot be opened or written
+     */
+    public static void storeProperties(Properties p, File file) throws IOException {
+        final FileOutputStream sink = new FileOutputStream(file);
+        try {
+            final String noComments = "";
+            p.store(sink, noComments);
+        } finally {
+            ArchiveUtils.closeQuietly(sink);
+        }
+    }
+
+    /**
+     * If the file exists, rename it out of the way. The new name is the
+     * file's canonical path followed by "." and the result of
+     * ArchiveUtils.get14DigitDate applied to its last-modified time.
+     *
+     * @param file File to move aside
+     * @return True if the file did not exist or was renamed; false if the
+     * rename failed (a warning is logged in that case)
+     * @throws IOException If the canonical path cannot be determined
+     */
+    public static boolean moveAsideIfExists(File file) throws IOException {
+        if (file.exists()) {
+            final String asidePath = file.getCanonicalPath() + "."
+                + ArchiveUtils.get14DigitDate(file.lastModified());
+            final boolean moved = file.renameTo(new File(asidePath));
+            if (!moved) {
+                LOGGER.warning("unable to move aside: "+file+" to "+asidePath);
+            }
+            return moved;
+        }
+        return true;
+    }
+
+    /**
+     * Retrieve a number of lines from the file around the given 
+     * position, as when paging forward or backward through a file. 
+     * 
+     * @param file File to retrieve lines
+     * @param position offset to anchor lines; a negative value is taken
+     *        relative to the end of the file (-1 = last byte)
+     * @param signedDesiredLineCount lines requested; if negative, 
+     *        want this number of lines ending with a line containing
+     *        the position; if positive, want this number of lines,
+     *        all starting at or after position. 
+     * @param lines List to insert found lines
+     * @param lineEstimate int estimate of line size, 0 means use default
+     *        of 128
+     * @return LongRange indicating the file offsets corresponding to 
+     *         the beginning of the first line returned, and the point
+     *         after the end of the last line returned
+     * @throws IOException
+     */
+    @SuppressWarnings("unchecked")
+    public static LongRange pagedLines(File file, long position,
+            int signedDesiredLineCount, List<String> lines, int lineEstimate)
+            throws IOException {
+        // consider negative positions as from end of file; -1 = last byte
+        if (position < 0) {
+            position = file.length() + position; 
+        }
+        
+        // calculate a reasonably sized chunk likely to have all desired lines
+        if(lineEstimate == 0) {
+            lineEstimate = 128; 
+        }
+        int desiredLineCount = Math.abs(signedDesiredLineCount);
+        long startPosition;
+        long fileEnd = file.length();
+        int bufferSize = (desiredLineCount + 5) * lineEstimate; 
+        if(signedDesiredLineCount>0) {
+            // reading forward; include previous char in case line-end
+            startPosition = position - 1;
+        } else {
+            // reading backward
+            startPosition = position - bufferSize + (2 * lineEstimate);
+        }
+        if(startPosition<0) {
+            startPosition = 0; 
+        }
+        if(startPosition+bufferSize > fileEnd) {
+            bufferSize = (int)(fileEnd - startPosition); 
+        }
+
+        // read that reasonable chunk; always release the stream, even when
+        // positioning or reading fails
+        byte[] buf = new byte[bufferSize];
+        FileInputStream fis = new FileInputStream(file);
+        try {
+            fis.getChannel().position(startPosition); 
+            ArchiveUtils.readFully(fis, buf);
+        } finally {
+            IOUtils.closeQuietly(fis);
+        }
+        
+        // find all line starts fully in buffer
+        // (positions after a line-end, per line-end definition in 
+        // BufferedReader.readLine)
+        LinkedList<Integer> lineStarts = new LinkedList<Integer>();
+        if(startPosition==0) {
+            lineStarts.add(0);
+        }
+        boolean atLineEnd = false; 
+        boolean eatLF = false; 
+        int i; 
+        for(i = 0; i < bufferSize; i++) {
+            if ((char) buf[i] == '\n' && eatLF) {
+                eatLF = false;
+                continue;
+            }
+            if(atLineEnd) {
+                atLineEnd = false; 
+                lineStarts.add(i);
+                if(signedDesiredLineCount<0 && startPosition+i > position) {
+                    // reached next line past position, read no more
+                    break;
+                }
+            }
+            if ((char) buf[i] == '\r') {
+                atLineEnd = true; 
+                eatLF = true; 
+                continue;
+            }
+            if ((char) buf[i] == '\n') {
+                atLineEnd = true; 
+            }
+        }
+        if(startPosition+i == fileEnd) {
+            // add phantom lineStart after end
+            lineStarts.add(bufferSize);
+        }
+        int foundFullLines = lineStarts.size()-1;
+
+        // if found no lines
+        if(foundFullLines<1) {
+            if(signedDesiredLineCount>0) {
+                if(startPosition+bufferSize == fileEnd) {
+                    // nothing more to read: return nothing
+                    return new LongRange(fileEnd,fileEnd);
+                } else {
+                    // retry with larger lineEstimate
+                    return pagedLines(file, position, signedDesiredLineCount, lines, Math.max(bufferSize,lineEstimate));
+                }
+                
+            } else {
+                // try again with much larger line estimate
+                // TODO: fail gracefully before growing to multi-MB buffers
+                return pagedLines(file, position, signedDesiredLineCount, lines, bufferSize);
+            }
+        }
+                
+        // trim unneeded lines
+        // NOTE(review): the next two loop headers were reconstructed from a
+        // single garbled line; confirm against the upstream Heritrix source.
+        while(signedDesiredLineCount>0 && startPosition+lineStarts.getFirst()<position) {
+            // discard lines starting before desired position
+            lineStarts.removeFirst();
+        }
+        while(lineStarts.size()>desiredLineCount+1) {
+            if (signedDesiredLineCount < 0 && (startPosition+lineStarts.get(1) <= position) ) { 
+                // discard from front until reach line containing target position
+                lineStarts.removeFirst();
+            } else {
+                lineStarts.removeLast();
+            }
+        }
+        int firstLine =  lineStarts.getFirst();
+        int partialLine =  lineStarts.getLast(); 
+        LongRange range = new LongRange(startPosition + firstLine, startPosition + partialLine); 
+        List<String> foundLines = 
+            IOUtils.readLines(new ByteArrayInputStream(buf,firstLine,partialLine-firstLine));
+
+        // NOTE(review): condition reconstructed from a truncated line (it
+        // previously read "foundFullLines< 0", which can never hold here);
+        // confirm against the upstream Heritrix source.
+        if(foundFullLines<desiredLineCount && signedDesiredLineCount < 0 && startPosition > 0) {
+            // if needed and reading backward, read more lines from earlier
+            range = expandRange(
+                        range,
+                        pagedLines(file, 
+                                   range.getMinimumLong()-1, 
+                                   signedDesiredLineCount+foundFullLines, 
+                                   lines, 
+                                   bufferSize/foundFullLines));
+            
+        }
+        
+        lines.addAll(foundLines); 
+        
+        if(signedDesiredLineCount < 0 && range.getMaximumLong() < position) {
+            // did not get line containing start position; continue from the
+            // absolute file offset just past the last full line (the
+            // buffer-relative partialLine is only a file offset when
+            // startPosition is 0)
+            range = expandRange(
+                        range,
+                        pagedLines(file,
+                                   range.getMaximumLong(),
+                                   1,
+                                   lines,
+                                   bufferSize/foundFullLines));
+        }
+        
+        if(signedDesiredLineCount > 0 && foundFullLines < desiredLineCount && range.getMaximumLong() < fileEnd) {
+            // need more forward lines
+            range = expandRange(
+                    range,
+                    pagedLines(file,
+                               range.getMaximumLong(),
+                               desiredLineCount - foundFullLines,
+                               lines,
+                               bufferSize/foundFullLines));
+        }
+        
+        return range; 
+    }
+
+    /**
+     * Build the smallest range that covers both given ranges.
+     *
+     * @param range1 first range
+     * @param range2 second range
+     * @return new LongRange from the lower of the two minimums to the
+     * higher of the two maximums
+     */
+    public static LongRange expandRange(LongRange range1, LongRange range2) {
+        final long lowest =
+            Math.min(range1.getMinimumLong(), range2.getMinimumLong());
+        final long highest =
+            Math.max(range1.getMaximumLong(), range2.getMaximumLong());
+        return new LongRange(lowest, highest);
+    }
+
+    /**
+     * Same as the five-argument pagedLines, passing 0 as the line-size
+     * estimate.
+     *
+     * @param file File to retrieve lines
+     * @param position offset to anchor lines
+     * @param signedDesiredLongCount signed number of lines requested
+     * @param lines List to insert found lines
+     * @return LongRange returned by the five-argument pagedLines
+     * @throws IOException
+     */
+    public static LongRange pagedLines(File file, long position, int signedDesiredLongCount, List lines) throws IOException {
+        final int noLineEstimate = 0;
+        return pagedLines(file, position, signedDesiredLongCount, lines, noLineEstimate);
+    }
+    
+    /**
+     * Delete the file now -- but in the event of failure, keep trying
+     * in the future. 
+     * 
+     * VERY IMPORTANT: Do not use with any file whose name/path may be 
+     * reused, because the lagged delete could then wind up deleting the
+     * newer file. Essentially, only to be used with uniquely-named temp
+     * files. 
+     * 
+     * Necessary because some platforms (looking at you, 
+     * JVM-on-Windows) will have deletes fail because of things like 
+     * file-mapped buffers remaining, and there's no explicit way to 
+     * unmap a buffer. (See 6-year-old Sun-stumping Java bug
+     * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4724038 )
+     * We just have to wait and retry. 
+     * 
+     * (Why not just File.deleteOnExit? There could be an arbitrary, 
+     * unbounded number of files in such a situation, that are only 
+     * deletable a few seconds or minutes after our first attempt.
+     * Waiting for JVM exit could mean disk exhaustion. It's also
+     * unclear if the native FS class implementations of deleteOnExit
+     * use RAM per pending file.)
+     * 
+     * Each call retries every file still pending from earlier calls.
+     * 
+     * @param fileToDelete File to delete now or on a later call
+     */
+    public static synchronized void deleteSoonerOrLater(File fileToDelete) {
+        pendingDeletes.add(fileToDelete);
+        // if things are getting out of hand, force gc/finalization
+        if(pendingDeletes.size()>50) {
+            LOGGER.warning(">50 pending Files to delete; forcing gc/finalization");
+            System.gc();
+            System.runFinalization();
+        }
+        // try all pendingDeletes
+        Iterator<File> iter = pendingDeletes.listIterator();
+        while(iter.hasNext()) {
+            File pending = iter.next(); 
+            if(pending.delete()) {
+                iter.remove();
+            }
+        }
+        // if things are still out of hand, complain loudly
+        if(pendingDeletes.size()>50) {
+            LOGGER.severe(">50 pending Files to delete even after gc/finalization");
+        }
+    }
+    /** Files whose deletion was requested but has not succeeded yet. */
+    protected static LinkedList<File> pendingDeletes = new LinkedList<File>();
+
+    /**
+     * Read the entire stream to EOF into the passed file.
+     * Closes the stream when done or if an exception occurs, including
+     * when the destination file cannot be opened.
+     * @param is Stream to read.
+     * @param toFile File to write to.
+     * @return Number of bytes copied, as reported by IOUtils.copyLarge.
+     * @throws IOException 
+     */
+    public static long readFullyToFile(InputStream is, File toFile)
+            throws IOException {
+        OutputStream os = null;
+        try {
+            // opened inside the try so that 'is' is still closed below when
+            // the output file cannot be opened
+            os = org.apache.commons.io.FileUtils.openOutputStream(toFile); 
+            return IOUtils.copyLarge(is, os);
+        } finally {
+            IOUtils.closeQuietly(os); 
+            IOUtils.closeQuietly(is);
+        }
+    }
+
+    /**
+     * Ensure writeable directory, given its path as a String.
+     *
+     * Wraps the path in a File and delegates to the File variant of this
+     * method, which attempts creation if the directory doesn't exist.
+     *
+     * @param dir Path of directory to test for existence and writeability.
+     *
+     * @return File for the passed dir.
+     *
+     * @exception IOException If passed directory does not exist and is not
+     * createable, or directory is not writeable or is not a directory.
+     */
+    public static File ensureWriteableDirectory(String dir)
+    throws IOException {
+        final File asFile = new File(dir);
+        return FileUtils.ensureWriteableDirectory(asFile);
+    }
+
+    /**
+     * Ensure writeable directories.
+     *
+     * Each entry is checked in list order with the File variant of this
+     * method, which attempts creation if the directory doesn't exist.
+     * Checking stops at the first entry that fails.
+     *
+     * @param dirs List of Files to test.
+     *
+     * @return The passed dirs.
+     *
+     * @exception IOException If passed directory does not exist and is not
+     * createable, or directory is not writeable or is not a directory.
+     */
+    public static List<File> ensureWriteableDirectory(List<File> dirs)
+    throws IOException {
+        for (File dir : dirs) {
+            FileUtils.ensureWriteableDirectory(dir);
+        }
+        return dirs;
+    }
+
+    /**
+     * Ensure writeable directory.
+     *
+     * A path that doesn't exist is created, along with any missing
+     * parents. A path that exists must be writeable and must be a
+     * directory; writeability is checked first.
+     *
+     * @param dir Directory to test for existence and writeability.
+     *
+     * @return The passed dir.
+     *
+     * @exception IOException If passed directory does not exist and is not
+     * createable, or directory is not writeable or is not a directory.
+     */
+    public static File ensureWriteableDirectory(File dir)
+    throws IOException {
+        if (!dir.exists()) {
+            if (!dir.mkdirs()) {
+                throw new IOException("Failed to create directory: " + dir);
+            }
+            return dir;
+        }
+        if (!dir.canWrite()) {
+            throw new IOException("Dir " + dir.getAbsolutePath() +
+                " not writeable.");
+        }
+        if (!dir.isDirectory()) {
+            throw new IOException("Dir " + dir.getAbsolutePath() +
+                " is not a directory.");
+        }
+        return dir;
+    } 
+
+    public static File tryToCanonicalize(File file) {
+        try {
+            return file.getCanonicalFile();
+        } catch (IOException e) {
+            return file;
+        }
+    }
+
+    /**
+     * Append the full contents of one file to the end of another.
+     *
+     * The target is opened in append mode before the source is opened.
+     * Both streams are closed even when opening, reading or writing fails.
+     *
+     * @param fileToAppendTo File that receives the bytes
+     * @param fileToAppendFrom File whose bytes are appended
+     * @throws IOException If either file cannot be opened, or on a read,
+     * write or close failure
+     */
+    public static void appendTo(File fileToAppendTo, File fileToAppendFrom) throws IOException {
+        // optimal io block size according to http://lingrok.org/xref/coreutils/src/ioblksize.h
+        byte[] buf = new byte[65536];
+        FileOutputStream out = null;
+        FileInputStream in = null;
+        try {
+            out = new FileOutputStream(fileToAppendTo, true);
+            in = new FileInputStream(fileToAppendFrom);
+            for (int n = in.read(buf); n > 0; n = in.read(buf)) {
+                out.write(buf, 0, n);
+            }
+            out.flush();
+        } finally {
+            // nested so that a failing close of 'in' cannot leak 'out'
+            try {
+                if (in != null) {
+                    in.close();
+                }
+            } finally {
+                if (out != null) {
+                    out.close();
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/InetAddressUtil.java b/src/main/java/org/archive/util/InetAddressUtil.java
new file mode 100644
index 00000000..585ba772
--- /dev/null
+++ b/src/main/java/org/archive/util/InetAddressUtil.java
@@ -0,0 +1,116 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.archive.util;
+
+import java.net.InetAddress;
+import java.net.NetworkInterface;
+import java.net.SocketException;
+import java.net.UnknownHostException;
+import java.util.ArrayList;
+import java.util.Enumeration;
+import java.util.List;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * InetAddress utility.
+ * @author stack
+ * @version $Date$, $Revision$
+ */
+public class InetAddressUtil {
+    private static Logger logger =
+        Logger.getLogger(InetAddressUtil.class.getName());
+    
+    /**
+     * ipv4 address.
+     */
+    public static Pattern IPV4_QUADS = Pattern.compile(
+        "([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})");
+    
+    private InetAddressUtil () {
+        super();
+    }
+    
+    /**
+     * Returns InetAddress for passed host IF its in
+     * IPV4 quads format (e.g. 128.128.128.128).
+     * TODO: Move to an AddressParsingUtil class.
+     * @param host Host name to examine.
+     * @return InetAddress IF the passed name was an IP address, else null.
+     */
+    public static InetAddress getIPHostAddress(String host) {
+        InetAddress result = null;
+        Matcher matcher = IPV4_QUADS.matcher(host);
+        if (matcher == null || !matcher.matches()) {
+            return result;
+        }
+        try {
+            // Doing an Inet.getByAddress() avoids a lookup.
+            result = InetAddress.getByAddress(host,
+                    new byte[] {
+                    (byte)(new Integer(matcher.group(1)).intValue()),
+                    (byte)(new Integer(matcher.group(2)).intValue()),
+                    (byte)(new Integer(matcher.group(3)).intValue()),
+                    (byte)(new Integer(matcher.group(4)).intValue())});
+        } catch (NumberFormatException e) {
+            logger.warning(e.getMessage());
+        } catch (UnknownHostException e) {
+            logger.warning(e.getMessage());
+        }
+        return result;
+    }
+    
+    /**
+     * @return All known local names for this host or null if none found.
+     */
+    public static List getAllLocalHostNames() {
+        List localNames = new ArrayList();
+        Enumeration e = null;
+        try {
+            e = NetworkInterface.getNetworkInterfaces();
+        } catch(SocketException exception) {
+            throw new RuntimeException(exception);
+        }
+        for (; e.hasMoreElements();) {
+            for (Enumeration ee = e.nextElement().getInetAddresses();
+                    ee.hasMoreElements();) {
+                InetAddress ia = ee.nextElement();
+                if (ia != null) {
+                    if (ia.getHostName() != null) {
+                        localNames.add(ia.getCanonicalHostName());
+                    }
+                    if (ia.getHostAddress() !=  null) {
+                        localNames.add(ia.getHostAddress());
+                    }
+                }
+            }
+        }
+        final String localhost = "localhost";
+        if (!localNames.contains(localhost)) {
+            localNames.add(localhost);
+        }
+        final String localhostLocaldomain = "localhost.localdomain";
+        if (!localNames.contains(localhostLocaldomain)) {
+            localNames.add(localhostLocaldomain);
+        }
+        return localNames;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/IterableLineIterator.java b/src/main/java/org/archive/util/IterableLineIterator.java
new file mode 100644
index 00000000..6e0d9dc8
--- /dev/null
+++ b/src/main/java/org/archive/util/IterableLineIterator.java
@@ -0,0 +1,26 @@
+package org.archive.util;
+
+import java.io.Reader;
+import java.util.Iterator;
+
+import org.apache.commons.io.LineIterator;
+
+/**
+ * A LineIterator that also implements Iterable, so that it can be used with
+ * the java enhanced for-each loop syntax.
+ * 
+ * @contributor nlevitt
+ */
+public class IterableLineIterator extends LineIterator 
+    implements Iterable {
+
+    public IterableLineIterator(final Reader reader)
+            throws IllegalArgumentException {
+        super(reader);
+    }
+
+    @SuppressWarnings("unchecked")
+    public Iterator iterator() {
+        return this;
+    }
+}
diff --git a/src/main/java/org/archive/util/LaxHttpParser.java b/src/main/java/org/archive/util/LaxHttpParser.java
new file mode 100644
index 00000000..c1f768f0
--- /dev/null
+++ b/src/main/java/org/archive/util/LaxHttpParser.java
@@ -0,0 +1,242 @@
+/*
+ * $Header: /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/java/org/apache/commons/httpclient/LaxHttpParser.java,v 1.13 2005/01/11 13:57:06 oglueck Exp $
+ * $Revision$
+ * $Date$
+ *
+ * ====================================================================
+ *
+ *  Copyright 1999-2004 The Apache Software Foundation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ *
+ */
+/*
+ * 
+ */
+
+package org.archive.util;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpException;
+import org.apache.commons.httpclient.util.EncodingUtil;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * A Modified version of HttpParser which doesn't throw exceptions on bad header lines
+ * 
+ * A utility class for parsing http header values according to
+ * RFC-2616 Section 4 and 19.3.
+ * 
+ * @author Michael Becke
+ * @author Oleg Kalnichevski
+ * 
+ * @since 2.0beta1
+ */
+public class LaxHttpParser {
+
+    /** Log object for this class. */
+    private static final Log LOG = LogFactory.getLog(LaxHttpParser.class);
+    
+    /**
+     * Constructor for LaxHttpParser; does nothing, and is protected rather than public.
+     */
+    protected LaxHttpParser() { }
+
+    /**
+     * Return byte array from an (unchunked) input stream.
+     * Stop reading when "\n" terminator encountered 
+     * If the stream ends before the line terminator is found,
+     * the last part of the string will still be returned. 
+     * If no input data available, null is returned.
+     *
+     * @param inputStream the stream to read from
+     *
+     * @throws IOException if an I/O problem occurs
+     * @return a byte array from the stream
+     */
+    public static byte[] readRawLine(InputStream inputStream) throws IOException {
+        LOG.trace("enter LaxHttpParser.readRawLine()");
+
+        final ByteArrayOutputStream lineBytes = new ByteArrayOutputStream();
+        // Copy bytes through to, and including, the first LF; stop early only
+        // when the stream itself runs out.
+        for (int b = inputStream.read(); b >= 0; b = inputStream.read()) {
+            lineBytes.write(b);
+            if (b == '\n') { // be tolerant (RFC-2616 Section 19.3)
+                break;
+            }
+        }
+
+        // Nothing read at all means the stream was already exhausted.
+        return (lineBytes.size() == 0) ? null : lineBytes.toByteArray();
+    }
+
+    /**
+     * Read up to "\n" from an (unchunked) input stream.
+     * If the stream ends before the line terminator is found,
+     * the last part of the string will still be returned.
+     * If no input data available, null is returned.
+     *
+     * @param inputStream the stream to read from
+     * @param charset charset of HTTP protocol elements
+     *
+     * @throws IOException if an I/O problem occurs
+     * @return a line from the stream
+     * 
+     * @since 3.0
+     */
+    public static String readLine(InputStream inputStream, String charset) throws IOException {
+        LOG.trace("enter LaxHttpParser.readLine(InputStream, String)");
+        final byte[] raw = readRawLine(inputStream);
+        if (raw == null) {
+            return null;
+        }
+
+        // Find where the line content ends: drop a trailing LF and, when it
+        // directly precedes that LF, a CR as well. A CR that is not followed
+        // by LF is part of the content and is kept.
+        int end = raw.length;
+        if (end > 0 && raw[end - 1] == '\n') {
+            end--;
+            if (end > 0 && raw[end - 1] == '\r') {
+                end--;
+            }
+        }
+
+        // Decode only the content bytes, using the caller's charset.
+        return EncodingUtil.getString(raw, 0, end, charset);
+    }
+
+    /**
+     * Read up to "\n" from an (unchunked) input stream.
+     * If the stream ends before the line terminator is found,
+     * the last part of the string will still be returned.
+     * If no input data available, null is returned
+     *
+     * @param inputStream the stream to read from
+     *
+     * @throws IOException if an I/O problem occurs
+     * @return a line from the stream
+     * 
+     * @deprecated use #readLine(InputStream, String)
+     */
+
+    public static String readLine(InputStream inputStream) throws IOException {
+        LOG.trace("enter LaxHttpParser.readLine(InputStream)");
+        return readLine(inputStream, "US-ASCII"); // delegate with a fixed US-ASCII charset
+    }
+    
+    /**
+     * Parses headers from the given stream.  Headers with the same name are not
+     * combined.
+     * 
+     * @param is the stream to read headers from
+     * @param charset the charset to use for reading the data
+     * 
+     * @return an array of headers in the order in which they were parsed
+     * 
+     * @throws IOException if an IO error occurs while reading from the stream
+     * @throws HttpException if there is an error parsing a header value
+     * 
+     * @since 3.0
+     */
+    public static Header[] parseHeaders(InputStream is, String charset) throws IOException, HttpException {
+        LOG.trace("enter LaxHttpParser.parseHeaders(InputStream, String)");
+
+        ArrayList<Header> headers = new ArrayList<Header>();
+        String name = null;
+        StringBuffer value = null;
+        for (; ;) {
+            String line = LaxHttpParser.readLine(is, charset);
+            if ((line == null) || (line.trim().length() < 1)) {
+                break;
+            }
+
+            // Parse the header name and value
+            // Check for folded headers first
+            // Detect LWS-char see HTTP/1.0 or HTTP/1.1 Section 2.2
+            // discussion on folded headers
+            if ((line.charAt(0) == ' ') || (line.charAt(0) == '\t')) {
+                // we have continuation folded header, so append value
+                // (a continuation seen before any header line is dropped)
+                if (value != null) {
+                    value.append(' ');
+                    value.append(line.trim());
+                }
+            } else {
+                // make sure we save the previous name,value pair if present
+                if (name != null) {
+                    headers.add(new Header(name, value.toString()));
+                }
+
+                // Otherwise we should have normal HTTP header line
+                // Parse the header name and value
+                int colon = line.indexOf(":");
+
+                // START IA/HERITRIX change
+                // Don't throw an exception if can't parse.  We want to keep
+                // going even though header is bad. Rather, create
+                // pseudo-header.
+                if (colon < 0) {
+                    // throw new ProtocolException("Unable to parse header: " +
+                    //      line);
+                    name = "HttpClient-Bad-Header-Line-Failed-Parse";
+                    value = new StringBuffer(line);
+
+                } else {
+                    name = line.substring(0, colon).trim();
+                    value = new StringBuffer(line.substring(colon + 1).trim());
+                }
+                // END IA/HERITRIX change
+            }
+
+        }
+
+        // make sure we save the last name,value pair if present
+        if (name != null) {
+            headers.add(new Header(name, value.toString()));
+        }
+
+        return headers.toArray(new Header[headers.size()]);
+    }
+
+    /**
+     * Parses headers from the given stream.  Headers with the same name are not
+     * combined.
+     * 
+     * @param is the stream to read headers from
+     * 
+     * @return an array of headers in the order in which they were parsed
+     * 
+     * @throws IOException if an IO error occurs while reading from the stream
+     * @throws HttpException if there is an error parsing a header value
+     * 
+     * @deprecated use #parseHeaders(InputStream, String)
+     */
+    public static Header[] parseHeaders(InputStream is) throws IOException, HttpException {
+        LOG.trace("enter LaxHttpParser.parseHeaders(InputStream)");
+        return parseHeaders(is, "US-ASCII"); // delegate with a fixed US-ASCII charset
+    }
+}
diff --git a/src/main/java/org/archive/util/MimetypeUtils.java b/src/main/java/org/archive/util/MimetypeUtils.java
new file mode 100644
index 00000000..adfa1a0f
--- /dev/null
+++ b/src/main/java/org/archive/util/MimetypeUtils.java
@@ -0,0 +1,75 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.util;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Class of mimetype utilities.
+ * @author stack
+ */
+public class MimetypeUtils {
+    /**
+     * Content-type recorded when no usable type is available.
+     *
+     * The 'no-type' value is defined in the ARC file spec at
+     * http://www.archive.org/web/researcher/ArcFileFormat.php.
+     */
+    public static final String NO_TYPE_MIMETYPE = "no-type";
+
+    /**
+     * Captures the leading run of characters before any whitespace, ';' or ','.
+     */
+    protected final static Pattern TRUNCATION_REGEX = Pattern.compile("^([^\\s;,]+).*");
+
+
+    /**
+     * Reduce a raw content-type header value to its bare mimetype token.
+     *
+     * For example {@code "text/html; charset=UTF-8"} becomes {@code "text/html"}.
+     *
+     * The result never contains whitespace, ';' or ','. Multi-part content
+     * type headers are cut at ';', which also strips the encoding; Apache
+     * httpclient collapses values of multiple instances of the header into
+     * one comma-separated value, hence the cut at ','; and current ia_tools
+     * that work with arc files expect 5-column space-separated meta-lines,
+     * hence the cut at whitespace.
+     *
+     * A null value, an empty value, or a value that begins with one of the
+     * delimiters has no leading token and maps to {@link #NO_TYPE_MIMETYPE}.
+     *
+     * @param contentType Raw content-type; may be null.
+     *
+     * @return The leading token of the passed content-type, or
+     * {@link #NO_TYPE_MIMETYPE} when there is none.
+     */
+    public static String truncate(String contentType) {
+        if (contentType == null) {
+            return NO_TYPE_MIMETYPE;
+        }
+        final Matcher leadingToken = TRUNCATION_REGEX.matcher(contentType);
+        // matches() must cover the entire input, so a value with no leading
+        // token is rejected outright rather than searched for a later one.
+        if (!leadingToken.matches()) {
+            return NO_TYPE_MIMETYPE;
+        }
+        return leadingToken.group(1);
+    }
+}
diff --git a/src/main/java/org/archive/util/ProcessUtils.java b/src/main/java/org/archive/util/ProcessUtils.java
new file mode 100644
index 00000000..af792981
--- /dev/null
+++ b/src/main/java/org/archive/util/ProcessUtils.java
@@ -0,0 +1,151 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.util;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.Arrays;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+/**
+ * Class to run an external process.
+ * @author stack
+ * @version $Date$ $Revision$
+ */
+public class ProcessUtils {
+    private static final Logger LOGGER =
+        Logger.getLogger(ProcessUtils.class.getName());
+    
+    protected ProcessUtils() {
+        super();
+    }
+    
+    /**
+     * Thread that drains one stream of a child process into memory; line terminators are not kept.
+     * See http://www.javaworld.com/javaworld/jw-12-2000/jw-1229-traps.html
+     */
+    protected class StreamGobbler extends Thread {
+        private final InputStream is;
+        private final StringBuffer sink = new StringBuffer();
+
+        protected StreamGobbler(InputStream is, String name) {
+            this.is = is;
+            setName(name);
+        }
+
+        public void run() {
+            try {
+                BufferedReader br =
+                    new BufferedReader(new InputStreamReader(this.is));
+                for (String line = null; (line = br.readLine()) != null;) {
+                    this.sink.append(line); // readLine() strips the terminator; none is re-added
+                }
+            } catch (IOException ioe) {
+                LOGGER.log(Level.WARNING, "Failed reading " + getName() + " of child process", ioe);
+            }
+        }
+
+        public String getSink() {
+            return this.sink.toString();
+        }
+    }
+    
+    /**
+     * Holds the outcome of one process exec: argument list, exit code and captured output.
+     * @author stack
+     * @version $Date$ $Revision$
+     */
+    public class ProcessResult {
+        private final String [] args;
+        private final int result;
+        private final String stdout;
+        private final String stderr;
+
+        protected ProcessResult(String [] args, int result, String stdout,
+                    String stderr) {
+            this.args = args;
+            this.result = result;
+            this.stdout = stdout;
+            this.stderr = stderr;
+        }
+
+        public int getResult() {
+            return this.result;
+        }
+
+        public String getStdout() {
+            return this.stdout;
+        }
+
+        public String getStderr() {
+            return this.stderr;
+        }
+
+        public String toString() {
+            // Every argument is followed by ", ", then the exit code, then any non-empty streams.
+            final StringBuilder text = new StringBuilder();
+            for (String arg : this.args) {
+                text.append(arg).append(", ");
+            }
+            final boolean hasErr = this.stderr != null && this.stderr.length() > 0;
+            final boolean hasOut = this.stdout != null && this.stdout.length() > 0;
+            return text.append(" exit code: ").append(this.result)
+                .append(hasErr ? "\nSTDERR: " + this.stderr : "")
+                .append(hasOut ? "\nSTDOUT: " + this.stdout : "").toString();
+        }
+    }
+        
+    /**
+     * Runs process and waits for it, capturing what it writes to stdout and stderr.
+     * @param args List of process args.
+     * @return A ProcessResult data structure.
+     * @throws IOException If interrupted, or if the exit code is non-zero (This may need to change).
+     */
+    public static ProcessUtils.ProcessResult exec(String [] args) throws IOException {
+        Process p = Runtime.getRuntime().exec(args);
+        ProcessUtils pu = new ProcessUtils();
+        // Gobble up any output.
+        StreamGobbler err = pu.new StreamGobbler(p.getErrorStream(), "stderr");
+        err.setDaemon(true);
+        err.start();
+        StreamGobbler out = pu.new StreamGobbler(p.getInputStream(), "stdout");
+        out.setDaemon(true);
+        out.start();
+        int exitVal;
+        try {
+            exitVal = p.waitFor();
+            err.join(); // the gobblers may still be reading after the process exits;
+            out.join(); // wait for them so the captured output is complete
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt(); // keep the interrupt visible to callers
+            throw new IOException("Wait on process " + Arrays.toString(args) + " interrupted: "
+                + e.getMessage(), e);
+        }
+        ProcessResult result = pu.new ProcessResult(args, exitVal, out.getSink(), err.getSink());
+        if (exitVal != 0) {
+            throw new IOException(result.toString());
+        } else if (LOGGER.isLoggable(Level.INFO)) {
+            LOGGER.info(result.toString());
+        }
+        return result;
+    }
+}
diff --git a/src/main/java/org/archive/util/ProgressStatisticsReporter.java b/src/main/java/org/archive/util/ProgressStatisticsReporter.java
new file mode 100644
index 00000000..dc1e51f7
--- /dev/null
+++ b/src/main/java/org/archive/util/ProgressStatisticsReporter.java
@@ -0,0 +1,36 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.util;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+
+public interface ProgressStatisticsReporter {
+    /** Write a progress statistics line.
+     * @param writer Where to write statistics.
+     * @throws IOException presumably when writing fails; left to implementations
+     */
+    public void progressStatisticsLine(PrintWriter writer) throws IOException;
+    
+    /** Write the legend for the progress statistics line.
+     * @param writer Where to write statistics legend.
+     * @throws IOException presumably when writing fails; left to implementations
+     */
+    public void progressStatisticsLegend(PrintWriter writer) throws IOException;
+}
diff --git a/src/main/java/org/archive/util/PropertyUtils.java b/src/main/java/org/archive/util/PropertyUtils.java
new file mode 100644
index 00000000..083615f6
--- /dev/null
+++ b/src/main/java/org/archive/util/PropertyUtils.java
@@ -0,0 +1,114 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.util;
+
+import java.util.Properties;
+import java.util.regex.Matcher;
+
+import org.apache.commons.lang.StringUtils;
+
+/**
+ * Utilities for dealing with Java Properties (incl. System Properties)
+ * 
+ * @contributor stack
+ * @contributor gojomo
+ * @version $Date$ $Revision$
+ */
+public class PropertyUtils {
+    /***
+     * @param key Property key.
+     * @return Named property or null if the property is null or empty.
+     */
+    public static String getPropertyOrNull(final String key) {
+        final String found = System.getProperty(key);
+        return (found != null && found.length() > 0)? found: null;
+    }
+
+    /***
+     * @param key Property key.
+     * @return Boolean value or false if null or unreadable.
+     */
+    public static boolean getBooleanProperty(final String key) {
+        // One lookup only: the old check-then-reread could see the property change in between.
+        return Boolean.parseBoolean(getPropertyOrNull(key)); // parseBoolean(null) is false
+    }
+    
+    /**
+     * @param key Key to use looking up system property.
+     * @param fallback If no value found for passed key, return
+     * fallback.
+     * @return Value of property or fallback.
+     */
+    public static int getIntProperty(final String key, final int fallback) {
+        final String value = getPropertyOrNull(key); // single lookup, so check and parse agree
+        return (value == null)? fallback: Integer.parseInt(value); // non-numeric still throws
+    }
+    
+    /**
+     * Given a string which may contain expressions of the form 
+     * ${key}, replace each expression with the value corresponding to the
+     * given key in System Properties. If no value is present, 
+     * the expression is replaced with the empty-string. 
+     * 
+     * @param original String
+     * @return modified String
+     * @see #interpolateWithProperties(String, Properties...)
+     */
+    public static String interpolateWithProperties(String original) {
+        return interpolateWithProperties(original,System.getProperties());
+    }
+
+    protected static String propRefPattern = "\\$\\{([^{}]+)\\}";
+    
+    /**
+     * Given a string which may contain expressions of the form 
+     * ${key}, replace each expression with the value corresponding to the
+     * given key in the supplied Properties instance. If no value is present, 
+     * the expression is replaced with the empty-string. 
+     * 
+     * @param original String
+     * @param props Properties to try in order; first value found (if any) is used
+     * @return modified String
+     */
+    public static String interpolateWithProperties(String original,
+            Properties... props) {
+        String text = original;
+        // Each pass replaces the first remaining ${key}; replacement text is
+        // itself rescanned on the next pass, so the number of passes is capped
+        // as a guard against an unending loop.
+        final int maxPasses = original.length()*2;
+        for(int pass = 0; pass < maxPasses; pass++) {
+            final Matcher m = TextUtils.getMatcher(propRefPattern, text);
+            if(!m.find()) {
+                break; // nothing left to interpolate
+            }
+            final String key = m.group(1);
+            String replacement = "";
+            for(Properties candidate : props) {
+                replacement = candidate.getProperty(key, "");
+                if(StringUtils.isNotEmpty(replacement)) {
+                    break;
+                }
+            }
+            text = text.substring(0,m.start())
+                        + replacement + text.substring(m.end());
+        }
+        return text;
+    }
+}
diff --git a/src/main/java/org/archive/util/Recorder.java b/src/main/java/org/archive/util/Recorder.java
new file mode 100644
index 00000000..425344bb
--- /dev/null
+++ b/src/main/java/org/archive/util/Recorder.java
@@ -0,0 +1,593 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.util;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.nio.charset.Charset;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.zip.DeflaterInputStream;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.commons.httpclient.ChunkedInputStream;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
+import org.archive.io.GenericReplayCharSequence;
+import org.archive.io.RecordingInputStream;
+import org.archive.io.RecordingOutputStream;
+import org.archive.io.ReplayCharSequence;
+import org.archive.io.ReplayInputStream;
+
+import com.google.common.base.Charsets;
+
+
+/**
+ * Pairs together a RecordingInputStream and RecordingOutputStream
+ * to capture exactly a single HTTP transaction.
+ *
+ * Initially only supports HTTP/1.0 (one request, one response per stream)
+ *
+ * Call {@link #markContentBegin()} to demarc the transition between HTTP
+ * header and body.
+ *
+ * @author gojomo
+ */
+public class Recorder {
+    protected static Logger logger =
+        Logger.getLogger("org.archive.util.HttpRecorder");
+
+    private static final int DEFAULT_OUTPUT_BUFFER_SIZE = 16384;
+    private static final int DEFAULT_INPUT_BUFFER_SIZE = 524288;
+
+    private RecordingInputStream ris = null;
+    private RecordingOutputStream ros = null;
+
+    /**
+     * Backing file basename.
+     *
+     * Keep it around so can clean up backing files left on disk.
+     */
+    private String backingFileBasename = null;
+
+    /**
+     * Backing file output stream suffix.
+     */
+    private static final String RECORDING_OUTPUT_STREAM_SUFFIX = ".ros";
+
+   /**
+    * Backing file input stream suffix.
+    */
+    private static final String RECORDING_INPUT_STREAM_SUFFIX = ".ris";
+
+    /**
+     * recording-input (ris) content character encoding.
+     */
+    protected String characterEncoding = null;
+    
+    /**
+     * Charset to use for CharSequence provision. Will be UTF-8 if no
+     * encoding ever requested; a Charset matching above characterEncoding
+     * if possible; ISO_8859 if above characterEncoding is unsatisfiable. 
+     * TODO: unify to UTF-8 for unspecified and bad-specified cases? 
+     * (current behavior is for consistency with our prior but perhaps not
+     * optimal behavior) 
+     */
+    protected Charset charset = Charsets.UTF_8; 
+    
+    /** whether recording-input (ris) message-body is chunked */
+    protected boolean inputIsChunked = false; 
+
+    /** recording-input (ris) entity content-encoding (eg gzip, deflate), if any */ 
+    protected String contentEncoding = null; 
+    
+    private ReplayCharSequence replayCharSequence;
+
+   
+    /**
+     * Create an HttpRecorder.
+     *
+     * @param tempDir Directory into which we drop backing files for
+     * recorded input and output.
+     * @param backingFilenameBase Backing filename base to which we'll append
+     * suffices ris for recorded input stream and
+     * ros for recorded output stream.
+     * @param outBufferSize Size of output buffer to use.
+     * @param inBufferSize Size of input buffer to use.
+     */
+    public Recorder(File tempDir, String backingFilenameBase, 
+            int outBufferSize, int inBufferSize) {
+        this(new File(ensure(tempDir), backingFilenameBase), // ensure() may throw IllegalStateException
+                outBufferSize, inBufferSize);
+    }
+    
+    
+    private static File ensure(File tempDir) {
+        try {
+            org.archive.util.FileUtils.ensureWriteableDirectory(tempDir);
+            return tempDir;
+        } catch (IOException ioe) {
+            // Surface the checked failure as an unchecked one, cause attached.
+            throw new IllegalStateException(ioe);
+        }
+    }
+    
+    public Recorder(File file, int outBufferSize, int inBufferSize) {
+        super();
+        this.backingFileBasename = file.getAbsolutePath();
+        this.ris = new RecordingInputStream(inBufferSize,
+            this.backingFileBasename + RECORDING_INPUT_STREAM_SUFFIX);
+        this.ros = new RecordingOutputStream(outBufferSize,
+            this.backingFileBasename + RECORDING_OUTPUT_STREAM_SUFFIX);
+    }
+
+    /**
+     * Create an HttpRecorder.
+     * 
+     * @param tempDir
+     *            Directory into which we drop backing files for recorded input
+     *            and output.
+     * @param backingFilenameBase
+     *            Backing filename base to which we'll append suffices
+     *            ris for recorded input stream and
+     *            ros for recorded output stream.
+     */
+    public Recorder(File tempDir, String backingFilenameBase) {
+        this(tempDir, backingFilenameBase, DEFAULT_OUTPUT_BUFFER_SIZE, // outBufferSize
+                DEFAULT_INPUT_BUFFER_SIZE);                             // inBufferSize
+    }
+
+    
+    /**
+     * Wrap the provided stream with the internal RecordingInputStream
+     *
+     * open() throws an exception if RecordingInputStream is already open.
+     *
+     * @param is InputStream to wrap.
+     *
+     * @return The input stream wrapper which itself is an input stream.
+     * Pass this in place of the passed stream so input can be recorded.
+     *
+     * @throws IOException
+     */
+    public InputStream inputWrap(InputStream is) throws IOException {
+        logger.fine(Thread.currentThread().getName() + " wrapping input");
+
+        // Forget what was recorded about the previous input before reopening.
+        this.contentEncoding = null;
+        this.inputIsChunked = false;
+        this.characterEncoding = null;
+
+        final RecordingInputStream recorder = this.ris;
+        recorder.open(is);
+        return recorder;
+    }
+
+    /**
+     * Wrap the provided stream with the internal RecordingOutputStream
+     *
+     * open() throws an exception if RecordingOutputStream is already open.
+     * 
+     * @param os The output stream to wrap.
+     *
+     * @return The output stream wrapper which is itself an output stream.
+     * Pass this in place of the passed stream so output can be recorded.
+     *
+     * @throws IOException
+     */
+    public OutputStream outputWrap(OutputStream os) throws IOException {
+        final RecordingOutputStream recorder = this.ros;
+        recorder.open(os); // wrap the caller's stream in the single shared recorder
+        return recorder;
+    }
+
+    /**
+     * Close all streams.
+     */
+    public void close() {
+        logger.fine(Thread.currentThread().getName() + " closing");
+        try {
+            this.ris.close();
+        } catch (IOException e) {
+            // TODO: Can we not let the exception out of here and report it
+            // higher up in the caller?
+            DevUtils.logger.log(Level.SEVERE, "close() ris" +
+                DevUtils.extraInfo(), e);
+        }
+        try { // separate try, so ros is closed even when closing ris threw an IOException
+            this.ros.close();
+        } catch (IOException e) {
+            DevUtils.logger.log(Level.SEVERE, "close() ros" +
+                DevUtils.extraInfo(), e);
+        }
+    }
+
+    /**
+     * Return the internal RecordingInputStream
+     *
+     * @return The RIS instance that {@link #inputWrap(InputStream)} opens and returns.
+     */
+    public RecordingInputStream getRecordedInput() {
+        return this.ris;
+    }
+
+    /**
+     * @return The RecordingOutputStream that {@link #outputWrap(OutputStream)} opens and returns.
+     */
+    public RecordingOutputStream getRecordedOutput() {
+        return this.ros;
+    }
+
+    /**
+     * Mark current position as the point where the HTTP headers end (forwarded to the RIS).
+     */
+    public void markContentBegin() {
+        this.ris.markContentBegin();
+    }
+
+    public long getResponseContentLength() {
+        return this.ris.getResponseContentLength(); // forwarded to the recording input stream
+    }
+
+    /**
+     * Close both recorders (what we record to) only; unlike {@link #close()}, the streams
+     * being recorded are not closed. The output recorder is closed even if the input one fails.
+     */
+    public void closeRecorders() {
+        try {
+            try {
+                this.ris.closeRecorder();
+            } finally {
+                this.ros.closeRecorder(); // must run even when the call above throws
+            }
+        } catch (IOException e) {
+            DevUtils.warnHandle(e, "Convert to runtime exception?");
+        }
+    }
+
+    /**
+     * Release everything held by this recorder.
+     *
+     * Call once completely finished with the recorder: closes the streams
+     * via {@link #close()}, then removes the output and input backing files
+     * if they exist.
+     */
+    public void cleanup() {
+        close();
+        delete(backingFileBasename + RECORDING_OUTPUT_STREAM_SUFFIX);
+        delete(backingFileBasename + RECORDING_INPUT_STREAM_SUFFIX);
+    }
+
+    /**
+     * Delete file if exists.
+     *
+     * @param name Filename to delete.
+     */
+    private void delete(String name) {
+        File f = new File(name);
+        if (f.exists()) {
+            f.delete();
+        }
+    }
+
+    
+    /**
+     * Per-thread holder for the Recorder in use by the current thread.
+     * Parameterized so that {@link #getHttpRecorder()} can return the value
+     * without a cast.
+     */
+    protected static ThreadLocal<Recorder> currentRecorder =
+        new ThreadLocal<Recorder>();
+
+    /**
+     * Associate a Recorder with the current thread.
+     *
+     * @param httpRecorder Recorder to bind to the calling thread; may be null
+     * to clear the association.
+     */
+    public static void setHttpRecorder(Recorder httpRecorder) {
+        currentRecorder.set(httpRecorder);
+    }
+
+    /**
+     * Get the current thread's HttpRecorder.
+     *
+     * @return This thread's HttpRecorder, or null if none has been set for
+     * the calling thread.
+     */
+    public static Recorder getHttpRecorder() {
+        return currentRecorder.get();
+    }
+
+    /**
+     * Set the Charset used when decoding the input recording.
+     *
+     * @param cs Charset to use for the input recording.
+     */
+    public void setCharset(Charset cs) {
+        charset = cs;
+    }
+    
+    /**
+     * @return the Charset currently configured for the input recording.
+     */
+    public Charset getCharset() {
+        return charset;
+    }
+    
+    /**
+     * Record whether the input message body uses chunked transfer-encoding.
+     *
+     * @param chunked true if the recorded input is chunked.
+     */
+    public void setInputIsChunked(boolean chunked) {
+        inputIsChunked = chunked;
+    }
+    
+    /**
+     * Lower-case content-encoding names accepted by
+     * {@code setContentEncoding(String)}.
+     */
+    protected static Set<String> SUPPORTED_ENCODINGS = new HashSet<String>();
+    static {
+        SUPPORTED_ENCODINGS.add("gzip");
+        SUPPORTED_ENCODINGS.add("x-gzip");
+        SUPPORTED_ENCODINGS.add("deflate");
+        SUPPORTED_ENCODINGS.add("identity");
+        SUPPORTED_ENCODINGS.add("none"); // unofficial but common
+    }
+    /**
+     * Set the declared content-encoding of the input recording.
+     *
+     * The value is matched case-insensitively against SUPPORTED_ENCODINGS and
+     * stored in lower case. Lower-casing uses {@code Locale.ROOT} so that the
+     * result does not depend on the JVM default locale (for example, under a
+     * Turkish locale "GZIP" would otherwise become "gz\u0131p" and be
+     * rejected).
+     *
+     * @param contentEncoding declared content-encoding of input recording;
+     * must not be null.
+     * @throws IllegalArgumentException if the encoding is not supported.
+     */
+    public void setContentEncoding(String contentEncoding) {
+        String lowerCoding = contentEncoding.toLowerCase(java.util.Locale.ROOT);
+        if (!SUPPORTED_ENCODINGS.contains(lowerCoding)) {
+            throw new IllegalArgumentException("contentEncoding unsupported: "+contentEncoding);
+        }
+        this.contentEncoding = lowerCoding;
+    }
+
+    /**
+     * @return the content-encoding most recently stored by
+     * {@link #setContentEncoding(String)}.
+     */
+    public String getContentEncoding() {
+        return contentEncoding;
+    }
+
+    
+    /**
+     * Legacy alias for {@link #getContentReplayCharSequence()}.
+     *
+     * @return the ReplayCharSequence returned by
+     * {@link #getContentReplayCharSequence()}.
+     * @throws IOException if the replay sequence cannot be created.
+     * @deprecated use {@link #getContentReplayCharSequence()}
+     */
+    @Deprecated
+    public ReplayCharSequence getReplayCharSequence() throws IOException {
+        return getContentReplayCharSequence();
+    }
+    
+    /**
+     * Return the cached ReplayCharSequence for the recorded content, creating
+     * (or re-creating) it when none exists, when the cached one is closed, or
+     * when it was built with a Charset other than the one now configured.
+     *
+     * @return A ReplayCharSequence. Caller may call
+     *         {@link ReplayCharSequence#close()} when finished. However, in
+     *         heritrix, the ReplayCharSequence is closed automatically when url
+     *         processing has finished; in that context it's preferable not
+     *         to close, so that processors can reuse the same instance.
+     * @throws IOException
+     * @see #endReplays()
+     */
+    public ReplayCharSequence getContentReplayCharSequence() throws IOException {
+        final boolean reusable = replayCharSequence != null
+                && replayCharSequence.isOpen()
+                && replayCharSequence.getCharset().equals(charset);
+        if (reusable) {
+            return replayCharSequence;
+        }
+        if (replayCharSequence != null && replayCharSequence.isOpen()) {
+            // still open, so it must have been built for a different Charset
+            replayCharSequence.close();
+        }
+        replayCharSequence = getContentReplayCharSequence(this.charset);
+        return replayCharSequence;
+    }
+    
+    
+    /**
+     * Build a new ReplayCharSequence over the recorded content, decoded with
+     * the requested Charset.
+     *
+     * The content replay stream opened here is always closed before this
+     * method returns, including when construction of the char sequence fails.
+     *
+     * @param requestedCharset Charset to decode the recorded content with.
+     * @return A new ReplayCharSequence. Call close on the returned RCS when
+     * done.
+     * @throws IOException if the content cannot be replayed or decoded.
+     */
+    public ReplayCharSequence getContentReplayCharSequence(Charset requestedCharset) throws IOException {
+        // raw data overflows to disk; use temp file
+        InputStream ris = getContentReplayInputStream();
+        try {
+            return new GenericReplayCharSequence(
+                    ris,
+                    calcRecommendedCharBufferSize(this.getRecordedInput()),
+                    this.backingFileBasename + RECORDING_OUTPUT_STREAM_SUFFIX,
+                    requestedCharset);
+        } finally {
+            // close even if the constructor above throws
+            ris.close();
+        }
+    }
+    
+    /**
+     * Calculate a recommended size for an in-memory decoded-character buffer
+     * of this content. We seek a size that is itself no larger (in 2-byte chars)
+     * than the memory already used by the RecordingInputStream's internal raw 
+     * byte buffer, and also no larger than likely necessary. So, we take the 
+     * minimum of the actual recorded byte size and the RecordingInputStream's
+     * max buffer size. 
+     * 
+     * @param inStream
+     * @return int length for in-memory decoded-character buffer
+     */
+    static protected int calcRecommendedCharBufferSize(RecordingInputStream inStream) {
+        return (int) Math.min(inStream.getRecordedBufferLength()/2, inStream.getSize());
+    }
+    
+    /**
+     * Get a raw replay of everything recorded (including, for example, HTTP
+     * protocol headers). Delegates to the recording input stream.
+     *
+     * @return A replay input stream.
+     * @throws IOException
+     */
+    public ReplayInputStream getReplayInputStream() throws IOException {
+        final RecordingInputStream recorded = getRecordedInput();
+        return recorded.getReplayInputStream();
+    }
+    
+    /**
+     * Get a raw replay of the 'message-body'. For the common case of HTTP
+     * this is the raw, possibly chunked-transfer-encoded, message contents
+     * without the leading headers. Delegates to the recording input stream.
+     *
+     * @return A replay input stream.
+     * @throws IOException
+     */
+    public ReplayInputStream getMessageBodyReplayInputStream() throws IOException {
+        final RecordingInputStream recorded = getRecordedInput();
+        return recorded.getMessageBodyReplayInputStream();
+    }
+    
+    /**
+     * Get a raw replay of the 'entity'. For the common case of HTTP this is
+     * the message-body after any (usually-unnecessary) transfer-decoding but
+     * before any content-encoding (eg gzip) decoding.
+     *
+     * When the input has been flagged as chunked, the message-body replay is
+     * wrapped in a ChunkedInputStream; otherwise it is returned as-is.
+     *
+     * @return A replay input stream.
+     * @throws IOException
+     */
+    public InputStream getEntityReplayInputStream() throws IOException {
+        final ReplayInputStream messageBody =
+            getRecordedInput().getMessageBodyReplayInputStream();
+        if (!inputIsChunked) {
+            return messageBody;
+        }
+        return new ChunkedInputStream(messageBody);
+    }
+    
+    /**
+     * Get a replay cued up for the 'content' (after all leading headers),
+     * with any declared content-encoding undone.
+     *
+     * <ul>
+     * <li>no/empty encoding, "identity" or "none": the entity stream as-is;</li>
+     * <li>"gzip"/"x-gzip": the entity stream wrapped for gunzipping; if the
+     * gzip header cannot be read, a fresh raw entity stream is returned
+     * instead;</li>
+     * <li>"deflate": the entity stream wrapped in an inflating (decompressing)
+     * stream.</li>
+     * </ul>
+     *
+     * @return A replay input stream.
+     * @throws IOException
+     */
+    public InputStream getContentReplayInputStream() throws IOException {
+        InputStream entityStream = getEntityReplayInputStream();
+        if(StringUtils.isEmpty(contentEncoding)) {
+            return entityStream;
+        } else if ("gzip".equalsIgnoreCase(contentEncoding) || "x-gzip".equalsIgnoreCase(contentEncoding)) {
+            try {
+                return new GZIPInputStream(entityStream);
+            } catch (IOException ioe) {
+                logger.log(Level.WARNING,"gzip problem; using raw entity instead",ioe);
+                IOUtils.closeQuietly(entityStream); // close partially-read stream
+                return getEntityReplayInputStream();
+            }
+        } else if ("deflate".equalsIgnoreCase(contentEncoding)) {
+            // Must decompress here: DeflaterInputStream would *compress* the
+            // already-deflated entity. Fully qualified to avoid depending on
+            // an import this block cannot see.
+            return new java.util.zip.InflaterInputStream(entityStream);
+        } else if ("identity".equalsIgnoreCase(contentEncoding) || "none".equalsIgnoreCase(contentEncoding)) {
+            return entityStream;
+        } else {
+            // shouldn't be reached given check on setContentEncoding
+            logger.log(Level.INFO,"Unknown content-encoding '"+contentEncoding+"' declared; using raw entity instead");
+            return entityStream;
+        }
+    }
+    
+    /**
+     * Return a short prefix of the presumed-textual content as a String,
+     * decoded with this recorder's currently configured Charset.
+     *
+     * @param size max length of String to return
+     * @return String prefix, or empty String (with logged exception) on any error
+     */
+    public String getContentReplayPrefixString(int size) {
+        return getContentReplayPrefixString(size, charset);
+    }
+    
+    /**
+     * Return a short prefix of the presumed-textual content as a String.
+     *
+     * A single read is attempted, so the result may be shorter than
+     * {@code size} even when more content is available. The reader (and the
+     * replay stream beneath it) is always closed, including when the read
+     * fails.
+     *
+     * @param size max length of String to return
+     * @param cs Charset used to decode the content bytes
+     * @return String prefix, or empty String (with logged exception) on any error
+     */
+    public String getContentReplayPrefixString(int size, Charset cs) {
+        InputStreamReader isr = null;
+        try {
+            isr = new InputStreamReader(getContentReplayInputStream(), cs);
+            char[] chars = new char[size];
+            int count = isr.read(chars);
+            if (count > 0) {
+                return new String(chars,0,count);
+            } else {
+                return "";
+            }
+        } catch (IOException e) {
+            logger.log(Level.SEVERE,"unable to get replay prefix string", e);
+            return "";
+        } finally {
+            // null-safe; guarantees release even when read() throws
+            IOUtils.closeQuietly(isr);
+        }
+    }
+    
+    /**
+     * Copy the recorded content (as returned by
+     * {@link #getContentReplayInputStream()}) into the given file.
+     * Both streams are closed quietly when the copy finishes or fails.
+     *
+     * @param tempFile destination file
+     * @throws IOException if opening either stream or copying fails
+     */
+    public void copyContentBodyTo(File tempFile) throws IOException {
+        InputStream source = null;
+        OutputStream sink = null;
+        try {
+            source = getContentReplayInputStream();
+            sink = FileUtils.openOutputStream(tempFile);
+            IOUtils.copy(source, sink);
+        } finally {
+            IOUtils.closeQuietly(source);
+            IOUtils.closeQuietly(sink);
+        }
+    }
+    
+    /**
+     * Record the input stream for later playback by an extractor, etc.
+     * Convenience method for setting up an artificial HttpRecorder scenario,
+     * e.g. in unit tests: the whole stream is read through a new Recorder
+     * and the wrapped stream is then closed.
+     *
+     * @param dir Directory to write backing file to.
+     * @param basename of what we're recording.
+     * @param in Stream to read.
+     * @param encoding Stream encoding; ignored when null or empty.
+     * @throws IOException
+     * @return An {@link org.archive.util.Recorder}.
+     */
+    public static Recorder wrapInputStreamWithHttpRecord(File dir,
+        String basename, InputStream in, String encoding)
+    throws IOException {
+        Recorder rec = new Recorder(dir, basename);
+        final boolean hasEncoding = encoding != null && encoding.length() > 0;
+        if (hasEncoding) {
+            rec.setCharset(Charset.forName(encoding));
+        }
+        // Do not use FastBufferedInputStream here.  It does not
+        // support mark.
+        InputStream recorded = rec.inputWrap(new BufferedInputStream(in));
+        byte [] scratch = new byte[1024 * 4];
+        while (recorded.read(scratch) != -1) {
+            // Just read it all down; the recorder captures the bytes.
+        }
+        recorded.close();
+        return rec;
+    }
+
+    /**
+     * Finish with any replays: quietly close and drop the cached
+     * ReplayCharSequence, then clear both recording streams for reuse.
+     * IOExceptions from clearing are deliberately ignored (best effort).
+     */
+    public void endReplays() {
+        ArchiveUtils.closeQuietly(replayCharSequence);
+        replayCharSequence = null;
+
+        try {
+            ris.clearForReuse();
+        } catch (IOException ignored) {
+            // like closeQuietly
+        }
+
+        try {
+            ros.clearForReuse();
+        } catch (IOException ignored) {
+            // like closeQuietly
+        }
+    }
+}
diff --git a/src/main/java/org/archive/util/Reporter.java b/src/main/java/org/archive/util/Reporter.java
new file mode 100644
index 00000000..2fcb8cd8
--- /dev/null
+++ b/src/main/java/org/archive/util/Reporter.java
@@ -0,0 +1,56 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.archive.util;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.Map;
+
+public interface Reporter {
+    /**
+     * Write the default report to the given writer.
+     *
+     * @param writer to receive report
+     * @throws IOException if writing the report fails
+     */
+    void reportTo(PrintWriter writer) throws IOException;
+
+    /**
+     * Write a short single-line summary report.
+     *
+     * @param pw to receive report
+     * @throws IOException if writing the report fails
+     */
+    @Deprecated
+    void shortReportLineTo(PrintWriter pw) throws IOException;
+
+    /**
+     * @return Same data that's in the single line report, as key-value pairs
+     */
+    Map shortReportMap();
+
+    /**
+     * Return a legend for the single-line summary report as a String.
+     *
+     * @return String single-line summary legend
+     */
+    String shortReportLegend();
+}
diff --git a/src/main/java/org/archive/util/anvl/ANVLRecord.java b/src/main/java/org/archive/util/anvl/ANVLRecord.java
new file mode 100644
index 00000000..de2d3101
--- /dev/null
+++ b/src/main/java/org/archive/util/anvl/ANVLRecord.java
@@ -0,0 +1,336 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.util.anvl;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.archive.io.UTF8Bytes;
+
+/**
+ * An ordered {@link List} with 'data' {@link Element} values.
+ * ANVLRecords end with a blank line.
+ * 
+ * @see A Name-Value
+ * Language (ANVL)
+ * @author stack
+ */
+public class ANVLRecord extends LinkedList implements UTF8Bytes {
+    private static final Logger logger = 
+        Logger.getLogger(ANVLRecord.class.getName());
+
+	public static final String MIMETYPE = "application/warc-fields";
+	
+	public static final ANVLRecord EMPTY_ANVL_RECORD = new ANVLRecord();
+    
+    /**
+     * Arbitrary upper bound on maximum size of ANVL Record.
+     * Will throw an IOException if exceed this size.
+     */
+    public static final long MAXIMUM_SIZE = 1024 * 10;
+	
+	/**
+	 * An ANVL 'newline'.
+	 * @see http://en.wikipedia.org/wiki/CRLF
+	 */
+    protected static final String CRLF = "\r\n";
+    
+    protected static final String FOLD_PREFIX = CRLF + ' ';
+    
+    public ANVLRecord() {
+        super();
+    }
+
+    public ANVLRecord(Collection c) {
+        super(c);
+    }
+
+    /** @deprecated */
+    public ANVLRecord(int initialCapacity) {
+        super();
+    }
+    
+    public boolean addLabel(final String l) {
+    	return super.add(new Element(new Label(l)));
+    }
+
+    public boolean addLabelValue(final String l, final String v) {
+    	try {
+    		return super.add(new Element(new Label(l), new Value(v)));
+    	} catch (IllegalArgumentException e) {
+    		logger.log(Level.WARNING, "bad label " + l + " or value " + v, e);
+    		return false;
+    	}
+    }
+    
+    @Override
+    public String toString() {
+        // TODO: What to emit for empty ANVLRecord?
+        StringBuilder sb = new StringBuilder();
+        for (final Iterator i = iterator(); i.hasNext();) {
+            sb.append(i.next());
+            sb.append(CRLF);
+        }
+        // 'ANVL Records end in a blank line'.
+        sb.append(CRLF);
+        return sb.toString();
+    }
+    
+    public Map asMap() {
+        Map m = new HashMap(size());
+        for (final Iterator i = iterator(); i.hasNext();) {
+            Element e = i.next();
+            m.put(e.getLabel().toString(),
+                e.isValue()? e.getValue().toString(): (String)null);
+        }
+        return m;
+    }
+    
+    @Override
+    public ANVLRecord clone() {
+        return (ANVLRecord) super.clone();
+    }
+    
+    /**
+     * @return This ANVLRecord as UTF8 bytes.
+     */
+    public byte [] getUTF8Bytes()
+    throws UnsupportedEncodingException {
+        return toString().getBytes(UTF8);
+    }
+    
+    /**
+     * Parses a single ANVLRecord from passed InputStream.
+     * Read as a single-byte stream until we get to a CRLFCRLF which
+     * signifies End-of-ANVLRecord. Then parse all read as a UTF-8 Stream.
+     * Doing it this way, while requiring a double-scan, it  makes it so do not
+     * need to be passed a RepositionableStream or a Stream that supports
+     * marking.  Also no danger of over-reading which can happen when we
+     * wrap passed Stream with an InputStreamReader for doing UTF-8
+     * character conversion (See the ISR class comment).
+     * @param is InputStream
+     * @return An ANVLRecord instance.
+     * @throws IOException
+     */
+    public static ANVLRecord load(final InputStream is)
+    throws IOException {
+        // It doesn't look like a CRLF sequence is possible in UTF-8 without
+    	// it signifying CRLF: The top bits are set in multibyte characters.
+    	// Was thinking of recording CRLF as I was running through this first
+    	// parse but the offsets would then be incorrect if any multibyte
+    	// characters in the intervening gaps between CRLF.
+        boolean isCRLF = false;
+        boolean recordStart = false;
+        ByteArrayOutputStream baos = new ByteArrayOutputStream(1024);
+        boolean done = false;
+        int read = 0;
+        for (int c  = -1, previousCharacter; !done;) {
+            if (read++ >= MAXIMUM_SIZE) {
+                throw new IOException("Read " + MAXIMUM_SIZE +
+                    " bytes without finding  \\r\\n\\r\\n " +
+                    "End-Of-ANVLRecord");
+            }
+            previousCharacter = c;
+            c = is.read();
+            if (c == -1) {
+                throw new IOException("End-Of-Stream before \\r\\n\\r\\n " +
+                    "End-Of-ANVLRecord:\n" +
+                    new String(baos.toByteArray(), UTF8));
+            }
+            if (isLF((char)c) && isCR((char)previousCharacter)) {
+                if (isCRLF) {
+                    // If we just had a CRLF, then its two CRLFs and its end of
+                    // record.  We're done.
+                    done = true;
+                } else {
+                    isCRLF = true;
+                }
+            } else if (!recordStart && Character.isWhitespace(c)) {
+                // Skip any whitespace at start of ANVLRecord.
+                continue;
+            } else {
+                // Clear isCRLF flag if this character is NOT a '\r'.
+                if (isCRLF && !isCR((char)c)) {
+                    isCRLF = false;
+                }
+                // Not whitespace so start record if we haven't already.
+                if (!recordStart) {
+                    recordStart = true;
+                }
+            }
+            baos.write(c);
+        }
+        return load(new String(baos.toByteArray(), UTF8));
+    }
+    
+    /** 
+     * Parse passed String for an ANVL Record.
+     * Looked at writing javacc grammer but preprocessing is required to
+     * handle folding: See
+     * https://javacc.dev.java.net/servlets/BrowseList?list=users&by=thread&from=56173.
+     * Looked at Terence Parr's ANTLR.  More capable.  Can set lookahead count.
+     * A value of 3 would help with folding.  But its a pain defining UNICODE
+     * grammers -- needed by ANVL -- and support seems incomplete
+     * anyways: http://www.doc.ic.ac.uk/lab/secondyear/Antlr/lexer.html#unicode.
+     * For now, go with the below hand-rolled parser.
+     * @param s String with an ANVLRecord.
+     * @return ANVLRecord parsed from passed String.
+     * @throws IOException 
+     */
+    public static ANVLRecord load(final String s)
+    throws IOException {
+        ANVLRecord record = new ANVLRecord();
+        boolean inValue = false, inLabel = false, inComment = false, 
+            inNewLine = false;
+        String label = null;
+        StringBuilder sb = new StringBuilder(s.length());
+        for (int i = 0;  i < s.length(); i++) {
+            char c = s.charAt(i);
+           
+            // Assert I can do look-ahead.
+            if ((i + 1) > s.length()) {
+                throw new IOException("Premature End-of-ANVLRecord:\n" +
+                    s.substring(i));
+            }
+            
+            // If at LF of a CRLF, just go around again. Eat up the LF.
+            if (inNewLine && isLF(c)) {
+                continue;
+            }
+            
+            // If we're at a CRLF and we were just on one, exit. Found Record.
+            if (inNewLine && isCR(c) && isLF(s.charAt(i + 1))) {
+                break;
+            }
+            
+            // Check if we're on a fold inside a Value. Skip multiple white
+            // space after CRLF. 
+            if (inNewLine && inValue && Character.isWhitespace(c)) {
+                continue;
+            }
+            
+            // Else set flag if we're at a CRLF.
+            inNewLine = isCR(c) && isLF(s.charAt(i + 1));
+            
+            if (inNewLine) {
+                if (inComment) {
+                    inComment = false;
+                } else if (label != null && !inValue) {
+					// Label only 'data element'.
+					record.addLabel(label);
+					label = null;
+					sb.setLength(0);
+				} else if (inValue) {
+					// Assert I can do look-ahead past current CRLF.
+					if ((i + 3) > s.length()) {
+						throw new IOException("Premature End-of-ANVLRecord "
+							+ "(2):\n" + s.substring(i));
+					}
+					if (!isCR(s.charAt(i + 2)) && !isLF(s.charAt(i + 3))
+							&& Character.isWhitespace(s.charAt(i + 2))) {
+						// Its a fold.  Let it go around. But add in a CRLF and
+						// space and do it here.  We don't let CRLF fall through
+						// to the sb.append on the end of this loop.
+						sb.append(CRLF);
+						sb.append(' ');
+					} else {
+						// Next line is a new SubElement, a new Comment or
+						// Label.
+						record.addLabelValue(label, sb.toString());
+						sb.setLength(0);
+						label = null;
+						inValue = false;
+					}
+				} else {
+					// We're whitespace between label and value or whitespace
+					// before we've figured whether label or comment.
+				}
+				// Don't let the '\r' or CRLF through.
+				continue;
+			}
+            
+            if (inComment) {
+            	continue;
+            } else if (inLabel) {
+            	if (c == Label.COLON) {
+            		label = sb.toString();
+            		sb.setLength(0);
+            		inLabel = false;
+            		continue;
+            	}
+            } else {
+            	if (!inLabel && !inValue && !inComment) {
+            		// We have no state. Figure one.
+            		if (Character.isWhitespace(c)) {
+            			// If no state, and whitespace, skip. Don't record.
+            			continue;
+            		} else if (label == null && c == '#') {
+            			inComment = true;
+            			// Don't record comments.
+            			continue;
+            		} else if (label == null) {
+            			inLabel = true;
+            		} else {
+            			inValue = true;
+            		}
+            	}
+            }
+			sb.append(c);
+        }
+        return record;
+    }
+    
+    /**
+     * @return Count of ANVLRecord bytes. Be careful, an empty ANVLRecord is
+     * CRLFCRLF so is of size 4.  Also, expensive, since it makes String of
+     * the record so it can count bytes.
+     */
+    public synchronized int getLength() {
+        int length = -1;
+        try {
+            length = getUTF8Bytes().length;
+        } catch (UnsupportedEncodingException e) {
+            throw new RuntimeException(e);
+        }
+        return length;
+    }
+    
+    public static boolean isCROrLF(final char c) {
+        return isCR(c) || isLF(c);
+    }
+    
+    public static boolean isCR(final char c) {
+        return c == ANVLRecord.CRLF.charAt(0);
+    }
+    
+    public static boolean isLF(final char c) {
+        return c == ANVLRecord.CRLF.charAt(1);
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/anvl/Element.java b/src/main/java/org/archive/util/anvl/Element.java
new file mode 100644
index 00000000..5881fa9b
--- /dev/null
+++ b/src/main/java/org/archive/util/anvl/Element.java
@@ -0,0 +1,73 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.util.anvl;
+
+
+/**
+ * ANVL 'data element'.
+ * Holds either a lone {@link Label}, or a {@link Label} followed by a
+ * {@link Value}.
+ *
+ * @author stack
+ * @see "A Name-Value Language (ANVL)"
+ */
+public class Element {
+    /** Index 0 is always the label; index 1, when present, is the value. */
+    private final SubElement [] subElements;
+
+    /** Create a label-only element. */
+    public Element(final Label l) {
+        this.subElements = new SubElement [] {l};
+    }
+
+    /** Create a label-plus-value element. */
+    public Element(final Label l, final Value v) {
+        this.subElements = new SubElement [] {l, v};
+    }
+
+    /**
+     * @return true if this element carries a value as well as a label.
+     */
+    public boolean isValue() {
+        return subElements.length > 1;
+    }
+
+    public Label getLabel() {
+        return (Label) subElements[0];
+    }
+
+    /**
+     * @return the value, or null for a label-only element.
+     */
+    public Value getValue() {
+        return isValue() ? (Value) subElements[1] : null;
+    }
+
+    /**
+     * @return "label:" for a label-only element, "label: value" otherwise.
+     */
+    @Override
+    public String toString() {
+        final StringBuilder text = new StringBuilder();
+        // Label first, always followed by a colon.
+        text.append(subElements[0].toString());
+        text.append(':');
+        if (isValue()) {
+            // A space introduces the value.
+            text.append(' ');
+            text.append(subElements[1].toString());
+        }
+        return text.toString();
+    }
+}
diff --git a/src/main/java/org/archive/util/anvl/Label.java b/src/main/java/org/archive/util/anvl/Label.java
new file mode 100644
index 00000000..fdadb735
--- /dev/null
+++ b/src/main/java/org/archive/util/anvl/Label.java
@@ -0,0 +1,41 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.archive.util.anvl;
+
+/**
+ * The label part of an ANVL 'data element'. In addition to the checks made
+ * by {@link SubElement}, a label may not contain a colon.
+ */
+class Label extends SubElement {
+    public static final char COLON = ':';
+
+    @SuppressWarnings("unused")
+    private Label() {
+        this(null);
+    }
+
+    public Label(final String s) {
+        super(s);
+    }
+
+    /**
+     * Apply the base-class character checks, then reject the colon.
+     *
+     * @throws IllegalArgumentException if the character is not allowed.
+     */
+    @Override
+    protected void checkCharacter(char c, String srcStr, int index) {
+        super.checkCharacter(c, srcStr, index);
+        final boolean isColon = (c == COLON);
+        if (isColon) {
+            throw new IllegalArgumentException("Label cannot contain " + COLON);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/anvl/SubElement.java b/src/main/java/org/archive/util/anvl/SubElement.java
new file mode 100644
index 00000000..33b9e9bb
--- /dev/null
+++ b/src/main/java/org/archive/util/anvl/SubElement.java
@@ -0,0 +1,78 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.util.anvl;
+
+/**
+ * Abstract ANVL 'data element' sub-part.
+ * Subclass to make a Comment, a Label, or a Value.
+ * @author stack
+ */
+abstract class SubElement {
+    private final String e;
+
+    protected SubElement() {
+        this(null);
+    }
+
+    public SubElement(final String s) {
+        this.e = baseCheck(s);
+    }
+
+    protected String baseCheck(final String s) {
+        // Check for null.
+        if (s == null) {
+            throw new IllegalArgumentException("Can't be null");
+        }
+        // Check for CRLF.
+        for (int i = 0; i < s.length(); i++) {
+            checkCharacter(s.charAt(i), s, i);
+        }
+        return s;
+    }
+    
+    protected void checkCharacter(final char c, final String srcStr,
+    		final int index) {
+        checkControlCharacter(c, srcStr, index);
+        checkCRLF(c, srcStr, index);
+    }
+    
+    protected void checkControlCharacter(final char c, final String srcStr,
+            final int index) {
+        if (Character.isISOControl(c) && !Character.isWhitespace(c) ||
+                !Character.isValidCodePoint(c)) {
+            throw new IllegalArgumentException(srcStr +
+                " contains a control character(s) or invalid code point: 0x" +
+                Integer.toHexString(c));
+        }
+    }
+    
+    protected void checkCRLF(final char c, final String srcStr,
+            final int index) {
+        if (ANVLRecord.isCROrLF(c)) {
+            throw new IllegalArgumentException(srcStr +
+                " contains disallowed CRLF control character(s): 0x" +
+                Integer.toHexString(c));
+        }
+    }
+    
+    @Override
+    public String toString() {
+        return e;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/anvl/Value.java b/src/main/java/org/archive/util/anvl/Value.java
new file mode 100644
index 00000000..2a650ba2
--- /dev/null
+++ b/src/main/java/org/archive/util/anvl/Value.java
@@ -0,0 +1,71 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual 
+ *  contributors. 
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.archive.util.anvl;
+
+/**
+ * An ANVL value. CR, LF and CRLF in the supplied text are rewritten as folds.
+ * TODO: Values are stored folded, but perhaps they should only be folded
+ * when serialized (but how to know where to fold?).
+ * @author stack
+ * @version $Date$ $Version$
+ */
+class Value extends SubElement {
+    // sb is set up in baseCheck, which runs from the SubElement constructor.
+    private StringBuilder sb;
+    private boolean folding = false;
+
+    @SuppressWarnings("unused")
+    private Value() {
+        this(null);
+    }
+
+    public Value(final String s) {
+        super(s);
+    }
+
+    /** Returns s with line breaks folded; null is rejected like in SubElement. */
+    @Override
+    protected String baseCheck(String s) {
+        if (s == null) {
+            throw new IllegalArgumentException("Can't be null");
+        }
+        this.sb = new StringBuilder(s.length() * 2);
+        super.baseCheck(s);
+        return sb.toString();
+    }
+
+    /** Validates c, then appends it or a fold prefix; whitespace after a fold is dropped. */
+    @Override
+    protected void checkCharacter(char c, String srcStr, int index) {
+        checkControlCharacter(c, srcStr, index);
+        if (ANVLRecord.isCR(c)) {
+            this.folding = true;
+            this.sb.append(ANVLRecord.FOLD_PREFIX);
+        } else if (ANVLRecord.isLF(c)) {
+            if (!this.folding) { // Already folding: a prefix has been written.
+                this.folding = true;
+                this.sb.append(ANVLRecord.FOLD_PREFIX);
+            }
+        } else if (!this.folding || !Character.isWhitespace(c)) {
+            this.folding = false;
+            this.sb.append(c);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/anvl/package.html b/src/main/java/org/archive/util/anvl/package.html
new file mode 100644
index 00000000..4a2a8963
--- /dev/null
+++ b/src/main/java/org/archive/util/anvl/package.html
@@ -0,0 +1,42 @@
+
+
+
+org.archive.util.anvl package
+
+
+Parsers and Writers for the (expired) Internet-Draft A Name-Value
+Language (ANVL).  Use {@link org.archive.util.anvl.ANVLRecord} 
+to create new instances of ANVL Records and for parsing.
+
+
+Implementation Details
+The ANVL Internet-Draft of 14 February, 2005 is unspecific as to the
+definition of 'blank line' and 'newline'.  This parser implementation
+assumes CRNL.
+
+Says "An element consists of a label, a colon, and an optional value".
+Should that be: "An element consists of a label and an optional value, or a
+comment."
+
+Specification is unclear regarding CR or NL in label or 
+comment (This implementation disallows CR or NL in labels but lets
+them pass in comments).
+
+A grammar would help.  Here is RFC822:
+
+     field       =  field-name ":" [ field-body ] CRLF
+     
+     field-name  =  1*<any CHAR, excluding CTLs, SPACE, and ":">
+     
+     field-body  =  field-body-contents
+                    [CRLF LWSP-char field-body]
+     
+     field-body-contents =
+                   <the ASCII characters making up the field-body, as
+                    defined in the following sections, and consisting
+                    of combinations of atom, quoted-string, and
+                    specials tokens, or else consisting of texts>
+
+
+
+
diff --git a/src/main/java/org/archive/util/binsearch/SeekableLineReaderIterator.java b/src/main/java/org/archive/util/binsearch/SeekableLineReaderIterator.java
index ca443ad4..991553c8 100644
--- a/src/main/java/org/archive/util/binsearch/SeekableLineReaderIterator.java
+++ b/src/main/java/org/archive/util/binsearch/SeekableLineReaderIterator.java
@@ -26,7 +26,7 @@ public String getNextInner() {
 				next = slr.readLine();
 			} catch (IOException e) {
 				if (propagateException) {
-					throw new RuntimeIOException();
+					throw new RuntimeIOException(e.toString());
 				}
 			}
 		}
diff --git a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReader.java b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReader.java
index 63eab9b4..d686a5e2 100644
--- a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReader.java
+++ b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReader.java
@@ -32,6 +32,8 @@ public int getStatus()
 	protected boolean noKeepAlive;
 	protected String cookie;
 	protected String connectedUrl;
+	protected String errHeader;
+	protected String saveErrHeader;
 
 	public abstract String getUrl();
 	
@@ -76,4 +78,20 @@ public String getConnectedUrl()
 	{
 		return connectedUrl;
 	}
+	/** Returns the name of the response header that the HTTP readers copy into errHeader when a load fails; may be null. */
+	public String getSaveErrHeader() {
+		return saveErrHeader;
+	}
+	/** Sets the name of the response header to capture into errHeader when a load fails; null disables capture. */
+	public void setSaveErrHeader(String saveErrHeader) {
+		this.saveErrHeader = saveErrHeader;
+	}
+	/** Returns the header value captured on the last failed load, or null if none was captured. */
+	public String getErrHeader() {
+		return errHeader;
+	}
+	/** Overwrites the captured error-header value. */
+	public void setErrHeader(String errHeader) {
+		this.errHeader = errHeader;
+	}
 }
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java
index c1fa6fb6..b4a23db0 100644
--- a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java
+++ b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java
@@ -4,6 +4,7 @@
 
 import org.archive.util.binsearch.SeekableLineReaderFactory;
 import org.archive.util.binsearch.impl.http.ApacheHttp31SLRFactory;
+import org.archive.util.binsearch.impl.http.ApacheHttp43SLRFactory;
 import org.archive.util.binsearch.impl.http.HTTPURLConnSLRFactory;
 
 public abstract class HTTPSeekableLineReaderFactory implements SeekableLineReaderFactory {
@@ -20,6 +21,7 @@ protected HTTPSeekableLineReaderFactory()
 	public enum HttpLibs
 	{
 		APACHE_31,
+		APACHE_43,
 		URLCONN,
 	}
 		
@@ -50,6 +52,10 @@ public static HTTPSeekableLineReaderFactory getHttpFactory(HttpLibs type, String
 		case URLCONN:
 			factory = new HTTPURLConnSLRFactory();
 			break;
+			
+		case APACHE_43:
+			factory = new ApacheHttp43SLRFactory();
+			break;
 		}
 		
 		if (factory == null) {
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
index 0857bfd6..c4fdbba8 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
@@ -8,6 +8,7 @@
 import org.apache.commons.httpclient.HttpClient;
 import org.apache.commons.httpclient.HttpException;
 import org.apache.commons.httpclient.HttpMethod;
+import org.apache.commons.httpclient.cookie.CookiePolicy;
 import org.apache.commons.httpclient.methods.GetMethod;
 import org.apache.commons.httpclient.methods.HeadMethod;
 import org.apache.commons.io.input.CountingInputStream;
@@ -121,22 +122,28 @@ protected InputStream doSeekLoad(long offset, int maxLength) throws IOException
 			}
 			
 			if (this.getCookie() != null) {
+				activeMethod.getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES);
 				activeMethod.setRequestHeader("Cookie", this.getCookie());
 			}
 			
 			int code = http.executeMethod(activeMethod);
 			
+			connectedUrl = activeMethod.getURI().toString();
+			
 			if ((code != 206) && (code != 200)) {
-				throw new BadHttpStatusException(code, url + " " + rangeHeader);
+				throw new BadHttpStatusException(code, connectedUrl + " " + rangeHeader);
 			}
 			
-			connectedUrl = activeMethod.getURI().toString();
-			
 			InputStream is = activeMethod.getResponseBodyAsStream();
 			cin = new CountingInputStream(is);
 			return cin;
 			
 		} catch (IOException io) {
+			if (saveErrHeader != null) {
+				errHeader = getHeaderValue(saveErrHeader);	
+			}
+			
+			connectedUrl = activeMethod.getURI().toString();
 			doClose();
 			throw io;
 		}
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
index 52e73a94..9bd7542b 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
@@ -3,12 +3,12 @@
 import java.io.IOException;
 import java.text.SimpleDateFormat;
 import java.util.Date;
-import java.util.logging.Level;
 import java.util.logging.Logger;
 
 import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
 import org.apache.commons.httpclient.HostConfiguration;
 import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.HttpConnectionManager;
 import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
 import org.apache.commons.httpclient.params.HttpClientParams;
 import org.archive.util.binsearch.impl.HTTPSeekableLineReader;
@@ -17,7 +17,7 @@
 public class ApacheHttp31SLRFactory extends HTTPSeekableLineReaderFactory {
 	private final static Logger LOGGER = Logger.getLogger(ApacheHttp31SLRFactory.class.getName());
 	
-	private MultiThreadedHttpConnectionManager connectionManager = null;
+	private HttpConnectionManager connectionManager = null;
     private HostConfiguration hostConfiguration = null;
     private HttpClient http = null;
     
@@ -27,6 +27,7 @@ public ApacheHttp31SLRFactory(String uriString) {
 
     public ApacheHttp31SLRFactory() {
     	connectionManager = new MultiThreadedHttpConnectionManager();
+    	//connectionManager = new ThreadLocalHttpConnectionManager();
     	hostConfiguration = new HostConfiguration();
 		HttpClientParams params = new HttpClientParams();
     	http = new HttpClient(params,connectionManager);
@@ -35,15 +36,16 @@ public ApacheHttp31SLRFactory() {
     
     public void close() throws IOException
     {
-    	connectionManager.deleteClosedConnections();
+    	//connectionManager.deleteClosedConnections();
+    	connectionManager.closeIdleConnections(0);
     }
 	
 	@Override
 	public ApacheHttp31SLR get(String url) throws IOException {
 		
-		if (LOGGER.isLoggable(Level.FINEST)) {
-			LOGGER.finest("Connections: " + connectionManager.getConnectionsInPool(hostConfiguration));
-		}
+//		if (LOGGER.isLoggable(Level.FINEST)) {
+//			LOGGER.finest("Connections: " + connectionManager.getConnectionsInPool(hostConfiguration));
+//		}
 		
 		return new ApacheHttp31SLR(http, url);
 	}
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java
new file mode 100644
index 00000000..ef206bb1
--- /dev/null
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java
@@ -0,0 +1,298 @@
+package org.archive.util.binsearch.impl.http;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.SocketAddress;
+import java.net.URL;
+
+import org.apache.http.Header;
+import org.apache.http.HttpException;
+import org.apache.http.HttpRequest;
+import org.apache.http.HttpResponse;
+import org.apache.http.HttpVersion;
+import org.apache.http.impl.DefaultBHttpClientConnection;
+import org.apache.http.message.BasicHttpRequest;
+import org.apache.http.util.EntityUtils;
+import org.archive.util.binsearch.impl.HTTPSeekableLineReader;
+import org.archive.util.zip.GZIPMembersInputStream;
+
+/**
+ * Seekable line reader that issues a ranged HTTP GET over its own socket
+ * using the HttpCore blocking connection, opening a new connection for
+ * every seek.
+ * <p>
+ * NOTE(review): the connection is a plain {@link Socket}, so https URLs are
+ * presumably not usable with this reader -- confirm before relying on it.
+ */
+public class ApacheHttp43SLR extends HTTPSeekableLineReader {
+
+	/** Buffer size, in bytes, handed to the HttpCore connection. */
+	private final static int BUFF_SIZE = 8192;
+
+	/** Upper bound on redirects followed for a single seek. */
+	private final static int MAX_REDIRECTS = 20;
+
+	private String urlString;
+
+	/** Socket connect timeout in milliseconds; 0 means wait indefinitely. */
+	private int connectTimeout = 0;
+	/** Socket read timeout in milliseconds; 0 means wait indefinitely. */
+	private int readTimeout = 0;
+
+	private Socket socket = null;
+	private DefaultBHttpClientConnection activeConn = null;
+	private HttpResponse response = null;
+
+	public ApacheHttp43SLR(String url)
+	{
+		urlString = url;
+	}
+
+	public ApacheHttp43SLR(String url, int connectTimeout, int readTimeout)
+	{
+		this.urlString = url;
+		this.connectTimeout = connectTimeout;
+		this.readTimeout = readTimeout;
+	}
+
+	@Override
+	public String getUrl() {
+		return urlString;
+	}
+
+	/**
+	 * Returns the content length reported for the current response entity,
+	 * or 0 when no response is held.
+	 */
+	@Override
+	public long getSize() throws IOException {
+		if (response == null) {
+			return 0;
+		}
+
+		return response.getEntity().getContentLength();
+	}
+
+	/**
+	 * Returns the value of the first header named {@code headerName} on the
+	 * current response, or null if there is no response or no such header.
+	 */
+	@Override
+	public String getHeaderValue(String headerName) {
+		if (response == null) {
+			return null;
+		}
+
+		Header header = response.getFirstHeader(headerName);
+		if (header == null) {
+			return null;
+		}
+
+		return header.getValue();
+	}
+
+	/** Returns the explicit port of {@code url}, else the protocol default. */
+	protected static int getPort(URL url)
+	{
+		int port = url.getPort();
+
+		if (port > 0) {
+			return port;
+		}
+
+		return url.getDefaultPort();
+	}
+
+	/**
+	 * Opens a connection to {@code url}, sends a GET for the requested range
+	 * and returns the response body stream, following redirects.
+	 *
+	 * @param offset    start offset, passed to makeRangeHeader
+	 * @param maxLength length limit, passed to makeRangeHeader
+	 * @param url       location to fetch
+	 * @throws IOException on connection failure, protocol error, a status
+	 *         other than 200/206, or more than MAX_REDIRECTS redirects
+	 */
+	protected InputStream doSeekLoad(long offset, int maxLength, URL url)
+			throws IOException {
+
+		return doSeekLoad(offset, maxLength, url, MAX_REDIRECTS);
+	}
+
+	/**
+	 * Performs one request/response exchange and either returns the body
+	 * stream or follows a redirect with one fewer hop allowed.
+	 */
+	private InputStream doSeekLoad(long offset, int maxLength, URL url,
+			int redirectsLeft) throws IOException {
+
+		URL redirectUrl = null;
+
+		try {
+			SocketAddress endpoint = new InetSocketAddress(url.getHost(), getPort(url));
+
+			socket = new Socket();
+			socket.connect(endpoint, connectTimeout);
+
+			activeConn = new DefaultBHttpClientConnection(BUFF_SIZE);
+			activeConn.bind(socket);
+			activeConn.setSocketTimeout(readTimeout);
+
+			// A URL with no path (http://host) yields "", which is not a
+			// valid request target; ask for the root instead.
+			String target = url.getFile();
+			if (target.isEmpty()) {
+				target = "/";
+			}
+
+			HttpRequest request = new BasicHttpRequest("GET", target, HttpVersion.HTTP_1_1);
+
+			String rangeHeader = makeRangeHeader(offset, maxLength);
+
+			if (rangeHeader != null) {
+				request.setHeader("Range", rangeHeader);
+			}
+
+			if (this.isNoKeepAlive()) {
+				request.setHeader("Connection", "close");
+			} else {
+				request.setHeader("Connection", "keep-alive");
+			}
+
+			if (this.getCookie() != null) {
+				request.setHeader("Cookie", this.getCookie());
+			}
+
+			request.setHeader("Accept", "*/*");
+
+			// The Host header must carry the port when it is not the default
+			// for the scheme, otherwise name-based virtual hosts may mismatch.
+			String hostHeader = url.getHost();
+			int explicitPort = url.getPort();
+			if (explicitPort > 0 && explicitPort != url.getDefaultPort()) {
+				hostHeader = hostHeader + ":" + explicitPort;
+			}
+			request.setHeader("Host", hostHeader);
+
+			activeConn.sendRequestHeader(request);
+			activeConn.flush();
+
+			response = activeConn.receiveResponseHeader();
+
+			int code = response.getStatusLine().getStatusCode();
+
+			connectedUrl = url.toString();
+
+			Header location = null;
+			if (code > 300 && code < 400) {
+				location = response.getFirstHeader("Location");
+			}
+
+			if (location != null) {
+				// Resolve against the current URL so relative Location
+				// values work as well as absolute ones.
+				redirectUrl = new URL(url, location.getValue());
+			} else {
+				if (code != 200 && code != 206) {
+					throw new BadHttpStatusException(code, connectedUrl + " " + rangeHeader);
+				}
+
+				activeConn.receiveResponseEntity(response);
+
+				return response.getEntity().getContent();
+			}
+
+		} catch (HttpException e) {
+			doClose();
+			throw new IOException(e);
+
+		} catch (IOException io) {
+
+			if (saveErrHeader != null) {
+				errHeader = getHeaderValue(saveErrHeader);
+			}
+
+			connectedUrl = url.toString();
+
+			doClose();
+			throw io;
+		}
+
+		// Redirect: drop this hop's connection before opening the next one.
+		// Done outside the try block so that a failure on a later hop is
+		// reported by that hop only and its errHeader is not overwritten here.
+		doClose();
+
+		if (redirectsLeft <= 0) {
+			throw new IOException("Too many redirects, last target: " + redirectUrl);
+		}
+
+		return doSeekLoad(offset, maxLength, redirectUrl, redirectsLeft - 1);
+	}
+
+	/**
+	 * Loads the range at {@code offset} via doSeekLoad, then optionally
+	 * buffers the whole entity in memory and/or wraps the stream for gzip.
+	 *
+	 * @throws IOException if the reader is closed or the load fails
+	 */
+	@Override
+	public void seekWithMaxRead(long offset, boolean gzip, int maxLength) throws IOException
+	{
+		if (closed) {
+			throw new IOException("Seek after close()");
+		}
+
+		br = null;
+
+		try {
+			// Keep the returned stream: without this assignment the
+			// unbuffered path would read from a stale or null stream.
+			is = doSeekLoad(offset, maxLength);
+
+			if (bufferFully && (maxLength > 0)) {
+				byte[] buffer = EntityUtils.toByteArray(response.getEntity());
+
+				doClose();
+
+				is = new ByteArrayInputStream(buffer);
+			}
+
+			if (gzip) {
+				is = new GZIPMembersInputStream(is, blockSize);
+			}
+
+		} catch (IOException io) {
+			doClose();
+			throw io;
+		}
+	}
+
+	/** Closes the connection (or bare socket) and forgets the response. */
+	@Override
+	protected void doClose() throws IOException {
+		try {
+			if (activeConn != null) {
+				activeConn.close();
+			} else if (socket != null) {
+				socket.close();
+			}
+		} finally {
+			// Clear state even if close() throws so a later call does not
+			// try to close the same connection again.
+			activeConn = null;
+			socket = null;
+			response = null;
+		}
+	}
+
+	@Override
+	protected InputStream doSeekLoad(long offset, int maxLength)
+			throws IOException {
+
+		return doSeekLoad(offset, maxLength, new URL(urlString));
+	}
+}
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLRFactory.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLRFactory.java
new file mode 100644
index 00000000..5e3bb3ed
--- /dev/null
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLRFactory.java
@@ -0,0 +1,100 @@
+package org.archive.util.binsearch.impl.http;
+
+import java.io.IOException;
+
+import org.archive.util.binsearch.impl.HTTPSeekableLineReader;
+import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory;
+
+public class ApacheHttp43SLRFactory extends HTTPSeekableLineReaderFactory {
+	// Creates ApacheHttp43SLR readers; only the two timeouts below are configurable.
+	private int readTimeout = 0;
+	private int connectTimeout = 0;
+
+	public ApacheHttp43SLRFactory()
+	{
+		
+	}
+
+	@Override
+    public HTTPSeekableLineReader get(String url) throws IOException {
+		return new ApacheHttp43SLR(url, connectTimeout, readTimeout);
+    }
+
+	@Override
+    public void close() throws IOException {
+	    // Nothing to release: this factory holds only the two timeout values.
+    }
+
+	@Override
+    public void setProxyHostPort(String hostPort) {
+	    // TODO: not implemented; the value is ignored.
+	    
+    }
+
+	@Override
+    public void setMaxTotalConnections(int maxTotalConnections) {
+	    // TODO: not implemented; the value is ignored.
+	    
+    }
+
+	@Override
+    public int getMaxTotalConnections() {
+	    // TODO: not implemented; always reports 0.
+	    return 0;
+    }
+
+	@Override
+    public void setMaxHostConnections(int maxHostConnections) {
+	    // TODO: not implemented; the value is ignored.
+	    
+    }
+
+	@Override
+    public int getMaxHostConnections() {
+	    // TODO: not implemented; always reports 0.
+	    return 0;
+    }
+
+	@Override
+    public int getConnectionTimeoutMS() {
+		return connectTimeout;
+    }
+
+	@Override
+    public void setConnectionTimeoutMS(int connectionTimeoutMS) {
+		connectTimeout  = connectionTimeoutMS;
+	    
+    }
+
+	@Override
+    public int getSocketTimeoutMS() {
+		return readTimeout;
+    }
+
+	@Override
+    public void setSocketTimeoutMS(int socketTimeoutMS) {
+		readTimeout = socketTimeoutMS;
+    }
+
+	@Override
+    public void setStaleChecking(boolean enabled) {
+	    // TODO: not implemented; the flag is ignored.
+    }
+
+	@Override
+    public boolean isStaleChecking() {
+	    // TODO: not implemented; always reports false.
+	    return false;
+    }
+
+	@Override
+    public long getModTime() {
+	    // TODO: not implemented; always reports 0.
+	    return 0;
+    }
+
+	@Override
+    public void setNumRetries(int numRetries) {
+	    // TODO: not implemented; the value is ignored.
+    }
+}
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java b/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java
index f21437f7..6d618e43 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java
@@ -76,13 +76,12 @@ protected InputStream doSeekLoad(long offset, int maxLength)
 		httpUrlConn.connect();
 		
 		int code = httpUrlConn.getResponseCode();
+		connectedUrl =  httpUrlConn.getURL().toString();
 		
 		if ((code != 206) && (code != 200)) {
-			throw new BadHttpStatusException(code, url + " " + rangeHeader);
+			throw new BadHttpStatusException(code, connectedUrl + " " + rangeHeader);
 		}
 		
-		connectedUrl =  httpUrlConn.getURL().toString();
-		
 		InputStream is = httpUrlConn.getInputStream();
 		cin = new CountingInputStream(is);
 		return cin;
diff --git a/src/main/java/org/archive/util/io/RuntimeIOException.java b/src/main/java/org/archive/util/io/RuntimeIOException.java
index b6efbf74..1d74f79c 100644
--- a/src/main/java/org/archive/util/io/RuntimeIOException.java
+++ b/src/main/java/org/archive/util/io/RuntimeIOException.java
@@ -3,13 +3,36 @@
 public class RuntimeIOException extends RuntimeException {
     private static final long serialVersionUID = 4762025404760379497L;
     
+    private int status = 503;
+    
     public RuntimeIOException()
     {
     	
     }
     
+    public RuntimeIOException(String message)
+    {
+    	super(message);
+    }
+    /** Creates an exception carrying the given status instead of the default 503; no message or cause is set. */
+    public RuntimeIOException(int status)
+    {
+    	this.status = status;
+    }
+    
     public RuntimeIOException(Throwable cause)
     {
     	super(cause);
-    }    
+    }
+    /** Creates an exception that wraps cause and carries the given status instead of the default 503. */
+    public RuntimeIOException(int status, Throwable cause)
+    {
+    	super(cause);
+    	this.status = status;
+    } 
+    /** Returns the status supplied at construction, or 503 when the constructor used did not take one. */
+    public int getStatus()
+    {
+    	return status;
+    }
 }
diff --git a/src/main/java/org/archive/util/iterator/CloseableCompositeIterator.java b/src/main/java/org/archive/util/iterator/CloseableCompositeIterator.java
new file mode 100644
index 00000000..b9f632e2
--- /dev/null
+++ b/src/main/java/org/archive/util/iterator/CloseableCompositeIterator.java
@@ -0,0 +1,112 @@
+package org.archive.util.iterator;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.NoSuchElementException;
+
+/**
+ * Chains several CloseableIterators and presents them as one, visiting them
+ * in list order. Add all iterators before iterating: the list is walked with
+ * a list iterator obtained on the first hasNext() call.
+ *
+ * @param <E> element type
+ */
+public class CloseableCompositeIterator<E> implements CloseableIterator<E> {
+
+	protected LinkedList<CloseableIterator<E>> iters;
+	protected Iterator<CloseableIterator<E>> iterPtr;
+	protected CloseableIterator<E> currIter;
+
+	public CloseableCompositeIterator()
+	{
+		iters = new LinkedList<CloseableIterator<E>>();
+	}
+
+	public void addFirst(CloseableIterator<E> e)
+	{
+		iters.addFirst(e);
+	}
+
+	public void addLast(CloseableIterator<E> e)
+	{
+		iters.addLast(e);
+	}
+
+	/**
+	 * Advances past exhausted iterators and reports whether an element is
+	 * available. Returns false for an empty composite. A null entry in the
+	 * list ends the iteration.
+	 */
+	@Override
+	public boolean hasNext() {
+
+		if (iterPtr == null) {
+			iterPtr = iters.iterator();
+			// Guard the first step too: an empty list has nothing to fetch.
+			currIter = (iterPtr.hasNext() ? iterPtr.next() : null);
+		}
+
+		while (currIter != null) {
+			if (currIter.hasNext()) {
+				return true;
+			}
+
+			currIter = (iterPtr.hasNext() ? iterPtr.next() : null);
+		}
+
+		return false;
+	}
+
+	/**
+	 * Returns the next element, moving on to the next underlying iterator
+	 * when the current one is exhausted.
+	 *
+	 * @throws NoSuchElementException if no elements remain
+	 */
+	@Override
+	public E next() {
+		if (!hasNext()) {
+			throw new NoSuchElementException();
+		}
+		return currIter.next();
+	}
+
+	/**
+	 * Delegates to the current underlying iterator.
+	 *
+	 * @throws IllegalStateException if there is no current iterator
+	 */
+	@Override
+	public void remove() {
+		if (currIter == null) {
+			throw new IllegalStateException("No current iterator to remove from");
+		}
+		currIter.remove();
+	}
+
+	/**
+	 * Closes every underlying iterator, even if some fail, then rethrows the
+	 * first failure so it is not silently lost.
+	 */
+	@Override
+	public void close() throws IOException {
+		IOException first = null;
+
+		for (CloseableIterator<E> e : iters) {
+			if (e != null) {
+				try {
+					e.close();
+				} catch (IOException io) {
+					if (first == null) {
+						first = io;
+					}
+				}
+			}
+		}
+
+		if (first != null) {
+			throw first;
+		}
+	}
+}
diff --git a/src/main/java/org/archive/util/iterator/CloseableIteratorWrapper.java b/src/main/java/org/archive/util/iterator/CloseableIteratorWrapper.java
new file mode 100644
index 00000000..f35c85e5
--- /dev/null
+++ b/src/main/java/org/archive/util/iterator/CloseableIteratorWrapper.java
@@ -0,0 +1,42 @@
+package org.archive.util.iterator;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+/**
+ * Wrap a regular Iterator to create a CloseableIterator where the close() is a no-op
+ * @author ilya
+ *
+ * @param <S> element type of the wrapped iterator
+ */
+
+public class CloseableIteratorWrapper<S> implements CloseableIterator<S>
+{
+	protected Iterator<S> iter;
+	
+	public CloseableIteratorWrapper(Iterator<S> iter)
+	{
+		this.iter = iter;
+	}
+	/** Delegates to the wrapped iterator. */
+	@Override
+    public boolean hasNext() {
+		return this.iter.hasNext();
+    }
+	/** Delegates to the wrapped iterator. */
+	@Override
+    public S next() {
+		return this.iter.next();
+    }
+	/** Delegates to the wrapped iterator, which may not support removal. */
+	@Override
+    public void remove() {
+		this.iter.remove();
+        
+    }
+	/** Does nothing: a plain Iterator has no resources to release. */
+	@Override
+    public void close() throws IOException {
+        //No Op
+    }		
+}
\ No newline at end of file
diff --git a/src/main/resources/effective_tld_names.dat b/src/main/resources/effective_tld_names.dat
index 2c201312..7c4a0860 100644
--- a/src/main/resources/effective_tld_names.dat
+++ b/src/main/resources/effective_tld_names.dat
@@ -1,44 +1,6 @@
-// ***** BEGIN LICENSE BLOCK *****
-// Version: MPL 1.1/GPL 2.0/LGPL 2.1
-// 
-// The contents of this file are subject to the Mozilla Public License Version 
-// 1.1 (the "License"); you may not use this file except in compliance with 
-// the License. You may obtain a copy of the License at 
-// http://www.mozilla.org/MPL/
-// 
-// Software distributed under the License is distributed on an "AS IS" basis,
-// WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
-// for the specific language governing rights and limitations under the
-// License.
-// 
-// The Original Code is the Public Suffix List.
-// 
-// The Initial Developer of the Original Code is
-// Jo Hermans .
-// Portions created by the Initial Developer are Copyright (C) 2007
-// the Initial Developer. All Rights Reserved.
-// 
-// Contributor(s):
-//   Ruben Arakelyan 
-//   Gervase Markham 
-//   Pamela Greene 
-//   David Triendl 
-//   Jothan Frakes 
-//   The kind representatives of many TLD registries
-// 
-// Alternatively, the contents of this file may be used under the terms of
-// either the GNU General Public License Version 2 or later (the "GPL"), or
-// the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
-// in which case the provisions of the GPL or the LGPL are applicable instead
-// of those above. If you wish to allow use of your version of this file only
-// under the terms of either the GPL or the LGPL, and not to allow others to
-// use your version of this file under the terms of the MPL, indicate your
-// decision by deleting the provisions above and replace them with the notice
-// and other provisions required by the GPL or the LGPL. If you do not delete
-// the provisions above, a recipient may use your version of this file under
-// the terms of any one of the MPL, the GPL or the LGPL.
-// 
-// ***** END LICENSE BLOCK *****
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 // ===BEGIN ICANN DOMAINS===
 
@@ -257,9 +219,9 @@ net.au
 org.au
 edu.au
 gov.au
-csiro.au
 asn.au
 id.au
+csiro.au
 // Historic 2LDs (closed to new registration, but sites still exist)
 info.au
 conf.au
@@ -453,13 +415,13 @@ b.br
 bio.br
 blog.br
 bmd.br
-can.br
 cim.br
 cng.br
 cnt.br
 com.br
 coop.br
 ecn.br
+eco.br
 edu.br
 emp.br
 eng.br
@@ -480,6 +442,7 @@ ind.br
 inf.br
 jor.br
 jus.br
+leg.br
 lel.br
 mat.br
 med.br
@@ -729,6 +692,14 @@ inf.cu
 // cv : http://en.wikipedia.org/wiki/.cv
 cv
 
+// cw : http://www.una.cw/cw_registry/
+// Confirmed by registry  2013-03-26
+cw
+com.cw
+edu.cw
+net.cw
+org.cw
+
 // cx : http://en.wikipedia.org/wiki/.cx
 // list of other 2nd level tlds ?
 cx
@@ -987,9 +958,15 @@ gov.gr
 // gs : http://en.wikipedia.org/wiki/.gs
 gs
 
-// gt : http://www.gt/politicas.html
-*.gt
-!www.gt
+// gt : http://www.gt/politicas_de_registro.html
+gt
+com.gt
+edu.gt
+gob.gt
+ind.gt
+mil.gt
+net.gt
+org.gt
 
 // gu : http://gadao.gov.gu/registration.txt
 *.gu
@@ -1103,13 +1080,14 @@ tozsde.hu
 utazas.hu
 video.hu
 
-// id : http://en.wikipedia.org/wiki/.id
-// see also: https://register.pandi.or.id/
+// id : https://register.pandi.or.id/
 id
 ac.id
+biz.id
 co.id
 go.id
 mil.id
+my.id
 net.id
 or.id
 sch.id
@@ -1511,10 +1489,9 @@ jobs
 
 // jp : http://en.wikipedia.org/wiki/.jp
 // http://jprs.co.jp/en/jpdomain.html
-// Submitted by registry  2008-06-11
-// Updated by registry  2008-12-04
+// Updated by registry  2012-05-28
 jp
-// jp organizational type names 
+// jp organizational type names
 ac.jp
 ad.jp
 co.jp
@@ -1524,125 +1501,1750 @@ gr.jp
 lg.jp
 ne.jp
 or.jp
+// jp prefecture type names
+aichi.jp
+akita.jp
+aomori.jp
+chiba.jp
+ehime.jp
+fukui.jp
+fukuoka.jp
+fukushima.jp
+gifu.jp
+gunma.jp
+hiroshima.jp
+hokkaido.jp
+hyogo.jp
+ibaraki.jp
+ishikawa.jp
+iwate.jp
+kagawa.jp
+kagoshima.jp
+kanagawa.jp
+kochi.jp
+kumamoto.jp
+kyoto.jp
+mie.jp
+miyagi.jp
+miyazaki.jp
+nagano.jp
+nagasaki.jp
+nara.jp
+niigata.jp
+oita.jp
+okayama.jp
+okinawa.jp
+osaka.jp
+saga.jp
+saitama.jp
+shiga.jp
+shimane.jp
+shizuoka.jp
+tochigi.jp
+tokushima.jp
+tokyo.jp
+tottori.jp
+toyama.jp
+wakayama.jp
+yamagata.jp
+yamaguchi.jp
+yamanashi.jp
 // jp geographic type names
 // http://jprs.jp/doc/rule/saisoku-1.html
-*.aichi.jp
-*.akita.jp
-*.aomori.jp
-*.chiba.jp
-*.ehime.jp
-*.fukui.jp
-*.fukuoka.jp
-*.fukushima.jp
-*.gifu.jp
-*.gunma.jp
-*.hiroshima.jp
-*.hokkaido.jp
-*.hyogo.jp
-*.ibaraki.jp
-*.ishikawa.jp
-*.iwate.jp
-*.kagawa.jp
-*.kagoshima.jp
-*.kanagawa.jp
 *.kawasaki.jp
 *.kitakyushu.jp
 *.kobe.jp
-*.kochi.jp
-*.kumamoto.jp
-*.kyoto.jp
-*.mie.jp
-*.miyagi.jp
-*.miyazaki.jp
-*.nagano.jp
-*.nagasaki.jp
 *.nagoya.jp
-*.nara.jp
-*.niigata.jp
-*.oita.jp
-*.okayama.jp
-*.okinawa.jp
-*.osaka.jp
-*.saga.jp
-*.saitama.jp
 *.sapporo.jp
 *.sendai.jp
-*.shiga.jp
-*.shimane.jp
-*.shizuoka.jp
-*.tochigi.jp
-*.tokushima.jp
-*.tokyo.jp
-*.tottori.jp
-*.toyama.jp
-*.wakayama.jp
-*.yamagata.jp
-*.yamaguchi.jp
-*.yamanashi.jp
 *.yokohama.jp
-!metro.tokyo.jp
-!pref.aichi.jp
-!pref.akita.jp
-!pref.aomori.jp
-!pref.chiba.jp
-!pref.ehime.jp
-!pref.fukui.jp
-!pref.fukuoka.jp
-!pref.fukushima.jp
-!pref.gifu.jp
-!pref.gunma.jp
-!pref.hiroshima.jp
-!pref.hokkaido.jp
-!pref.hyogo.jp
-!pref.ibaraki.jp
-!pref.ishikawa.jp
-!pref.iwate.jp
-!pref.kagawa.jp
-!pref.kagoshima.jp
-!pref.kanagawa.jp
-!pref.kochi.jp
-!pref.kumamoto.jp
-!pref.kyoto.jp
-!pref.mie.jp
-!pref.miyagi.jp
-!pref.miyazaki.jp
-!pref.nagano.jp
-!pref.nagasaki.jp
-!pref.nara.jp
-!pref.niigata.jp
-!pref.oita.jp
-!pref.okayama.jp
-!pref.okinawa.jp
-!pref.osaka.jp
-!pref.saga.jp
-!pref.saitama.jp
-!pref.shiga.jp
-!pref.shimane.jp
-!pref.shizuoka.jp
-!pref.tochigi.jp
-!pref.tokushima.jp
-!pref.tottori.jp
-!pref.toyama.jp
-!pref.wakayama.jp
-!pref.yamagata.jp
-!pref.yamaguchi.jp
-!pref.yamanashi.jp
-!city.chiba.jp
-!city.fukuoka.jp
-!city.hiroshima.jp
 !city.kawasaki.jp
 !city.kitakyushu.jp
 !city.kobe.jp
-!city.kyoto.jp
 !city.nagoya.jp
-!city.niigata.jp
-!city.okayama.jp
-!city.osaka.jp
-!city.saitama.jp
 !city.sapporo.jp
 !city.sendai.jp
-!city.shizuoka.jp
 !city.yokohama.jp
+// 4th level registration
+aisai.aichi.jp
+ama.aichi.jp
+anjo.aichi.jp
+asuke.aichi.jp
+chiryu.aichi.jp
+chita.aichi.jp
+fuso.aichi.jp
+gamagori.aichi.jp
+handa.aichi.jp
+hazu.aichi.jp
+hekinan.aichi.jp
+higashiura.aichi.jp
+ichinomiya.aichi.jp
+inazawa.aichi.jp
+inuyama.aichi.jp
+isshiki.aichi.jp
+iwakura.aichi.jp
+kanie.aichi.jp
+kariya.aichi.jp
+kasugai.aichi.jp
+kira.aichi.jp
+kiyosu.aichi.jp
+komaki.aichi.jp
+konan.aichi.jp
+kota.aichi.jp
+mihama.aichi.jp
+miyoshi.aichi.jp
+nagakute.aichi.jp
+nishio.aichi.jp
+nisshin.aichi.jp
+obu.aichi.jp
+oguchi.aichi.jp
+oharu.aichi.jp
+okazaki.aichi.jp
+owariasahi.aichi.jp
+seto.aichi.jp
+shikatsu.aichi.jp
+shinshiro.aichi.jp
+shitara.aichi.jp
+tahara.aichi.jp
+takahama.aichi.jp
+tobishima.aichi.jp
+toei.aichi.jp
+togo.aichi.jp
+tokai.aichi.jp
+tokoname.aichi.jp
+toyoake.aichi.jp
+toyohashi.aichi.jp
+toyokawa.aichi.jp
+toyone.aichi.jp
+toyota.aichi.jp
+tsushima.aichi.jp
+yatomi.aichi.jp
+akita.akita.jp
+daisen.akita.jp
+fujisato.akita.jp
+gojome.akita.jp
+hachirogata.akita.jp
+happou.akita.jp
+higashinaruse.akita.jp
+honjo.akita.jp
+honjyo.akita.jp
+ikawa.akita.jp
+kamikoani.akita.jp
+kamioka.akita.jp
+katagami.akita.jp
+kazuno.akita.jp
+kitaakita.akita.jp
+kosaka.akita.jp
+kyowa.akita.jp
+misato.akita.jp
+mitane.akita.jp
+moriyoshi.akita.jp
+nikaho.akita.jp
+noshiro.akita.jp
+odate.akita.jp
+oga.akita.jp
+ogata.akita.jp
+semboku.akita.jp
+yokote.akita.jp
+yurihonjo.akita.jp
+aomori.aomori.jp
+gonohe.aomori.jp
+hachinohe.aomori.jp
+hashikami.aomori.jp
+hiranai.aomori.jp
+hirosaki.aomori.jp
+itayanagi.aomori.jp
+kuroishi.aomori.jp
+misawa.aomori.jp
+mutsu.aomori.jp
+nakadomari.aomori.jp
+noheji.aomori.jp
+oirase.aomori.jp
+owani.aomori.jp
+rokunohe.aomori.jp
+sannohe.aomori.jp
+shichinohe.aomori.jp
+shingo.aomori.jp
+takko.aomori.jp
+towada.aomori.jp
+tsugaru.aomori.jp
+tsuruta.aomori.jp
+abiko.chiba.jp
+asahi.chiba.jp
+chonan.chiba.jp
+chosei.chiba.jp
+choshi.chiba.jp
+chuo.chiba.jp
+funabashi.chiba.jp
+futtsu.chiba.jp
+hanamigawa.chiba.jp
+ichihara.chiba.jp
+ichikawa.chiba.jp
+ichinomiya.chiba.jp
+inzai.chiba.jp
+isumi.chiba.jp
+kamagaya.chiba.jp
+kamogawa.chiba.jp
+kashiwa.chiba.jp
+katori.chiba.jp
+katsuura.chiba.jp
+kimitsu.chiba.jp
+kisarazu.chiba.jp
+kozaki.chiba.jp
+kujukuri.chiba.jp
+kyonan.chiba.jp
+matsudo.chiba.jp
+midori.chiba.jp
+mihama.chiba.jp
+minamiboso.chiba.jp
+mobara.chiba.jp
+mutsuzawa.chiba.jp
+nagara.chiba.jp
+nagareyama.chiba.jp
+narashino.chiba.jp
+narita.chiba.jp
+noda.chiba.jp
+oamishirasato.chiba.jp
+omigawa.chiba.jp
+onjuku.chiba.jp
+otaki.chiba.jp
+sakae.chiba.jp
+sakura.chiba.jp
+shimofusa.chiba.jp
+shirako.chiba.jp
+shiroi.chiba.jp
+shisui.chiba.jp
+sodegaura.chiba.jp
+sosa.chiba.jp
+tako.chiba.jp
+tateyama.chiba.jp
+togane.chiba.jp
+tohnosho.chiba.jp
+tomisato.chiba.jp
+urayasu.chiba.jp
+yachimata.chiba.jp
+yachiyo.chiba.jp
+yokaichiba.chiba.jp
+yokoshibahikari.chiba.jp
+yotsukaido.chiba.jp
+ainan.ehime.jp
+honai.ehime.jp
+ikata.ehime.jp
+imabari.ehime.jp
+iyo.ehime.jp
+kamijima.ehime.jp
+kihoku.ehime.jp
+kumakogen.ehime.jp
+masaki.ehime.jp
+matsuno.ehime.jp
+matsuyama.ehime.jp
+namikata.ehime.jp
+niihama.ehime.jp
+ozu.ehime.jp
+saijo.ehime.jp
+seiyo.ehime.jp
+shikokuchuo.ehime.jp
+tobe.ehime.jp
+toon.ehime.jp
+uchiko.ehime.jp
+uwajima.ehime.jp
+yawatahama.ehime.jp
+echizen.fukui.jp
+eiheiji.fukui.jp
+fukui.fukui.jp
+ikeda.fukui.jp
+katsuyama.fukui.jp
+mihama.fukui.jp
+minamiechizen.fukui.jp
+obama.fukui.jp
+ohi.fukui.jp
+ono.fukui.jp
+sabae.fukui.jp
+sakai.fukui.jp
+takahama.fukui.jp
+tsuruga.fukui.jp
+wakasa.fukui.jp
+ashiya.fukuoka.jp
+buzen.fukuoka.jp
+chikugo.fukuoka.jp
+chikuho.fukuoka.jp
+chikujo.fukuoka.jp
+chikushino.fukuoka.jp
+chikuzen.fukuoka.jp
+chuo.fukuoka.jp
+dazaifu.fukuoka.jp
+fukuchi.fukuoka.jp
+hakata.fukuoka.jp
+higashi.fukuoka.jp
+hirokawa.fukuoka.jp
+hisayama.fukuoka.jp
+iizuka.fukuoka.jp
+inatsuki.fukuoka.jp
+kaho.fukuoka.jp
+kasuga.fukuoka.jp
+kasuya.fukuoka.jp
+kawara.fukuoka.jp
+keisen.fukuoka.jp
+koga.fukuoka.jp
+kurate.fukuoka.jp
+kurogi.fukuoka.jp
+kurume.fukuoka.jp
+minami.fukuoka.jp
+miyako.fukuoka.jp
+miyama.fukuoka.jp
+miyawaka.fukuoka.jp
+mizumaki.fukuoka.jp
+munakata.fukuoka.jp
+nakagawa.fukuoka.jp
+nakama.fukuoka.jp
+nishi.fukuoka.jp
+nogata.fukuoka.jp
+ogori.fukuoka.jp
+okagaki.fukuoka.jp
+okawa.fukuoka.jp
+oki.fukuoka.jp
+omuta.fukuoka.jp
+onga.fukuoka.jp
+onojo.fukuoka.jp
+oto.fukuoka.jp
+saigawa.fukuoka.jp
+sasaguri.fukuoka.jp
+shingu.fukuoka.jp
+shinyoshitomi.fukuoka.jp
+shonai.fukuoka.jp
+soeda.fukuoka.jp
+sue.fukuoka.jp
+tachiarai.fukuoka.jp
+tagawa.fukuoka.jp
+takata.fukuoka.jp
+toho.fukuoka.jp
+toyotsu.fukuoka.jp
+tsuiki.fukuoka.jp
+ukiha.fukuoka.jp
+umi.fukuoka.jp
+usui.fukuoka.jp
+yamada.fukuoka.jp
+yame.fukuoka.jp
+yanagawa.fukuoka.jp
+yukuhashi.fukuoka.jp
+aizubange.fukushima.jp
+aizumisato.fukushima.jp
+aizuwakamatsu.fukushima.jp
+asakawa.fukushima.jp
+bandai.fukushima.jp
+date.fukushima.jp
+fukushima.fukushima.jp
+furudono.fukushima.jp
+futaba.fukushima.jp
+hanawa.fukushima.jp
+higashi.fukushima.jp
+hirata.fukushima.jp
+hirono.fukushima.jp
+iitate.fukushima.jp
+inawashiro.fukushima.jp
+ishikawa.fukushima.jp
+iwaki.fukushima.jp
+izumizaki.fukushima.jp
+kagamiishi.fukushima.jp
+kaneyama.fukushima.jp
+kawamata.fukushima.jp
+kitakata.fukushima.jp
+kitashiobara.fukushima.jp
+koori.fukushima.jp
+koriyama.fukushima.jp
+kunimi.fukushima.jp
+miharu.fukushima.jp
+mishima.fukushima.jp
+namie.fukushima.jp
+nango.fukushima.jp
+nishiaizu.fukushima.jp
+nishigo.fukushima.jp
+okuma.fukushima.jp
+omotego.fukushima.jp
+ono.fukushima.jp
+otama.fukushima.jp
+samegawa.fukushima.jp
+shimogo.fukushima.jp
+shirakawa.fukushima.jp
+showa.fukushima.jp
+soma.fukushima.jp
+sukagawa.fukushima.jp
+taishin.fukushima.jp
+tamakawa.fukushima.jp
+tanagura.fukushima.jp
+tenei.fukushima.jp
+yabuki.fukushima.jp
+yamato.fukushima.jp
+yamatsuri.fukushima.jp
+yanaizu.fukushima.jp
+yugawa.fukushima.jp
+anpachi.gifu.jp
+ena.gifu.jp
+gifu.gifu.jp
+ginan.gifu.jp
+godo.gifu.jp
+gujo.gifu.jp
+hashima.gifu.jp
+hichiso.gifu.jp
+hida.gifu.jp
+higashishirakawa.gifu.jp
+ibigawa.gifu.jp
+ikeda.gifu.jp
+kakamigahara.gifu.jp
+kani.gifu.jp
+kasahara.gifu.jp
+kasamatsu.gifu.jp
+kawaue.gifu.jp
+kitagata.gifu.jp
+mino.gifu.jp
+minokamo.gifu.jp
+mitake.gifu.jp
+mizunami.gifu.jp
+motosu.gifu.jp
+nakatsugawa.gifu.jp
+ogaki.gifu.jp
+sakahogi.gifu.jp
+seki.gifu.jp
+sekigahara.gifu.jp
+shirakawa.gifu.jp
+tajimi.gifu.jp
+takayama.gifu.jp
+tarui.gifu.jp
+toki.gifu.jp
+tomika.gifu.jp
+wanouchi.gifu.jp
+yamagata.gifu.jp
+yaotsu.gifu.jp
+yoro.gifu.jp
+annaka.gunma.jp
+chiyoda.gunma.jp
+fujioka.gunma.jp
+higashiagatsuma.gunma.jp
+isesaki.gunma.jp
+itakura.gunma.jp
+kanna.gunma.jp
+kanra.gunma.jp
+katashina.gunma.jp
+kawaba.gunma.jp
+kiryu.gunma.jp
+kusatsu.gunma.jp
+maebashi.gunma.jp
+meiwa.gunma.jp
+midori.gunma.jp
+minakami.gunma.jp
+naganohara.gunma.jp
+nakanojo.gunma.jp
+nanmoku.gunma.jp
+numata.gunma.jp
+oizumi.gunma.jp
+ora.gunma.jp
+ota.gunma.jp
+shibukawa.gunma.jp
+shimonita.gunma.jp
+shinto.gunma.jp
+showa.gunma.jp
+takasaki.gunma.jp
+takayama.gunma.jp
+tamamura.gunma.jp
+tatebayashi.gunma.jp
+tomioka.gunma.jp
+tsukiyono.gunma.jp
+tsumagoi.gunma.jp
+ueno.gunma.jp
+yoshioka.gunma.jp
+asaminami.hiroshima.jp
+daiwa.hiroshima.jp
+etajima.hiroshima.jp
+fuchu.hiroshima.jp
+fukuyama.hiroshima.jp
+hatsukaichi.hiroshima.jp
+higashihiroshima.hiroshima.jp
+hongo.hiroshima.jp
+jinsekikogen.hiroshima.jp
+kaita.hiroshima.jp
+kui.hiroshima.jp
+kumano.hiroshima.jp
+kure.hiroshima.jp
+mihara.hiroshima.jp
+miyoshi.hiroshima.jp
+naka.hiroshima.jp
+onomichi.hiroshima.jp
+osakikamijima.hiroshima.jp
+otake.hiroshima.jp
+saka.hiroshima.jp
+sera.hiroshima.jp
+seranishi.hiroshima.jp
+shinichi.hiroshima.jp
+shobara.hiroshima.jp
+takehara.hiroshima.jp
+abashiri.hokkaido.jp
+abira.hokkaido.jp
+aibetsu.hokkaido.jp
+akabira.hokkaido.jp
+akkeshi.hokkaido.jp
+asahikawa.hokkaido.jp
+ashibetsu.hokkaido.jp
+ashoro.hokkaido.jp
+assabu.hokkaido.jp
+atsuma.hokkaido.jp
+bibai.hokkaido.jp
+biei.hokkaido.jp
+bifuka.hokkaido.jp
+bihoro.hokkaido.jp
+biratori.hokkaido.jp
+chippubetsu.hokkaido.jp
+chitose.hokkaido.jp
+date.hokkaido.jp
+ebetsu.hokkaido.jp
+embetsu.hokkaido.jp
+eniwa.hokkaido.jp
+erimo.hokkaido.jp
+esan.hokkaido.jp
+esashi.hokkaido.jp
+fukagawa.hokkaido.jp
+fukushima.hokkaido.jp
+furano.hokkaido.jp
+furubira.hokkaido.jp
+haboro.hokkaido.jp
+hakodate.hokkaido.jp
+hamatonbetsu.hokkaido.jp
+hidaka.hokkaido.jp
+higashikagura.hokkaido.jp
+higashikawa.hokkaido.jp
+hiroo.hokkaido.jp
+hokuryu.hokkaido.jp
+hokuto.hokkaido.jp
+honbetsu.hokkaido.jp
+horokanai.hokkaido.jp
+horonobe.hokkaido.jp
+ikeda.hokkaido.jp
+imakane.hokkaido.jp
+ishikari.hokkaido.jp
+iwamizawa.hokkaido.jp
+iwanai.hokkaido.jp
+kamifurano.hokkaido.jp
+kamikawa.hokkaido.jp
+kamishihoro.hokkaido.jp
+kamisunagawa.hokkaido.jp
+kamoenai.hokkaido.jp
+kayabe.hokkaido.jp
+kembuchi.hokkaido.jp
+kikonai.hokkaido.jp
+kimobetsu.hokkaido.jp
+kitahiroshima.hokkaido.jp
+kitami.hokkaido.jp
+kiyosato.hokkaido.jp
+koshimizu.hokkaido.jp
+kunneppu.hokkaido.jp
+kuriyama.hokkaido.jp
+kuromatsunai.hokkaido.jp
+kushiro.hokkaido.jp
+kutchan.hokkaido.jp
+kyowa.hokkaido.jp
+mashike.hokkaido.jp
+matsumae.hokkaido.jp
+mikasa.hokkaido.jp
+minamifurano.hokkaido.jp
+mombetsu.hokkaido.jp
+moseushi.hokkaido.jp
+mukawa.hokkaido.jp
+muroran.hokkaido.jp
+naie.hokkaido.jp
+nakagawa.hokkaido.jp
+nakasatsunai.hokkaido.jp
+nakatombetsu.hokkaido.jp
+nanae.hokkaido.jp
+nanporo.hokkaido.jp
+nayoro.hokkaido.jp
+nemuro.hokkaido.jp
+niikappu.hokkaido.jp
+niki.hokkaido.jp
+nishiokoppe.hokkaido.jp
+noboribetsu.hokkaido.jp
+numata.hokkaido.jp
+obihiro.hokkaido.jp
+obira.hokkaido.jp
+oketo.hokkaido.jp
+okoppe.hokkaido.jp
+otaru.hokkaido.jp
+otobe.hokkaido.jp
+otofuke.hokkaido.jp
+otoineppu.hokkaido.jp
+oumu.hokkaido.jp
+ozora.hokkaido.jp
+pippu.hokkaido.jp
+rankoshi.hokkaido.jp
+rebun.hokkaido.jp
+rikubetsu.hokkaido.jp
+rishiri.hokkaido.jp
+rishirifuji.hokkaido.jp
+saroma.hokkaido.jp
+sarufutsu.hokkaido.jp
+shakotan.hokkaido.jp
+shari.hokkaido.jp
+shibecha.hokkaido.jp
+shibetsu.hokkaido.jp
+shikabe.hokkaido.jp
+shikaoi.hokkaido.jp
+shimamaki.hokkaido.jp
+shimizu.hokkaido.jp
+shimokawa.hokkaido.jp
+shinshinotsu.hokkaido.jp
+shintoku.hokkaido.jp
+shiranuka.hokkaido.jp
+shiraoi.hokkaido.jp
+shiriuchi.hokkaido.jp
+sobetsu.hokkaido.jp
+sunagawa.hokkaido.jp
+taiki.hokkaido.jp
+takasu.hokkaido.jp
+takikawa.hokkaido.jp
+takinoue.hokkaido.jp
+teshikaga.hokkaido.jp
+tobetsu.hokkaido.jp
+tohma.hokkaido.jp
+tomakomai.hokkaido.jp
+tomari.hokkaido.jp
+toya.hokkaido.jp
+toyako.hokkaido.jp
+toyotomi.hokkaido.jp
+toyoura.hokkaido.jp
+tsubetsu.hokkaido.jp
+tsukigata.hokkaido.jp
+urakawa.hokkaido.jp
+urausu.hokkaido.jp
+uryu.hokkaido.jp
+utashinai.hokkaido.jp
+wakkanai.hokkaido.jp
+wassamu.hokkaido.jp
+yakumo.hokkaido.jp
+yoichi.hokkaido.jp
+aioi.hyogo.jp
+akashi.hyogo.jp
+ako.hyogo.jp
+amagasaki.hyogo.jp
+aogaki.hyogo.jp
+asago.hyogo.jp
+ashiya.hyogo.jp
+awaji.hyogo.jp
+fukusaki.hyogo.jp
+goshiki.hyogo.jp
+harima.hyogo.jp
+himeji.hyogo.jp
+ichikawa.hyogo.jp
+inagawa.hyogo.jp
+itami.hyogo.jp
+kakogawa.hyogo.jp
+kamigori.hyogo.jp
+kamikawa.hyogo.jp
+kasai.hyogo.jp
+kasuga.hyogo.jp
+kawanishi.hyogo.jp
+miki.hyogo.jp
+minamiawaji.hyogo.jp
+nishinomiya.hyogo.jp
+nishiwaki.hyogo.jp
+ono.hyogo.jp
+sanda.hyogo.jp
+sannan.hyogo.jp
+sasayama.hyogo.jp
+sayo.hyogo.jp
+shingu.hyogo.jp
+shinonsen.hyogo.jp
+shiso.hyogo.jp
+sumoto.hyogo.jp
+taishi.hyogo.jp
+taka.hyogo.jp
+takarazuka.hyogo.jp
+takasago.hyogo.jp
+takino.hyogo.jp
+tamba.hyogo.jp
+tatsuno.hyogo.jp
+toyooka.hyogo.jp
+yabu.hyogo.jp
+yashiro.hyogo.jp
+yoka.hyogo.jp
+yokawa.hyogo.jp
+ami.ibaraki.jp
+asahi.ibaraki.jp
+bando.ibaraki.jp
+chikusei.ibaraki.jp
+daigo.ibaraki.jp
+fujishiro.ibaraki.jp
+hitachi.ibaraki.jp
+hitachinaka.ibaraki.jp
+hitachiomiya.ibaraki.jp
+hitachiota.ibaraki.jp
+ibaraki.ibaraki.jp
+ina.ibaraki.jp
+inashiki.ibaraki.jp
+itako.ibaraki.jp
+iwama.ibaraki.jp
+joso.ibaraki.jp
+kamisu.ibaraki.jp
+kasama.ibaraki.jp
+kashima.ibaraki.jp
+kasumigaura.ibaraki.jp
+koga.ibaraki.jp
+miho.ibaraki.jp
+mito.ibaraki.jp
+moriya.ibaraki.jp
+naka.ibaraki.jp
+namegata.ibaraki.jp
+oarai.ibaraki.jp
+ogawa.ibaraki.jp
+omitama.ibaraki.jp
+ryugasaki.ibaraki.jp
+sakai.ibaraki.jp
+sakuragawa.ibaraki.jp
+shimodate.ibaraki.jp
+shimotsuma.ibaraki.jp
+shirosato.ibaraki.jp
+sowa.ibaraki.jp
+suifu.ibaraki.jp
+takahagi.ibaraki.jp
+tamatsukuri.ibaraki.jp
+tokai.ibaraki.jp
+tomobe.ibaraki.jp
+tone.ibaraki.jp
+toride.ibaraki.jp
+tsuchiura.ibaraki.jp
+tsukuba.ibaraki.jp
+uchihara.ibaraki.jp
+ushiku.ibaraki.jp
+yachiyo.ibaraki.jp
+yamagata.ibaraki.jp
+yawara.ibaraki.jp
+yuki.ibaraki.jp
+anamizu.ishikawa.jp
+hakui.ishikawa.jp
+hakusan.ishikawa.jp
+kaga.ishikawa.jp
+kahoku.ishikawa.jp
+kanazawa.ishikawa.jp
+kawakita.ishikawa.jp
+komatsu.ishikawa.jp
+nakanoto.ishikawa.jp
+nanao.ishikawa.jp
+nomi.ishikawa.jp
+nonoichi.ishikawa.jp
+noto.ishikawa.jp
+shika.ishikawa.jp
+suzu.ishikawa.jp
+tsubata.ishikawa.jp
+tsurugi.ishikawa.jp
+uchinada.ishikawa.jp
+wajima.ishikawa.jp
+fudai.iwate.jp
+fujisawa.iwate.jp
+hanamaki.iwate.jp
+hiraizumi.iwate.jp
+hirono.iwate.jp
+ichinohe.iwate.jp
+ichinoseki.iwate.jp
+iwaizumi.iwate.jp
+iwate.iwate.jp
+joboji.iwate.jp
+kamaishi.iwate.jp
+kanegasaki.iwate.jp
+karumai.iwate.jp
+kawai.iwate.jp
+kitakami.iwate.jp
+kuji.iwate.jp
+kunohe.iwate.jp
+kuzumaki.iwate.jp
+miyako.iwate.jp
+mizusawa.iwate.jp
+morioka.iwate.jp
+ninohe.iwate.jp
+noda.iwate.jp
+ofunato.iwate.jp
+oshu.iwate.jp
+otsuchi.iwate.jp
+rikuzentakata.iwate.jp
+shiwa.iwate.jp
+shizukuishi.iwate.jp
+sumita.iwate.jp
+takizawa.iwate.jp
+tanohata.iwate.jp
+tono.iwate.jp
+yahaba.iwate.jp
+yamada.iwate.jp
+ayagawa.kagawa.jp
+higashikagawa.kagawa.jp
+kanonji.kagawa.jp
+kotohira.kagawa.jp
+manno.kagawa.jp
+marugame.kagawa.jp
+mitoyo.kagawa.jp
+naoshima.kagawa.jp
+sanuki.kagawa.jp
+tadotsu.kagawa.jp
+takamatsu.kagawa.jp
+tonosho.kagawa.jp
+uchinomi.kagawa.jp
+utazu.kagawa.jp
+zentsuji.kagawa.jp
+akune.kagoshima.jp
+amami.kagoshima.jp
+hioki.kagoshima.jp
+isa.kagoshima.jp
+isen.kagoshima.jp
+izumi.kagoshima.jp
+kagoshima.kagoshima.jp
+kanoya.kagoshima.jp
+kawanabe.kagoshima.jp
+kinko.kagoshima.jp
+kouyama.kagoshima.jp
+makurazaki.kagoshima.jp
+matsumoto.kagoshima.jp
+minamitane.kagoshima.jp
+nakatane.kagoshima.jp
+nishinoomote.kagoshima.jp
+satsumasendai.kagoshima.jp
+soo.kagoshima.jp
+tarumizu.kagoshima.jp
+yusui.kagoshima.jp
+aikawa.kanagawa.jp
+atsugi.kanagawa.jp
+ayase.kanagawa.jp
+chigasaki.kanagawa.jp
+ebina.kanagawa.jp
+fujisawa.kanagawa.jp
+hadano.kanagawa.jp
+hakone.kanagawa.jp
+hiratsuka.kanagawa.jp
+isehara.kanagawa.jp
+kaisei.kanagawa.jp
+kamakura.kanagawa.jp
+kiyokawa.kanagawa.jp
+matsuda.kanagawa.jp
+minamiashigara.kanagawa.jp
+miura.kanagawa.jp
+nakai.kanagawa.jp
+ninomiya.kanagawa.jp
+odawara.kanagawa.jp
+oi.kanagawa.jp
+oiso.kanagawa.jp
+sagamihara.kanagawa.jp
+samukawa.kanagawa.jp
+tsukui.kanagawa.jp
+yamakita.kanagawa.jp
+yamato.kanagawa.jp
+yokosuka.kanagawa.jp
+yugawara.kanagawa.jp
+zama.kanagawa.jp
+zushi.kanagawa.jp
+aki.kochi.jp
+geisei.kochi.jp
+hidaka.kochi.jp
+higashitsuno.kochi.jp
+ino.kochi.jp
+kagami.kochi.jp
+kami.kochi.jp
+kitagawa.kochi.jp
+kochi.kochi.jp
+mihara.kochi.jp
+motoyama.kochi.jp
+muroto.kochi.jp
+nahari.kochi.jp
+nakamura.kochi.jp
+nankoku.kochi.jp
+nishitosa.kochi.jp
+niyodogawa.kochi.jp
+ochi.kochi.jp
+okawa.kochi.jp
+otoyo.kochi.jp
+otsuki.kochi.jp
+sakawa.kochi.jp
+sukumo.kochi.jp
+susaki.kochi.jp
+tosa.kochi.jp
+tosashimizu.kochi.jp
+toyo.kochi.jp
+tsuno.kochi.jp
+umaji.kochi.jp
+yasuda.kochi.jp
+yusuhara.kochi.jp
+amakusa.kumamoto.jp
+arao.kumamoto.jp
+aso.kumamoto.jp
+choyo.kumamoto.jp
+gyokuto.kumamoto.jp
+hitoyoshi.kumamoto.jp
+kamiamakusa.kumamoto.jp
+kashima.kumamoto.jp
+kikuchi.kumamoto.jp
+kosa.kumamoto.jp
+kumamoto.kumamoto.jp
+mashiki.kumamoto.jp
+mifune.kumamoto.jp
+minamata.kumamoto.jp
+minamioguni.kumamoto.jp
+nagasu.kumamoto.jp
+nishihara.kumamoto.jp
+oguni.kumamoto.jp
+ozu.kumamoto.jp
+sumoto.kumamoto.jp
+takamori.kumamoto.jp
+uki.kumamoto.jp
+uto.kumamoto.jp
+yamaga.kumamoto.jp
+yamato.kumamoto.jp
+yatsushiro.kumamoto.jp
+ayabe.kyoto.jp
+fukuchiyama.kyoto.jp
+higashiyama.kyoto.jp
+ide.kyoto.jp
+ine.kyoto.jp
+joyo.kyoto.jp
+kameoka.kyoto.jp
+kamo.kyoto.jp
+kita.kyoto.jp
+kizu.kyoto.jp
+kumiyama.kyoto.jp
+kyotamba.kyoto.jp
+kyotanabe.kyoto.jp
+kyotango.kyoto.jp
+maizuru.kyoto.jp
+minami.kyoto.jp
+minamiyamashiro.kyoto.jp
+miyazu.kyoto.jp
+muko.kyoto.jp
+nagaokakyo.kyoto.jp
+nakagyo.kyoto.jp
+nantan.kyoto.jp
+oyamazaki.kyoto.jp
+sakyo.kyoto.jp
+seika.kyoto.jp
+tanabe.kyoto.jp
+uji.kyoto.jp
+ujitawara.kyoto.jp
+wazuka.kyoto.jp
+yamashina.kyoto.jp
+yawata.kyoto.jp
+asahi.mie.jp
+inabe.mie.jp
+ise.mie.jp
+kameyama.mie.jp
+kawagoe.mie.jp
+kiho.mie.jp
+kisosaki.mie.jp
+kiwa.mie.jp
+komono.mie.jp
+kumano.mie.jp
+kuwana.mie.jp
+matsusaka.mie.jp
+meiwa.mie.jp
+mihama.mie.jp
+minamiise.mie.jp
+misugi.mie.jp
+miyama.mie.jp
+nabari.mie.jp
+shima.mie.jp
+suzuka.mie.jp
+tado.mie.jp
+taiki.mie.jp
+taki.mie.jp
+tamaki.mie.jp
+toba.mie.jp
+tsu.mie.jp
+udono.mie.jp
+ureshino.mie.jp
+watarai.mie.jp
+yokkaichi.mie.jp
+furukawa.miyagi.jp
+higashimatsushima.miyagi.jp
+ishinomaki.miyagi.jp
+iwanuma.miyagi.jp
+kakuda.miyagi.jp
+kami.miyagi.jp
+kawasaki.miyagi.jp
+kesennuma.miyagi.jp
+marumori.miyagi.jp
+matsushima.miyagi.jp
+minamisanriku.miyagi.jp
+misato.miyagi.jp
+murata.miyagi.jp
+natori.miyagi.jp
+ogawara.miyagi.jp
+ohira.miyagi.jp
+onagawa.miyagi.jp
+osaki.miyagi.jp
+rifu.miyagi.jp
+semine.miyagi.jp
+shibata.miyagi.jp
+shichikashuku.miyagi.jp
+shikama.miyagi.jp
+shiogama.miyagi.jp
+shiroishi.miyagi.jp
+tagajo.miyagi.jp
+taiwa.miyagi.jp
+tome.miyagi.jp
+tomiya.miyagi.jp
+wakuya.miyagi.jp
+watari.miyagi.jp
+yamamoto.miyagi.jp
+zao.miyagi.jp
+aya.miyazaki.jp
+ebino.miyazaki.jp
+gokase.miyazaki.jp
+hyuga.miyazaki.jp
+kadogawa.miyazaki.jp
+kawaminami.miyazaki.jp
+kijo.miyazaki.jp
+kitagawa.miyazaki.jp
+kitakata.miyazaki.jp
+kitaura.miyazaki.jp
+kobayashi.miyazaki.jp
+kunitomi.miyazaki.jp
+kushima.miyazaki.jp
+mimata.miyazaki.jp
+miyakonojo.miyazaki.jp
+miyazaki.miyazaki.jp
+morotsuka.miyazaki.jp
+nichinan.miyazaki.jp
+nishimera.miyazaki.jp
+nobeoka.miyazaki.jp
+saito.miyazaki.jp
+shiiba.miyazaki.jp
+shintomi.miyazaki.jp
+takaharu.miyazaki.jp
+takanabe.miyazaki.jp
+takazaki.miyazaki.jp
+tsuno.miyazaki.jp
+achi.nagano.jp
+agematsu.nagano.jp
+anan.nagano.jp
+aoki.nagano.jp
+asahi.nagano.jp
+azumino.nagano.jp
+chikuhoku.nagano.jp
+chikuma.nagano.jp
+chino.nagano.jp
+fujimi.nagano.jp
+hakuba.nagano.jp
+hara.nagano.jp
+hiraya.nagano.jp
+iida.nagano.jp
+iijima.nagano.jp
+iiyama.nagano.jp
+iizuna.nagano.jp
+ikeda.nagano.jp
+ikusaka.nagano.jp
+ina.nagano.jp
+karuizawa.nagano.jp
+kawakami.nagano.jp
+kiso.nagano.jp
+kisofukushima.nagano.jp
+kitaaiki.nagano.jp
+komagane.nagano.jp
+komoro.nagano.jp
+matsukawa.nagano.jp
+matsumoto.nagano.jp
+miasa.nagano.jp
+minamiaiki.nagano.jp
+minamimaki.nagano.jp
+minamiminowa.nagano.jp
+minowa.nagano.jp
+miyada.nagano.jp
+miyota.nagano.jp
+mochizuki.nagano.jp
+nagano.nagano.jp
+nagawa.nagano.jp
+nagiso.nagano.jp
+nakagawa.nagano.jp
+nakano.nagano.jp
+nozawaonsen.nagano.jp
+obuse.nagano.jp
+ogawa.nagano.jp
+okaya.nagano.jp
+omachi.nagano.jp
+omi.nagano.jp
+ookuwa.nagano.jp
+ooshika.nagano.jp
+otaki.nagano.jp
+otari.nagano.jp
+sakae.nagano.jp
+sakaki.nagano.jp
+saku.nagano.jp
+sakuho.nagano.jp
+shimosuwa.nagano.jp
+shinanomachi.nagano.jp
+shiojiri.nagano.jp
+suwa.nagano.jp
+suzaka.nagano.jp
+takagi.nagano.jp
+takamori.nagano.jp
+takayama.nagano.jp
+tateshina.nagano.jp
+tatsuno.nagano.jp
+togakushi.nagano.jp
+togura.nagano.jp
+tomi.nagano.jp
+ueda.nagano.jp
+wada.nagano.jp
+yamagata.nagano.jp
+yamanouchi.nagano.jp
+yasaka.nagano.jp
+yasuoka.nagano.jp
+chijiwa.nagasaki.jp
+futsu.nagasaki.jp
+goto.nagasaki.jp
+hasami.nagasaki.jp
+hirado.nagasaki.jp
+iki.nagasaki.jp
+isahaya.nagasaki.jp
+kawatana.nagasaki.jp
+kuchinotsu.nagasaki.jp
+matsuura.nagasaki.jp
+nagasaki.nagasaki.jp
+obama.nagasaki.jp
+omura.nagasaki.jp
+oseto.nagasaki.jp
+saikai.nagasaki.jp
+sasebo.nagasaki.jp
+seihi.nagasaki.jp
+shimabara.nagasaki.jp
+shinkamigoto.nagasaki.jp
+togitsu.nagasaki.jp
+tsushima.nagasaki.jp
+unzen.nagasaki.jp
+ando.nara.jp
+gose.nara.jp
+heguri.nara.jp
+higashiyoshino.nara.jp
+ikaruga.nara.jp
+ikoma.nara.jp
+kamikitayama.nara.jp
+kanmaki.nara.jp
+kashiba.nara.jp
+kashihara.nara.jp
+katsuragi.nara.jp
+kawai.nara.jp
+kawakami.nara.jp
+kawanishi.nara.jp
+koryo.nara.jp
+kurotaki.nara.jp
+mitsue.nara.jp
+miyake.nara.jp
+nara.nara.jp
+nosegawa.nara.jp
+oji.nara.jp
+ouda.nara.jp
+oyodo.nara.jp
+sakurai.nara.jp
+sango.nara.jp
+shimoichi.nara.jp
+shimokitayama.nara.jp
+shinjo.nara.jp
+soni.nara.jp
+takatori.nara.jp
+tawaramoto.nara.jp
+tenkawa.nara.jp
+tenri.nara.jp
+uda.nara.jp
+yamatokoriyama.nara.jp
+yamatotakada.nara.jp
+yamazoe.nara.jp
+yoshino.nara.jp
+aga.niigata.jp
+agano.niigata.jp
+gosen.niigata.jp
+itoigawa.niigata.jp
+izumozaki.niigata.jp
+joetsu.niigata.jp
+kamo.niigata.jp
+kariwa.niigata.jp
+kashiwazaki.niigata.jp
+minamiuonuma.niigata.jp
+mitsuke.niigata.jp
+muika.niigata.jp
+murakami.niigata.jp
+myoko.niigata.jp
+nagaoka.niigata.jp
+niigata.niigata.jp
+ojiya.niigata.jp
+omi.niigata.jp
+sado.niigata.jp
+sanjo.niigata.jp
+seiro.niigata.jp
+seirou.niigata.jp
+sekikawa.niigata.jp
+shibata.niigata.jp
+tagami.niigata.jp
+tainai.niigata.jp
+tochio.niigata.jp
+tokamachi.niigata.jp
+tsubame.niigata.jp
+tsunan.niigata.jp
+uonuma.niigata.jp
+yahiko.niigata.jp
+yoita.niigata.jp
+yuzawa.niigata.jp
+beppu.oita.jp
+bungoono.oita.jp
+bungotakada.oita.jp
+hasama.oita.jp
+hiji.oita.jp
+himeshima.oita.jp
+hita.oita.jp
+kamitsue.oita.jp
+kokonoe.oita.jp
+kuju.oita.jp
+kunisaki.oita.jp
+kusu.oita.jp
+oita.oita.jp
+saiki.oita.jp
+taketa.oita.jp
+tsukumi.oita.jp
+usa.oita.jp
+usuki.oita.jp
+yufu.oita.jp
+akaiwa.okayama.jp
+asakuchi.okayama.jp
+bizen.okayama.jp
+hayashima.okayama.jp
+ibara.okayama.jp
+kagamino.okayama.jp
+kasaoka.okayama.jp
+kibichuo.okayama.jp
+kumenan.okayama.jp
+kurashiki.okayama.jp
+maniwa.okayama.jp
+misaki.okayama.jp
+nagi.okayama.jp
+niimi.okayama.jp
+nishiawakura.okayama.jp
+okayama.okayama.jp
+satosho.okayama.jp
+setouchi.okayama.jp
+shinjo.okayama.jp
+shoo.okayama.jp
+soja.okayama.jp
+takahashi.okayama.jp
+tamano.okayama.jp
+tsuyama.okayama.jp
+wake.okayama.jp
+yakage.okayama.jp
+aguni.okinawa.jp
+ginowan.okinawa.jp
+ginoza.okinawa.jp
+gushikami.okinawa.jp
+haebaru.okinawa.jp
+higashi.okinawa.jp
+hirara.okinawa.jp
+iheya.okinawa.jp
+ishigaki.okinawa.jp
+ishikawa.okinawa.jp
+itoman.okinawa.jp
+izena.okinawa.jp
+kadena.okinawa.jp
+kin.okinawa.jp
+kitadaito.okinawa.jp
+kitanakagusuku.okinawa.jp
+kumejima.okinawa.jp
+kunigami.okinawa.jp
+minamidaito.okinawa.jp
+motobu.okinawa.jp
+nago.okinawa.jp
+naha.okinawa.jp
+nakagusuku.okinawa.jp
+nakijin.okinawa.jp
+nanjo.okinawa.jp
+nishihara.okinawa.jp
+ogimi.okinawa.jp
+okinawa.okinawa.jp
+onna.okinawa.jp
+shimoji.okinawa.jp
+taketomi.okinawa.jp
+tarama.okinawa.jp
+tokashiki.okinawa.jp
+tomigusuku.okinawa.jp
+tonaki.okinawa.jp
+urasoe.okinawa.jp
+uruma.okinawa.jp
+yaese.okinawa.jp
+yomitan.okinawa.jp
+yonabaru.okinawa.jp
+yonaguni.okinawa.jp
+zamami.okinawa.jp
+abeno.osaka.jp
+chihayaakasaka.osaka.jp
+chuo.osaka.jp
+daito.osaka.jp
+fujiidera.osaka.jp
+habikino.osaka.jp
+hannan.osaka.jp
+higashiosaka.osaka.jp
+higashisumiyoshi.osaka.jp
+higashiyodogawa.osaka.jp
+hirakata.osaka.jp
+ibaraki.osaka.jp
+ikeda.osaka.jp
+izumi.osaka.jp
+izumiotsu.osaka.jp
+izumisano.osaka.jp
+kadoma.osaka.jp
+kaizuka.osaka.jp
+kanan.osaka.jp
+kashiwara.osaka.jp
+katano.osaka.jp
+kawachinagano.osaka.jp
+kishiwada.osaka.jp
+kita.osaka.jp
+kumatori.osaka.jp
+matsubara.osaka.jp
+minato.osaka.jp
+minoh.osaka.jp
+misaki.osaka.jp
+moriguchi.osaka.jp
+neyagawa.osaka.jp
+nishi.osaka.jp
+nose.osaka.jp
+osakasayama.osaka.jp
+sakai.osaka.jp
+sayama.osaka.jp
+sennan.osaka.jp
+settsu.osaka.jp
+shijonawate.osaka.jp
+shimamoto.osaka.jp
+suita.osaka.jp
+tadaoka.osaka.jp
+taishi.osaka.jp
+tajiri.osaka.jp
+takaishi.osaka.jp
+takatsuki.osaka.jp
+tondabayashi.osaka.jp
+toyonaka.osaka.jp
+toyono.osaka.jp
+yao.osaka.jp
+ariake.saga.jp
+arita.saga.jp
+fukudomi.saga.jp
+genkai.saga.jp
+hamatama.saga.jp
+hizen.saga.jp
+imari.saga.jp
+kamimine.saga.jp
+kanzaki.saga.jp
+karatsu.saga.jp
+kashima.saga.jp
+kitagata.saga.jp
+kitahata.saga.jp
+kiyama.saga.jp
+kouhoku.saga.jp
+kyuragi.saga.jp
+nishiarita.saga.jp
+ogi.saga.jp
+omachi.saga.jp
+ouchi.saga.jp
+saga.saga.jp
+shiroishi.saga.jp
+taku.saga.jp
+tara.saga.jp
+tosu.saga.jp
+yoshinogari.saga.jp
+arakawa.saitama.jp
+asaka.saitama.jp
+chichibu.saitama.jp
+fujimi.saitama.jp
+fujimino.saitama.jp
+fukaya.saitama.jp
+hanno.saitama.jp
+hanyu.saitama.jp
+hasuda.saitama.jp
+hatogaya.saitama.jp
+hatoyama.saitama.jp
+hidaka.saitama.jp
+higashichichibu.saitama.jp
+higashimatsuyama.saitama.jp
+honjo.saitama.jp
+ina.saitama.jp
+iruma.saitama.jp
+iwatsuki.saitama.jp
+kamiizumi.saitama.jp
+kamikawa.saitama.jp
+kamisato.saitama.jp
+kasukabe.saitama.jp
+kawagoe.saitama.jp
+kawaguchi.saitama.jp
+kawajima.saitama.jp
+kazo.saitama.jp
+kitamoto.saitama.jp
+koshigaya.saitama.jp
+kounosu.saitama.jp
+kuki.saitama.jp
+kumagaya.saitama.jp
+matsubushi.saitama.jp
+minano.saitama.jp
+misato.saitama.jp
+miyashiro.saitama.jp
+miyoshi.saitama.jp
+moroyama.saitama.jp
+nagatoro.saitama.jp
+namegawa.saitama.jp
+niiza.saitama.jp
+ogano.saitama.jp
+ogawa.saitama.jp
+ogose.saitama.jp
+okegawa.saitama.jp
+omiya.saitama.jp
+otaki.saitama.jp
+ranzan.saitama.jp
+ryokami.saitama.jp
+saitama.saitama.jp
+sakado.saitama.jp
+satte.saitama.jp
+sayama.saitama.jp
+shiki.saitama.jp
+shiraoka.saitama.jp
+soka.saitama.jp
+sugito.saitama.jp
+toda.saitama.jp
+tokigawa.saitama.jp
+tokorozawa.saitama.jp
+tsurugashima.saitama.jp
+urawa.saitama.jp
+warabi.saitama.jp
+yashio.saitama.jp
+yokoze.saitama.jp
+yono.saitama.jp
+yorii.saitama.jp
+yoshida.saitama.jp
+yoshikawa.saitama.jp
+yoshimi.saitama.jp
+aisho.shiga.jp
+gamo.shiga.jp
+higashiomi.shiga.jp
+hikone.shiga.jp
+koka.shiga.jp
+konan.shiga.jp
+kosei.shiga.jp
+koto.shiga.jp
+kusatsu.shiga.jp
+maibara.shiga.jp
+moriyama.shiga.jp
+nagahama.shiga.jp
+nishiazai.shiga.jp
+notogawa.shiga.jp
+omihachiman.shiga.jp
+otsu.shiga.jp
+ritto.shiga.jp
+ryuoh.shiga.jp
+takashima.shiga.jp
+takatsuki.shiga.jp
+torahime.shiga.jp
+toyosato.shiga.jp
+yasu.shiga.jp
+akagi.shimane.jp
+ama.shimane.jp
+gotsu.shimane.jp
+hamada.shimane.jp
+higashiizumo.shimane.jp
+hikawa.shimane.jp
+hikimi.shimane.jp
+izumo.shimane.jp
+kakinoki.shimane.jp
+masuda.shimane.jp
+matsue.shimane.jp
+misato.shimane.jp
+nishinoshima.shimane.jp
+ohda.shimane.jp
+okinoshima.shimane.jp
+okuizumo.shimane.jp
+shimane.shimane.jp
+tamayu.shimane.jp
+tsuwano.shimane.jp
+unnan.shimane.jp
+yakumo.shimane.jp
+yasugi.shimane.jp
+yatsuka.shimane.jp
+arai.shizuoka.jp
+atami.shizuoka.jp
+fuji.shizuoka.jp
+fujieda.shizuoka.jp
+fujikawa.shizuoka.jp
+fujinomiya.shizuoka.jp
+fukuroi.shizuoka.jp
+gotemba.shizuoka.jp
+haibara.shizuoka.jp
+hamamatsu.shizuoka.jp
+higashiizu.shizuoka.jp
+ito.shizuoka.jp
+iwata.shizuoka.jp
+izu.shizuoka.jp
+izunokuni.shizuoka.jp
+kakegawa.shizuoka.jp
+kannami.shizuoka.jp
+kawanehon.shizuoka.jp
+kawazu.shizuoka.jp
+kikugawa.shizuoka.jp
+kosai.shizuoka.jp
+makinohara.shizuoka.jp
+matsuzaki.shizuoka.jp
+minamiizu.shizuoka.jp
+mishima.shizuoka.jp
+morimachi.shizuoka.jp
+nishiizu.shizuoka.jp
+numazu.shizuoka.jp
+omaezaki.shizuoka.jp
+shimada.shizuoka.jp
+shimizu.shizuoka.jp
+shimoda.shizuoka.jp
+shizuoka.shizuoka.jp
+susono.shizuoka.jp
+yaizu.shizuoka.jp
+yoshida.shizuoka.jp
+ashikaga.tochigi.jp
+bato.tochigi.jp
+haga.tochigi.jp
+ichikai.tochigi.jp
+iwafune.tochigi.jp
+kaminokawa.tochigi.jp
+kanuma.tochigi.jp
+karasuyama.tochigi.jp
+kuroiso.tochigi.jp
+mashiko.tochigi.jp
+mibu.tochigi.jp
+moka.tochigi.jp
+motegi.tochigi.jp
+nasu.tochigi.jp
+nasushiobara.tochigi.jp
+nikko.tochigi.jp
+nishikata.tochigi.jp
+nogi.tochigi.jp
+ohira.tochigi.jp
+ohtawara.tochigi.jp
+oyama.tochigi.jp
+sakura.tochigi.jp
+sano.tochigi.jp
+shimotsuke.tochigi.jp
+shioya.tochigi.jp
+takanezawa.tochigi.jp
+tochigi.tochigi.jp
+tsuga.tochigi.jp
+ujiie.tochigi.jp
+utsunomiya.tochigi.jp
+yaita.tochigi.jp
+aizumi.tokushima.jp
+anan.tokushima.jp
+ichiba.tokushima.jp
+itano.tokushima.jp
+kainan.tokushima.jp
+komatsushima.tokushima.jp
+matsushige.tokushima.jp
+mima.tokushima.jp
+minami.tokushima.jp
+miyoshi.tokushima.jp
+mugi.tokushima.jp
+nakagawa.tokushima.jp
+naruto.tokushima.jp
+sanagochi.tokushima.jp
+shishikui.tokushima.jp
+tokushima.tokushima.jp
+wajiki.tokushima.jp
+adachi.tokyo.jp
+akiruno.tokyo.jp
+akishima.tokyo.jp
+aogashima.tokyo.jp
+arakawa.tokyo.jp
+bunkyo.tokyo.jp
+chiyoda.tokyo.jp
+chofu.tokyo.jp
+chuo.tokyo.jp
+edogawa.tokyo.jp
+fuchu.tokyo.jp
+fussa.tokyo.jp
+hachijo.tokyo.jp
+hachioji.tokyo.jp
+hamura.tokyo.jp
+higashikurume.tokyo.jp
+higashimurayama.tokyo.jp
+higashiyamato.tokyo.jp
+hino.tokyo.jp
+hinode.tokyo.jp
+hinohara.tokyo.jp
+inagi.tokyo.jp
+itabashi.tokyo.jp
+katsushika.tokyo.jp
+kita.tokyo.jp
+kiyose.tokyo.jp
+kodaira.tokyo.jp
+koganei.tokyo.jp
+kokubunji.tokyo.jp
+komae.tokyo.jp
+koto.tokyo.jp
+kouzushima.tokyo.jp
+kunitachi.tokyo.jp
+machida.tokyo.jp
+meguro.tokyo.jp
+minato.tokyo.jp
+mitaka.tokyo.jp
+mizuho.tokyo.jp
+musashimurayama.tokyo.jp
+musashino.tokyo.jp
+nakano.tokyo.jp
+nerima.tokyo.jp
+ogasawara.tokyo.jp
+okutama.tokyo.jp
+ome.tokyo.jp
+oshima.tokyo.jp
+ota.tokyo.jp
+setagaya.tokyo.jp
+shibuya.tokyo.jp
+shinagawa.tokyo.jp
+shinjuku.tokyo.jp
+suginami.tokyo.jp
+sumida.tokyo.jp
+tachikawa.tokyo.jp
+taito.tokyo.jp
+tama.tokyo.jp
+toshima.tokyo.jp
+chizu.tottori.jp
+hino.tottori.jp
+kawahara.tottori.jp
+koge.tottori.jp
+kotoura.tottori.jp
+misasa.tottori.jp
+nanbu.tottori.jp
+nichinan.tottori.jp
+sakaiminato.tottori.jp
+tottori.tottori.jp
+wakasa.tottori.jp
+yazu.tottori.jp
+yonago.tottori.jp
+asahi.toyama.jp
+fuchu.toyama.jp
+fukumitsu.toyama.jp
+funahashi.toyama.jp
+himi.toyama.jp
+imizu.toyama.jp
+inami.toyama.jp
+johana.toyama.jp
+kamiichi.toyama.jp
+kurobe.toyama.jp
+nakaniikawa.toyama.jp
+namerikawa.toyama.jp
+nanto.toyama.jp
+nyuzen.toyama.jp
+oyabe.toyama.jp
+taira.toyama.jp
+takaoka.toyama.jp
+tateyama.toyama.jp
+toga.toyama.jp
+tonami.toyama.jp
+toyama.toyama.jp
+unazuki.toyama.jp
+uozu.toyama.jp
+yamada.toyama.jp
+arida.wakayama.jp
+aridagawa.wakayama.jp
+gobo.wakayama.jp
+hashimoto.wakayama.jp
+hidaka.wakayama.jp
+hirogawa.wakayama.jp
+inami.wakayama.jp
+iwade.wakayama.jp
+kainan.wakayama.jp
+kamitonda.wakayama.jp
+katsuragi.wakayama.jp
+kimino.wakayama.jp
+kinokawa.wakayama.jp
+kitayama.wakayama.jp
+koya.wakayama.jp
+koza.wakayama.jp
+kozagawa.wakayama.jp
+kudoyama.wakayama.jp
+kushimoto.wakayama.jp
+mihama.wakayama.jp
+misato.wakayama.jp
+nachikatsuura.wakayama.jp
+shingu.wakayama.jp
+shirahama.wakayama.jp
+taiji.wakayama.jp
+tanabe.wakayama.jp
+wakayama.wakayama.jp
+yuasa.wakayama.jp
+yura.wakayama.jp
+asahi.yamagata.jp
+funagata.yamagata.jp
+higashine.yamagata.jp
+iide.yamagata.jp
+kahoku.yamagata.jp
+kaminoyama.yamagata.jp
+kaneyama.yamagata.jp
+kawanishi.yamagata.jp
+mamurogawa.yamagata.jp
+mikawa.yamagata.jp
+murayama.yamagata.jp
+nagai.yamagata.jp
+nakayama.yamagata.jp
+nanyo.yamagata.jp
+nishikawa.yamagata.jp
+obanazawa.yamagata.jp
+oe.yamagata.jp
+oguni.yamagata.jp
+ohkura.yamagata.jp
+oishida.yamagata.jp
+sagae.yamagata.jp
+sakata.yamagata.jp
+sakegawa.yamagata.jp
+shinjo.yamagata.jp
+shirataka.yamagata.jp
+shonai.yamagata.jp
+takahata.yamagata.jp
+tendo.yamagata.jp
+tozawa.yamagata.jp
+tsuruoka.yamagata.jp
+yamagata.yamagata.jp
+yamanobe.yamagata.jp
+yonezawa.yamagata.jp
+yuza.yamagata.jp
+abu.yamaguchi.jp
+hagi.yamaguchi.jp
+hikari.yamaguchi.jp
+hofu.yamaguchi.jp
+iwakuni.yamaguchi.jp
+kudamatsu.yamaguchi.jp
+mitou.yamaguchi.jp
+nagato.yamaguchi.jp
+oshima.yamaguchi.jp
+shimonoseki.yamaguchi.jp
+shunan.yamaguchi.jp
+tabuse.yamaguchi.jp
+tokuyama.yamaguchi.jp
+toyota.yamaguchi.jp
+ube.yamaguchi.jp
+yuu.yamaguchi.jp
+chuo.yamanashi.jp
+doshi.yamanashi.jp
+fuefuki.yamanashi.jp
+fujikawa.yamanashi.jp
+fujikawaguchiko.yamanashi.jp
+fujiyoshida.yamanashi.jp
+hayakawa.yamanashi.jp
+hokuto.yamanashi.jp
+ichikawamisato.yamanashi.jp
+kai.yamanashi.jp
+kofu.yamanashi.jp
+koshu.yamanashi.jp
+kosuge.yamanashi.jp
+minami-alps.yamanashi.jp
+minobu.yamanashi.jp
+nakamichi.yamanashi.jp
+nanbu.yamanashi.jp
+narusawa.yamanashi.jp
+nirasaki.yamanashi.jp
+nishikatsura.yamanashi.jp
+oshino.yamanashi.jp
+otsuki.yamanashi.jp
+showa.yamanashi.jp
+tabayama.yamanashi.jp
+tsuru.yamanashi.jp
+uenohara.yamanashi.jp
+yamanakako.yamanashi.jp
+yamanashi.yamanashi.jp
 
 // ke : http://www.kenic.or.ke/index.php?option=com_content&task=view&id=117&Itemid=145
 *.ke
@@ -2579,6 +4181,7 @@ name.my
 
 // mz : http://www.gobin.info/domainname/mz-template.doc
 *.mz
+!teledata.mz
 
 // na : http://www.na-nic.com.na/
 // http://www.info.na/domain/
@@ -3714,6 +5317,9 @@ org.pn
 edu.pn
 net.pn
 
+// post : http://en.wikipedia.org/wiki/.post
+post
+
 // pr : http://www.nic.pr/index.asp?f=1
 pr
 com.pr
@@ -3772,8 +5378,16 @@ ed.pw
 go.pw
 belau.pw
 
-// py : http://www.nic.py/faq_a.html#faq_b
-*.py
+// py : http://www.nic.py/pautas.html#seccion_9
+// Confirmed by registry 2012-10-03
+py
+com.py
+coop.py
+edu.py
+gov.py
+mil.py
+net.py
+org.py
 
 // qa : http://domains.qa/en/
 qa
@@ -4004,6 +5618,7 @@ net.sd
 org.sd
 edu.sd
 med.sd
+tv.sd
 gov.sd
 info.sd
 
@@ -4051,7 +5666,7 @@ x.se
 y.se
 z.se
 
-// sg : http://www.nic.net.sg/sub_policies_agreement/2ld.html
+// sg : http://www.nic.net.sg/page/registration-policies-procedures-and-guidelines
 sg
 com.sg
 net.sg
@@ -4060,9 +5675,13 @@ gov.sg
 edu.sg
 per.sg
 
-// sh : http://www.nic.sh/rules.html
-// list of 2nd level domains ?
+// sh : http://www.nic.sh/registrar.html
 sh
+com.sh
+net.sh
+gov.sh
+org.sh
+mil.sh
 
 // si : http://en.wikipedia.org/wiki/.si
 si
@@ -4126,6 +5745,11 @@ su
 // sv : http://www.svnet.org.sv/svpolicy.html
 *.sv
 
+// sx : http://en.wikipedia.org/wiki/.sx
+// Confirmed by registry  2012-05-31
+sx
+gov.sx
+
 // sy : http://en.wikipedia.org/wiki/.sy
 // see also: http://www.gobin.info/domainname/sy.doc
 sy
@@ -4157,8 +5781,7 @@ tel
 tf
 
 // tg : http://en.wikipedia.org/wiki/.tg
-// http://www.nic.tg/nictg/index.php implies no reserved 2nd-level domains,
-// although this contradicts wikipedia.
+// http://www.nic.tg/
 tg
 
 // th : http://en.wikipedia.org/wiki/.th
@@ -4172,7 +5795,7 @@ mi.th
 net.th
 or.th
 
-// tj : http://www.nic.tj/policy.htm
+// tj : http://www.nic.tj/policy.html
 tj
 ac.tj
 biz.tj
@@ -4197,9 +5820,16 @@ tk
 tl
 gov.tl
 
-// tm : http://www.nic.tm/rules.html
-// list of 2nd level tlds ?
+// tm : http://www.nic.tm/local.html
 tm
+com.tm
+co.tm
+org.tm
+net.tm
+nom.tm
+gov.tm
+mil.tm
+edu.tm
 
 // tn : http://en.wikipedia.org/wiki/.tn
 // http://whois.ati.tn/
@@ -4286,101 +5916,133 @@ club.tw
 組織.tw
 商業.tw
 
-// tz : http://en.wikipedia.org/wiki/.tz
-// Submitted by registry  2008-06-17
-// Updated from http://www.tznic.or.tz/index.php/domains.html 2010-10-25
+// tz : http://www.tznic.or.tz/index.php/domains
+// Confirmed by registry  2013-01-22
 ac.tz
 co.tz
 go.tz
+hotel.tz
+info.tz
+me.tz
 mil.tz
+mobi.tz
 ne.tz
 or.tz
 sc.tz
+tv.tz
 
-// ua : http://www.nic.net.ua/
+// ua : https://hostmaster.ua/policy/?ua
+// Submitted by registry  2012-04-27
 ua
+// ua 2LD
 com.ua
 edu.ua
 gov.ua
 in.ua
 net.ua
 org.ua
-// ua geo-names
+// ua geographic names
+// https://hostmaster.ua/2ld/
 cherkassy.ua
+cherkasy.ua
 chernigov.ua
+chernihiv.ua
+chernivtsi.ua
 chernovtsy.ua
 ck.ua
 cn.ua
+cr.ua
 crimea.ua
 cv.ua
 dn.ua
 dnepropetrovsk.ua
+dnipropetrovsk.ua
+dominic.ua
 donetsk.ua
 dp.ua
 if.ua
 ivano-frankivsk.ua
 kh.ua
+kharkiv.ua
 kharkov.ua
 kherson.ua
 khmelnitskiy.ua
+khmelnytskyi.ua
 kiev.ua
 kirovograd.ua
 km.ua
 kr.ua
+krym.ua
 ks.ua
 kv.ua
+kyiv.ua
 lg.ua
+lt.ua
 lugansk.ua
 lutsk.ua
+lv.ua
 lviv.ua
 mk.ua
+mykolaiv.ua
 nikolaev.ua
 od.ua
+odesa.ua
 odessa.ua
 pl.ua
 poltava.ua
+rivne.ua
 rovno.ua
 rv.ua
+sb.ua
 sebastopol.ua
+sevastopol.ua
+sm.ua
 sumy.ua
 te.ua
 ternopil.ua
+uz.ua
 uzhgorod.ua
 vinnica.ua
+vinnytsia.ua
 vn.ua
+volyn.ua
+yalta.ua
 zaporizhzhe.ua
-zp.ua
+zaporizhzhia.ua
 zhitomir.ua
+zhytomyr.ua
+zp.ua
 zt.ua
 
 // Private registries in .ua
 co.ua
 pp.ua
 
-// ug : http://www.registry.co.ug/
+// ug : https://www.registry.co.ug/
 ug
 co.ug
+or.ug
 ac.ug
 sc.ug
 go.ug
 ne.ug
-or.ug
+com.ug
+org.ug
 
 // uk : http://en.wikipedia.org/wiki/.uk
+// Submitted by registry  2012-10-02
+// and tweaked by us pending further consultation.
 *.uk
 *.sch.uk
 !bl.uk
 !british-library.uk
-!icnet.uk
 !jet.uk
 !mod.uk
+!national-library-scotland.uk
 !nel.uk
-!nhs.uk
 !nic.uk
 !nls.uk
-!national-library-scotland.uk
 !parliament.uk
-!police.uk
 
 // us : http://en.wikipedia.org/wiki/.us
 us
@@ -4627,14 +6289,21 @@ pvt.k12.ma.us
 chtr.k12.ma.us
 paroch.k12.ma.us
 
-// uy : http://www.antel.com.uy/
-*.uy
+// uy : http://www.nic.org.uy/
+uy
+com.uy
+edu.uy
+gub.uy
+mil.uy
+net.uy
+org.uy
 
-// uz : http://www.reg.uz/registerr.html
-// are there other 2nd level tlds ?
+// uz : http://www.reg.uz/
 uz
-com.uz
 co.uz
+com.uz
+net.uz
+org.uz
 
 // va : http://en.wikipedia.org/wiki/.va
 va
@@ -4649,8 +6318,19 @@ gov.vc
 mil.vc
 edu.vc
 
-// ve : http://registro.nic.ve/nicve/registro/index.html
-*.ve
+// ve : https://registro.nic.ve/
+// Confirmed by registry 2012-10-04
+ve
+co.ve
+com.ve
+e12.ve
+edu.ve
+gov.ve
+info.ve
+mil.ve
+net.ve
+org.ve
+web.ve
 
 // vg : http://en.wikipedia.org/wiki/.vg
 vg
@@ -4708,7 +6388,7 @@ yt
 // 
 
 // xn--mgbaam7a8h ("Emerat" Arabic) : AE
-//http://nic.ae/english/arabicdomain/rules.jsp
+// http://nic.ae/english/arabicdomain/rules.jsp
 امارات
 
 // xn--54b7fta0cc ("Bangla" Bangla) : BD  
@@ -4772,9 +6452,9 @@ yt
 // xn--mgba3a4fra ("Iran" Arabic) : IR  
 ايران
 
-//xn--mgbayh7gpa ("al-Ordon" Arabic) JO
-//National Information Technology Center (NITC) 
-//Royal Scientific Society, Al-Jubeiha
+// xn--mgbayh7gpa ("al-Ordon" Arabic) : JO
+// National Information Technology Center (NITC)
+// Royal Scientific Society, Al-Jubeiha
 الاردن
 
 // xn--3e0b707e ("Republic of Korea" Hangul) : KR  
@@ -4878,27 +6558,75 @@ xxx
 // ===END ICANN DOMAINS===
 // ===BEGIN PRIVATE DOMAINS===
 
-// info.at : http://www.info.at/
-biz.at
-info.at
-
-// priv.at : http://www.nic.priv.at/
-// Submitted by registry  2008-06-09
-priv.at
-
-// co.ca : http://registry.co.ca
-co.ca
+// Amazon CloudFront : https://aws.amazon.com/cloudfront/
+// Requested by Donavan Miller  2013-03-22
+cloudfront.net
+
+// Amazon Elastic Compute Cloud : https://aws.amazon.com/ec2/
+// Requested by Osman Surkatty  2013-04-02
+compute.amazonaws.com
+us-east-1.amazonaws.com
+compute-1.amazonaws.com
+z-1.compute-1.amazonaws.com
+z-2.compute-1.amazonaws.com
+ap-northeast-1.compute.amazonaws.com
+ap-southeast-1.compute.amazonaws.com
+ap-southeast-2.compute.amazonaws.com
+eu-west-1.compute.amazonaws.com
+sa-east-1.compute.amazonaws.com
+us-gov-west-1.compute.amazonaws.com
+us-west-1.compute.amazonaws.com
+us-west-2.compute.amazonaws.com
+
+// Amazon Elastic Beanstalk : https://aws.amazon.com/elasticbeanstalk/
+// Requested by Adam Stein  2013-04-02
+elasticbeanstalk.com
+
+// Amazon Elastic Load Balancing : https://aws.amazon.com/elasticloadbalancing/
+// Requested by Scott Vidmar  2013-03-27
+elb.amazonaws.com
+
+// Amazon S3 : https://aws.amazon.com/s3/
+// Requested by Courtney Eckhardt  2013-03-22
+s3.amazonaws.com
+s3-us-west-2.amazonaws.com
+s3-us-west-1.amazonaws.com
+s3-eu-west-1.amazonaws.com
+s3-ap-southeast-1.amazonaws.com
+s3-ap-southeast-2.amazonaws.com
+s3-ap-northeast-1.amazonaws.com
+s3-sa-east-1.amazonaws.com
+s3-us-gov-west-1.amazonaws.com
+s3-fips-us-gov-west-1.amazonaws.com
+s3-website-us-east-1.amazonaws.com
+s3-website-us-west-2.amazonaws.com
+s3-website-us-west-1.amazonaws.com
+s3-website-eu-west-1.amazonaws.com
+s3-website-ap-southeast-1.amazonaws.com
+s3-website-ap-southeast-2.amazonaws.com
+s3-website-ap-northeast-1.amazonaws.com
+s3-website-sa-east-1.amazonaws.com
+s3-website-us-gov-west-1.amazonaws.com
+
+// BetaInABox
+// Requested by adrian@betainabox.com 2012-09-13
+betainabox.com
 
 // CentralNic : http://www.centralnic.com/names/domains
-// Confirmed by registry  2008-06-09
+// Requested by registry  2012-09-27
+ae.org
 ar.com
 br.com
 cn.com
+com.de
 de.com
 eu.com
 gb.com
+gb.net
 gr.com
 hu.com
+hu.net
+jp.net
 jpn.com
 kr.com
 no.com
@@ -4906,44 +6634,32 @@ qc.com
 ru.com
 sa.com
 se.com
+se.net
 uk.com
+uk.net
 us.com
+us.org
 uy.com
 za.com
-gb.net
-jp.net
-se.net
-uk.net
-ae.org
-us.org
-com.de
-
-// Opera Software, A.S.A.
-// Requested by Yngve Pettersen  2009-11-26
-operaunite.com
-
-// Google, Inc.
-// Requested by Eduardo Vela  2010-09-06
-appspot.com
-
-// iki.fi : Submitted by Hannu Aronsson  2009-11-05
-iki.fi
 
 // c.la : http://www.c.la/
 c.la
 
-// ZaNiC : http://www.za.net/
-// Confirmed by registry  2009-10-03
-za.net
-za.org
+// cloudControl : https://www.cloudcontrol.com/
+// Requested by Tobias Wilken  2013-07-23
+cloudcontrolled.com
+cloudcontrolapp.com
+
+// co.ca : http://registry.co.ca/
+co.ca
 
 // CoDNS B.V.
-// Added 2010-05-23.
 co.nl
 co.no
 
-// Mainseek Sp. z o.o. : http://www.co.pl/
-co.pl
+// DreamHost : http://www.dreamhost.com/
+// Requested by Andrew Farmer  2012-10-02
+dreamhosters.com
 
 // DynDNS.com : http://www.dyndns.com/services/dns/dyndns/
 dyndns-at-home.com
@@ -5226,4 +6942,104 @@ webhop.org
 worse-than.tv
 writesthisblog.com
 
+// Fastly Inc. : http://www.fastly.com/
+// Requested by Vladimir Vuksan  2013-05-31
+a.ssl.fastly.net
+b.ssl.fastly.net
+global.ssl.fastly.net
+a.prod.fastly.net
+global.prod.fastly.net
+
+// GitHub, Inc.
+// Requested by Ben Toews  2013-04-18
+github.io
+
+// GlobeHosting, Inc.
+// Requested by Zoltan Egresi  2013-07-12
+ro.com
+
+// Google, Inc.
+// Requested by Eduardo Vela  2012-10-24
+appspot.com
+blogspot.be
+blogspot.bj
+blogspot.ca
+blogspot.cf
+blogspot.ch
+blogspot.co.at
+blogspot.co.il
+blogspot.co.nz
+blogspot.co.uk
+blogspot.com
+blogspot.com.ar
+blogspot.com.au
+blogspot.com.br
+blogspot.com.es
+blogspot.cv
+blogspot.cz
+blogspot.de
+blogspot.dk
+blogspot.fi
+blogspot.fr
+blogspot.gr
+blogspot.hk
+blogspot.hu
+blogspot.ie
+blogspot.in
+blogspot.it
+blogspot.jp
+blogspot.kr
+blogspot.mr
+blogspot.mx
+blogspot.nl
+blogspot.no
+blogspot.pt
+blogspot.re
+blogspot.ro
+blogspot.se
+blogspot.sg
+blogspot.sk
+blogspot.td
+blogspot.tw
+codespot.com
+googleapis.com
+googlecode.com
+
+// Heroku : https://www.heroku.com/
+// Requested by Tom Maher  2013-05-02
+herokuapp.com
+herokussl.com
+
+// iki.fi
+// Requested by Hannu Aronsson  2009-11-05
+iki.fi
+
+// info.at : http://www.info.at/
+biz.at
+info.at
+
+// Michau Enterprises Limited : http://www.co.pl/
+co.pl
+
+// NYC.mn : http://www.information.nyc.mn
+// Requested by Matthew Brown  2013-03-11
+nyc.mn
+
+// Opera Software, A.S.A.
+// Requested by Yngve Pettersen  2009-11-26
+operaunite.com
+
+// Red Hat, Inc. OpenShift : https://openshift.redhat.com/
+// Requested by Tim Kramer  2012-10-24
+rhcloud.com
+
+// priv.at : http://www.nic.priv.at/
+// Requested by registry  2008-06-09
+priv.at
+
+// ZaNiC : http://www.za.net/
+// Requested by registry  2009-10-03
+za.net
+za.org
+
 // ===END PRIVATE DOMAINS===
diff --git a/src/test/java/org/archive/url/PublicSuffixesTest.java b/src/test/java/org/archive/net/PublicSuffixesTest.java
similarity index 94%
rename from src/test/java/org/archive/url/PublicSuffixesTest.java
rename to src/test/java/org/archive/net/PublicSuffixesTest.java
index e2bb288a..b88acb6d 100644
--- a/src/test/java/org/archive/url/PublicSuffixesTest.java
+++ b/src/test/java/org/archive/net/PublicSuffixesTest.java
@@ -17,7 +17,7 @@
  *  limitations under the License.
  */
 
-package org.archive.url;
+package org.archive.net;
 
 import java.io.PrintWriter;
 import java.io.StringWriter;
@@ -26,7 +26,7 @@
 
 import junit.framework.TestCase;
 
-import org.archive.url.PublicSuffixes.Node;
+import org.archive.net.PublicSuffixes.Node;
 
 /**
  * Test cases for PublicSuffixes utility. Confirm expected matches/nonmatches
@@ -132,9 +132,9 @@ public void testBasics() {
         matchPrefix("uk,co,virgin,", "uk,co,virgin,");
         matchPrefix("au,com,example,www,", "au,com,example,");
         matchPrefix("au,com,example,", "au,com,example,");
-        matchPrefix("jp,tokyo,public,assigned,www,",
-                "jp,tokyo,public,assigned,");
-        matchPrefix("jp,tokyo,public,assigned,", "jp,tokyo,public,assigned,");
+        matchPrefix("jp,yokohama,public,assigned,www,",
+                "jp,yokohama,public,assigned,");
+        matchPrefix("jp,yokohama,public,assigned,", "jp,yokohama,public,assigned,");
     }
 
     public void testDomainWithDash() {
@@ -161,8 +161,8 @@ public void testIPV6() {
     public void testExceptions() {
         matchPrefix("uk,bl,www,", "uk,bl,");
         matchPrefix("uk,bl,", "uk,bl,");
-        matchPrefix("jp,tokyo,metro,subdomain,", "jp,tokyo,metro,");
-        matchPrefix("jp,tokyo,metro,", "jp,tokyo,metro,");
+        matchPrefix("jp,tokyo,city,subdomain,", "jp,tokyo,city,");
+        matchPrefix("jp,tokyo,city,", "jp,tokyo,city,");
     }
 
     public void testFakeTLD() {