diff --git a/README.md b/README.md index ae865f7e..9bd2e12a 100644 --- a/README.md +++ b/README.md @@ -3,4 +3,6 @@ OpenWayback Web Commons [![Build Status](https://travis-ci.org/iipc/iipc-web-commons.png?branch=master)](https://travis-ci.org/iipc/iipc-web-commons/) -This repository contains common utility code for the OpenWayback project. +This repository contains common utility code for the [OpenWayback][1] project. + +[1]: https://github.com/iipc/openwayback diff --git a/pom-cdh4.xml b/pom-cdh4.xml new file mode 100644 index 00000000..de19d8d0 --- /dev/null +++ b/pom-cdh4.xml @@ -0,0 +1,229 @@ + + 4.0.0 + + org.archive + ia-web-commons + 1.0-SNAPSHOT + jar + + ia-web-commons + http://maven.apache.org + + + UTF-8 + ${maven.build.timestamp} + yyyyMMddhhmmss + + + + + junit + junit + 3.8.1 + test + + + + com.google.guava + guava + 14.0.1 + + + + org.json + json + 20090211 + + + org.htmlparser + htmlparser + 1.6 + + + + org.mozilla + juniversalchardet + 1.0.3 + + + + commons-httpclient + commons-httpclient + 3.1 + + + + org.apache.hadoop + hadoop-core + 2.0.0-mr1-cdh4.2.0 + + + commons-httpclient + commons-httpclient + + + javax.servlet + servlet-api + + + javax.servlet.jsp + jsp-api + + + org.mortbay.jetty + jetty + + + org.mortbay.jetty + jetty-util + + + tomcat + jasper-runtime + + + tomcat + jasper-compiler + + + + + org.apache.hadoop + hadoop-common + 2.0.0-cdh4.2.0 + + + org.apache.hadoop + hadoop-mapreduce-client-common + 2.0.0-cdh4.2.0 + + + org.apache.hadoop + hadoop-mapreduce-client-core + 2.0.0-cdh4.2.0 + + + + org.apache.pig + pig + 0.11.1 + provided + + + + commons-lang + commons-lang + 2.5 + + + + commons-io + commons-io + 2.4 + + + + org.gnu.inet + libidn + 1.15 + + + it.unimi.dsi + mg4j + 1.0.1 + compile + + + org.apache.httpcomponents + httpcore + 4.3 + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 2.3.2 + + 1.6 + 1.6 + + + + maven-assembly-plugin + 2.4 + + + jar-with-dependencies + + ia-web-commons + + + + package + + single + + + + 
+ + + + src/main/resources + true + + + + + + + internetarchive + Internet Archive Maven Repository + http://builds.archive.org:8080/maven2 + default + + + true + daily + warn + + + true + daily + warn + + + + + cloudera + Cloudera Hadoop + https://repository.cloudera.com/artifactory/cloudera-repos/ + default + + + true + daily + warn + + + true + daily + warn + + + + + + + + repository + + ${repository.url} + + + + diff --git a/pom.xml b/pom.xml index f285a382..a1d3de27 100644 --- a/pom.xml +++ b/pom.xml @@ -162,7 +162,23 @@ dsiutils 2.0.12 compile - + + + org.apache.httpcomponents + httpcore + 4.3 + + + joda-time + joda-time + 1.6 + + + fastutil + fastutil + 5.0.7 + compile + diff --git a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java new file mode 100644 index 00000000..0d564a6f --- /dev/null +++ b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java @@ -0,0 +1,150 @@ +package org.archive.extract; + +import java.io.IOException; +import java.io.PrintWriter; +import java.net.MalformedURLException; +import java.net.URISyntaxException; +import java.net.URL; +import java.util.List; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.archive.format.gzip.GZIPFormatException; +import org.archive.format.json.JSONUtils; +import org.archive.format.json.SimpleJSONPathSpec; +import org.archive.resource.MetaData; +import org.archive.resource.Resource; +import org.archive.util.IAUtils; +import org.archive.util.StreamCopy; +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; + +import com.google.common.io.CountingOutputStream; +import com.google.common.io.NullOutputStream; + +public class WARCMetadataRecordExtractorOutput implements ExtractorOutput { + private static final Logger LOG = + Logger.getLogger(WARCMetadataRecordExtractorOutput.class.getName()); + + 
private PrintWriter out; + SimpleJSONPathSpec formatSpec = new SimpleJSONPathSpec("Envelope.Format"); + SimpleJSONPathSpec warcURL = new SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.WARC-Target-URI"); + SimpleJSONPathSpec warcDate = new SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.WARC-Date"); + SimpleJSONPathSpec warcType = new SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.WARC-Type"); + SimpleJSONPathSpec warcMetadataRecord = new SimpleJSONPathSpec("Envelope.Payload-Metadata.WARC-Metadata-Metadata.Metadata-Records"); + + private String outputType = "outlinks"; + + public WARCMetadataRecordExtractorOutput(PrintWriter out, String outputType) { + this.out = out; + this.outputType = outputType; + } + + public WARCMetadataRecordExtractorOutput(PrintWriter out) { + this(out,"outlinks"); + } + + public void output(Resource resource) throws IOException { + NullOutputStream nullo = new NullOutputStream(); + CountingOutputStream co = new CountingOutputStream(nullo); + try { + StreamCopy.copy(resource.getInputStream(), co); + } catch(GZIPFormatException e) { + e.printStackTrace(); + return; + } + long bytes = co.getCount(); + if(bytes > 0) { + LOG.info(bytes + " unconsumed bytes in Resource InputStream."); + } + try { + MetaData m = resource.getMetaData().getTopMetaData(); + // URL DATE OURL MIME HTTP-CODE SHA1 META REDIR OFFSET LENGTH FILE + String format = getEnvelopeFormat(m); + String origUrl = "TBD"; + String date = "TBD"; + String canUrl = "TBD"; + + if(format.equals("WARC")) { + origUrl = getWARCURL(m); + date = getWARCDate(m); + String type = getWARCType(m); + if(type.equals("metadata")) { + String warcMetadataRecord = getWARCMetadataRecord(m); + + JSONArray array = new JSONArray(warcMetadataRecord); + String viaUrl = "-"; + String viaPath = "-"; + String sourceTag = "-"; + for(int i=0;i 2) + //'outlinks': 'origUrl date origOutlinkUrl linktype linktext' + out.format("%s\t%s\t%s\t%s\t\n",origUrl,date,linkParts[0],linkParts[2]); + } + } else 
if(outputType.equals("hopinfo")) { + String key = obj.get("Name").toString(); + String value = obj.get("Value").toString(); + if(key.equals("via")) { + viaUrl = value; + } else if (key.equals("hopsFromSeed")) { + viaPath = value; + } else if (key.equals("sourceTag")) { + sourceTag = value; + } + } + } + if(outputType.equals("hopinfo")) { + //'hopinfo': 'origCrawledUrl date origViaUrl hopPathFromVia sourceTag' + out.format("%s\t%s\t%s\t%s\t%s\n",origUrl,date,viaUrl,viaPath,sourceTag); + } + } + } + + } + catch (Exception e) { + throw new IOException(e); + } + out.flush(); + } + + private String getEnvelopeFormat(MetaData m) { + return unwrapFirst(formatSpec.extract(m),"-"); + } + private String getWARCURL(MetaData m) { + return unwrapFirst(warcURL.extract(m),"-"); + } + private String getWARCDate(MetaData m) { + return unwrapFirst(warcDate.extract(m),"-"); + } + private String getWARCType(MetaData m) { + return unwrapFirst(warcType.extract(m),"-"); + } + private String getWARCMetadataRecord(MetaData m) { + return unwrapFirst(warcMetadataRecord.extract(m),"-"); + } + + private String unwrapFirst(List> l, String defaultValue) { + if(l != null) { + if(l.size() > 0) { + if(l.get(0) != null) { + if(l.get(0).size() > 0) { + String v = l.get(0).get(0); + if(v != null) { + if(v.length() > 0) { + return v; + } + } + } + } + } + } + return defaultValue; + } +} diff --git a/src/main/java/org/archive/format/arc/ARCConstants.java b/src/main/java/org/archive/format/arc/ARCConstants.java index 6bfc5a99..a336ddeb 100755 --- a/src/main/java/org/archive/format/arc/ARCConstants.java +++ b/src/main/java/org/archive/format/arc/ARCConstants.java @@ -1,8 +1,20 @@ package org.archive.format.arc; import java.nio.charset.Charset; +import java.util.Arrays; +import java.util.List; +import java.util.zip.Deflater; +import java.util.zip.GZIPInputStream; -public interface ARCConstants { +import org.archive.format.ArchiveFileConstants; +import org.archive.util.zip.GzipHeader; + +/** + * Constants 
used by ARC files and in ARC file processing. + * + * @author stack + */ +public interface ARCConstants extends ArchiveFileConstants { public final static int MAX_META_LENGTH = 1024 * 32; public final static Charset ARC_META_CHARSET = Charset.forName("utf-8"); public final static int NEW_LINE_ORD = 10; @@ -25,4 +37,201 @@ public interface ARCConstants { public static final String FILEDESC_SCHEME = "filedesc:/"; public static final String DNS_MIME = "text/dns"; public static final String ALEXA_DAT_MIME = "alexa/dat"; + + /** + * Default maximum ARC file size. + */ + public static final long DEFAULT_MAX_ARC_FILE_SIZE = 100000000; + + /** + * Maximum length for a metadata line. + */ + public static final int MAX_METADATA_LINE_LENGTH = (4 * 1024); + + /** + * ARC file extention. + */ + public static final String ARC_FILE_EXTENSION = "arc"; + + /** + * Dot ARC file extension. + */ + public static final String DOT_ARC_FILE_EXTENSION = + "." + ARC_FILE_EXTENSION; + + public static final String DOT_COMPRESSED_FILE_EXTENSION = + ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION; + + /** + * Compressed arc file extension. + */ + public static final String COMPRESSED_ARC_FILE_EXTENSION = + ARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION; + + /** + * Compressed dot arc file extension. + */ + public static final String DOT_COMPRESSED_ARC_FILE_EXTENSION = + DOT_ARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION; + + /** + * Encoding to use getting bytes from strings. + * + * Specify an encoding rather than leave it to chance: i.e whatever the + * JVMs encoding. Use an encoding that gets the stream as bytes, not chars. + */ + public static final String DEFAULT_ENCODING = "ISO-8859-1"; + + /** + * ARC file line seperator character. + * + * This is what the alexa c-code looks for delimiting lines. + */ + public static final char LINE_SEPARATOR = '\n'; + + /** + * ARC header field seperator character. 
+ */ + public static final char HEADER_FIELD_SEPARATOR = ' '; + + /** + * ARC file *MAGIC NUMBER*. + * + * Every ARC file must begin w/ this. + */ + public static final String ARC_MAGIC_NUMBER = "filedesc://"; + + /** + * The FLG.FEXTRA field that is added to ARC files. (See RFC1952 to + * understand FLG.FEXTRA). + */ + public static final byte[] ARC_GZIP_EXTRA_FIELD = { 8, 0, 'L', 'X', 4, 0, + 0, 0, 0, 0 }; + + /** + * Key for the ARC Header IP field. + * + * Lowercased. + */ + public static final String IP_HEADER_FIELD_KEY = "ip-address"; + + /** + * Key for the ARC Header Result Code field. + * + * Lowercased. + */ + public static final String CODE_HEADER_FIELD_KEY = "result-code"; + + /** + * Key for the ARC Header Checksum field. + * + * Lowercased. + */ + public static final String CHECKSUM_HEADER_FIELD_KEY = "checksum"; + + /** + * Key for the ARC Header Location field. + * + * Lowercased. + */ + public static final String LOCATION_HEADER_FIELD_KEY = "location"; + + /** + * Key for the ARC Header Offset field. + * + * Lowercased. + */ + public static final String OFFSET_HEADER_FIELD_KEY = "offset"; + + /** + * Key for the ARC Header filename field. + * + * Lowercased. + */ + public static final String FILENAME_HEADER_FIELD_KEY = "filename"; + + /** + * Key for statuscode field. + */ + public static final String STATUSCODE_FIELD_KEY = "statuscode"; + + /** + * Key for offset field. + */ + public static final String OFFSET_FIELD_KEY = OFFSET_HEADER_FIELD_KEY; + + /** + * Key for filename field. + */ + public static final String FILENAME_FIELD_KEY = FILENAME_HEADER_FIELD_KEY; + + /** + * Key for checksum field. + */ + public static final String CHECKSUM_FIELD_KEY = CHECKSUM_HEADER_FIELD_KEY; + + /** + * Tokenized field prefix. + * + * Use this prefix for tokenized fields when naming fields in + * an index. + */ + public static final String TOKENIZED_PREFIX = "tokenized_"; + + /** + * Assumed maximum size of a record meta header line. 
+ * + * This is 100k, which seems massive, but it's the same as the LINE_LENGTH from + * alexa/include/a_arcio.h: + *
+     * #define LINE_LENGTH     (100*1024)
+     * 
+ */ + public static final int MAX_HEADER_LINE_LENGTH = 1024 * 100; + + /** + * Version 1 required metadata fields. + */ + public static List REQUIRED_VERSION_1_HEADER_FIELDS = Arrays + .asList(new String[] { URL_FIELD_KEY, IP_HEADER_FIELD_KEY, + DATE_FIELD_KEY, MIMETYPE_FIELD_KEY, + LENGTH_FIELD_KEY, VERSION_FIELD_KEY, + ABSOLUTE_OFFSET_KEY }); + + /** + * Minimum possible record length. + * + * This is a rough calc. When the header is data it will occupy less space. + */ + public static int MINIMUM_RECORD_LENGTH = 1 + "://".length() + 1 + + ARC_FILE_EXTENSION.length() + " ".length() + +1 + " ".length() + + 1 + " ".length() + 1 + "/".length() + 1 + " ".length() + 1; + + /** + * Start of a GZIP header that uses default deflater. + */ + public static final byte[] GZIP_HEADER_BEGIN = { + (byte) GZIPInputStream.GZIP_MAGIC, // Magic number (short) + (byte) (GZIPInputStream.GZIP_MAGIC >> 8), // Magic number (short) + Deflater.DEFLATED // Compression method (CM) + }; + + /** + * Length of minimual 'default GZIP header. + * + * See RFC1952 for explaination of value of 10. 
+ */ + public static final int DEFAULT_GZIP_HEADER_LENGTH = + GzipHeader.MINIMAL_GZIP_HEADER_LENGTH; + + /** + * set of known errors encountered reading ARCs + */ + public enum ArcRecordErrors { + HTTP_HEADER_TRUNCATED, + HTTP_STATUS_LINE_INVALID, + HTTP_STATUS_LINE_EXCEPTION, + } + + } diff --git a/src/main/java/org/archive/format/cdx/CDXFile.java b/src/main/java/org/archive/format/cdx/CDXFile.java index 0c3a777a..7dca0464 100644 --- a/src/main/java/org/archive/format/cdx/CDXFile.java +++ b/src/main/java/org/archive/format/cdx/CDXFile.java @@ -97,4 +97,10 @@ public static BufferedReader createStreamingLineReader(String uri, boolean gzipp BufferedReader reader = new BufferedReader(new InputStreamReader(input)); return reader; } + + @Override + public long getTotalLines() { + //TODO: Implement + return 0; + } } diff --git a/src/main/java/org/archive/format/cdx/CDXInputSource.java b/src/main/java/org/archive/format/cdx/CDXInputSource.java index 0a926ebc..34abde53 100644 --- a/src/main/java/org/archive/format/cdx/CDXInputSource.java +++ b/src/main/java/org/archive/format/cdx/CDXInputSource.java @@ -9,4 +9,6 @@ public interface CDXInputSource { public CloseableIterator getCDXIterator(String key, String prefix, boolean exact, ZipNumParams params) throws IOException; public CloseableIterator getCDXIterator(String key, String start, String startEndUrl, ZipNumParams params) throws IOException; + + public long getTotalLines(); } diff --git a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java index 66367077..cbf70c0e 100644 --- a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java +++ b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java @@ -8,6 +8,7 @@ import org.archive.format.gzip.zipnum.ZipNumIndex; import org.archive.format.gzip.zipnum.ZipNumParams; +import org.archive.util.iterator.CloseableCompositeIterator; import org.archive.util.iterator.CloseableIterator; import 
org.archive.util.iterator.SortedCompositeIterator; @@ -40,18 +41,22 @@ public void setCdxUris(List cdxUris) throws IOException { } - Comparator comparator = new Comparator() { + public final static Comparator defaultComparator = new Comparator() { public int compare(String s1, String s2) { return s1.compareTo(s2); } }; - Comparator reverseComparator = new Comparator() { + public final static Comparator defaultReverseComparator = new Comparator() { public int compare(String s1, String s2) { return -s1.compareTo(s2); } }; + protected Comparator comparator = defaultComparator; + protected Comparator reverseComparator = defaultReverseComparator; + + public CloseableIterator getCDXIterator(String key, String prefix, boolean exact, ZipNumParams params) throws IOException { SortedCompositeIterator scitr = new SortedCompositeIterator(cdx.size(), params.isReverse() ? reverseComparator : comparator); @@ -70,9 +75,112 @@ public CloseableIterator getCDXIterator(String key, String prefix, boole return scitr; } + // A special iterator which initializes on actual first use + protected static class LazyInitIterator implements CloseableIterator + { + CDXInputSource source; + CloseableIterator iter; + boolean failed = false; + + String key, start, end; + ZipNumParams params; + + protected LazyInitIterator(CDXInputSource source, String key, String start, String end, ZipNumParams params) + { + this.key = key; + this.start = start; + this.end = end; + + this.params = params; + + this.source = source; + } + + protected void initIter() + { + if (iter != null) { + return; + } + + try { + iter = source.getCDXIterator(key, start, end, params); + } catch (IOException io) { + LOGGER.warning(io.toString()); + iter = null; + } + } + + @Override + public boolean hasNext() { + initIter(); + + if (iter == null) { + return false; + } + + return iter.hasNext(); + } + + @Override + public String next() { + initIter(); + + if (iter == null) { + return null; + } + + return iter.next(); + } + + 
@Override + public void remove() { + + } + + @Override + public void close() throws IOException { + if (iter != null) { + iter.close(); + } + } + } + + public CloseableIterator createSeqIterator(String key, String start, String end, ZipNumParams params) + { + CloseableCompositeIterator composite = new CloseableCompositeIterator(); + CloseableIterator iter = null; + + for (int i = 0; i < cdx.size(); i++) { + try { + CDXInputSource cdxReader = cdx.get(i); + + if (i == (cdx.size() - 1)) { + iter = cdxReader.getCDXIterator(key, start, end, params); + } else { + iter = new LazyInitIterator(cdxReader, key, start, end, params); + } + + if (!params.isReverse()) { + composite.addLast(iter); + } else { + composite.addFirst(iter); + } + + } catch (IOException io) { + LOGGER.warning(io.toString()); + } + } + + return composite; + } + public CloseableIterator getCDXIterator(String key, String start, String end, ZipNumParams params) throws IOException { + if (params.isSequential()) { + return this.createSeqIterator(key, start, end, params); + } + SortedCompositeIterator scitr = new SortedCompositeIterator(cdx.size(), params.isReverse() ? 
reverseComparator : comparator); CloseableIterator iter = null; @@ -88,4 +196,15 @@ public CloseableIterator getCDXIterator(String key, String start, String return scitr; } + + @Override + public long getTotalLines() { + long sum = 0; + + for (CDXInputSource cdxReader : cdx) { + sum += cdxReader.getTotalLines(); + } + + return sum; + } } diff --git a/src/main/java/org/archive/format/cdx/StandardCDXLineFactory.java b/src/main/java/org/archive/format/cdx/StandardCDXLineFactory.java index d2c299e5..33da41f1 100644 --- a/src/main/java/org/archive/format/cdx/StandardCDXLineFactory.java +++ b/src/main/java/org/archive/format/cdx/StandardCDXLineFactory.java @@ -39,6 +39,17 @@ public FieldSplitFormat getParseFormat() return parseFormat; } + public CDXLine createStandardCDXLine(String input) + { + if (parseFormat == cdx11) { + return new CDX11Line(input, parseFormat); + } else if (parseFormat == cdx09) { + return new CDX09Line(input, parseFormat); + } else { + return new CDXLine(input, parseFormat); + } + } + public CDXLine createStandardCDXLine(String input, FieldSplitFormat exFormat) { if (parseFormat == cdx11) { diff --git a/src/main/java/org/archive/format/gzip/zipnum/SummaryBlockIterator.java b/src/main/java/org/archive/format/gzip/zipnum/SummaryBlockIterator.java index 0046625c..cbf947f6 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/SummaryBlockIterator.java +++ b/src/main/java/org/archive/format/gzip/zipnum/SummaryBlockIterator.java @@ -120,7 +120,7 @@ public CloseableIterator getNextInner() { SeekableLineReader currReader = zipnumIndex.doBlockLoad(currPartId, startOffset, totalLength); if ((currReader == null) && zipnumIndex.isRequired()) { - throw new RuntimeIOException(); + throw new RuntimeIOException("Failed to load shards for: " + currPartId); } if (currReader != null) { diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java index a1682818..2247eda4 
100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java @@ -156,8 +156,20 @@ public SeekableLineReader attemptLoadBlock(String location, long startOffset, in } catch (IOException io) { Level level = (isRequired ? Level.SEVERE : Level.WARNING); + String actualLocation = null; + + if (currReader instanceof HTTPSeekableLineReader) { + actualLocation = ((HTTPSeekableLineReader)currReader).getConnectedUrl(); + } + + if (actualLocation == null) { + actualLocation = location; + } + + String msg = io.toString() + " -- -r " + startOffset + "-" + (startOffset + totalLength - 1) + " " + actualLocation; + if (LOGGER.isLoggable(level)) { - LOGGER.log(level, io.toString() + " -- -r " + startOffset + "-" + (startOffset + totalLength - 1) + " " + location + " req? " + isRequired); + LOGGER.log(level, msg); } if (currReader != null) { @@ -170,7 +182,7 @@ public SeekableLineReader attemptLoadBlock(String location, long startOffset, in } if (isRequired) { - throw new RuntimeIOException(io); + throw new RuntimeIOException(msg); } } diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java index 5e91c507..bc773a58 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java @@ -75,7 +75,7 @@ public void run() { Thread.sleep(checkInterval); if (summary != null) { - summary.reloadFactory(); + summary.reloadFactory(); } } @@ -122,7 +122,15 @@ class BlockSize protected boolean newIsDisabled = false; protected boolean disabled = false; - final static int DEFAULT_LOC_CACHE_EXPIRE_MILLIS = 5000; + //final static int DEFAULT_LOC_CACHE_EXPIRE_MILLIS = 120000; + + protected ConcurrentHashMap locCacheMap; + + protected boolean cacheRemoteLoc = false; + + protected int locCacheExpireMillis = 120000; + + protected int 
locCacheMaxDuration = 1000; class LocCacheEntry { @@ -151,14 +159,7 @@ public boolean equals(Object obj) return false; } - } - - protected ConcurrentHashMap locCacheMap; - - protected boolean cacheRemoteLoc = false; - - protected int locCacheExpireMillis = DEFAULT_LOC_CACHE_EXPIRE_MILLIS; - + } @Override public void init() throws IOException @@ -189,6 +190,7 @@ public void init() throws IOException startDate = newStartDate; endDate = newEndDate; locRoot = newLocRoot; + this.cdxLinesTotalCount = computeTotalLines(); if (!disabled) { this.loadLastBlockSizes(blockSizesFile); @@ -240,6 +242,12 @@ protected void syncLoad(long newModTime) endDate = newEndDate; disabled = newIsDisabled; locRoot = newLocRoot; + + this.cdxLinesTotalCount = computeTotalLines(); + } + + if (this.locCacheMap != null) { + locCacheMap.clear(); } closeExistingFiles(filesToClose); @@ -287,6 +295,14 @@ public void setLocCacheExpireMillis(int locCacheExpireMillis) { this.locCacheExpireMillis = locCacheExpireMillis; } + public int getLocCacheMaxDuration() { + return locCacheMaxDuration; + } + + public void setLocCacheMaxDuration(int locCacheMaxDuration) { + this.locCacheMaxDuration = locCacheMaxDuration; + } + public boolean isCacheRemoteLoc() { return cacheRemoteLoc; } @@ -471,8 +487,9 @@ public long getLastBlockDiff(String startKey, int startPart, int endPart) { return diff; } + // Adjust from shorter blocks, if loaded - public long getTotalLines() + public long computeTotalLines() { long numLines = 0; @@ -525,25 +542,19 @@ SeekableLineReader doBlockLoad(String partId, long startOffset, int totalLength) } // Attempt cached load for http - if (cacheRemoteLoc && (locCacheMap != null)) { - // Non-http requests follow standard load path - if ((locations.length > 0) && GeneralURIStreamFactory.isHttp(locations[0])) { - reader = loadCachedBalancedReader(partId, locations, startOffset, totalLength); - } - } - - if (reader != null) { - return reader; - } - - for (String location : locations) { - reader = 
blockLoader.attemptLoadBlock(location, startOffset, totalLength, true, isRequired()); - if (reader != null) { - return reader; + if (cacheRemoteLoc && (locCacheMap != null) && (locations.length > 0) && GeneralURIStreamFactory.isHttp(locations[0])) { + reader = loadCachedBalancedReader(partId, locations, startOffset, totalLength); + } else { + // Standard block load path + for (String location : locations) { + reader = blockLoader.attemptLoadBlock(location, startOffset, totalLength, true, isRequired()); + if (reader != null) { + return reader; + } } } - return null; + return reader; } protected String locCacheGet(String key) @@ -574,12 +585,18 @@ SeekableLineReader loadCachedBalancedReader(String partId, String[] locations, l String cachedUrl = locCacheGet(partId); if (cachedUrl != null) { - reader = blockLoader.attemptLoadBlock(cachedUrl, offset, length, true, isRequired()); + long start = System.currentTimeMillis(); + + reader = blockLoader.attemptLoadBlock(cachedUrl, offset, length, true, false); + long duration = System.currentTimeMillis() - start; + + if ((reader == null) || (duration > locCacheMaxDuration)) { + locCacheMap.remove(partId, cachedUrl); + } + if (reader != null) { return reader; - } else { - locCacheMap.remove(partId, cachedUrl); } } @@ -592,13 +609,29 @@ SeekableLineReader loadCachedBalancedReader(String partId, String[] locations, l Collections.shuffle(indexs); } - for (int index : indexs) { - reader = blockLoader.attemptLoadBlock(locations[index], offset, length, true, isRequired()); + final int lastIndex = locations.length - 1; + + for (int i = 0; i < indexs.size(); i++) { + + int index = indexs.get(i); + + // Skip failed cached url + if ((cachedUrl != null) && locations[index].equals(cachedUrl)) { + continue; + } + + long start = System.currentTimeMillis(); + + boolean required = (isRequired() && (i == lastIndex)); + + reader = blockLoader.attemptLoadBlock(locations[index], offset, length, true, required); + + long duration = 
System.currentTimeMillis() - start; if (reader != null) { String connectedUrl = ((HTTPSeekableLineReader)reader).getConnectedUrl(); - if (connectedUrl != null) { + if ((duration < locCacheMaxDuration) && (connectedUrl != null)) { locCachePut(partId, connectedUrl); } diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumIndex.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumIndex.java index 7860be36..ad8c9297 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumIndex.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumIndex.java @@ -29,6 +29,9 @@ public class ZipNumIndex implements CDXInputSource { // Used only for reference / user info protected int cdxLinesPerBlock = 3000; + + protected long cdxLinesTotalCount = 0; + //protected HashMap locMap = null; protected final static boolean DEFAULT_USE_NIO = true; @@ -528,4 +531,9 @@ public boolean isRequired() { public void setRequired(boolean required) { this.required = required; } + + @Override + public long getTotalLines() { + return cdxLinesTotalCount; + } } diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumParams.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumParams.java index 15e22e1d..668743ae 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumParams.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumParams.java @@ -6,6 +6,7 @@ public class ZipNumParams protected int timestampDedupLength = 0; protected int maxBlocks = 0; private boolean reverse = false; + private boolean sequential = false; public ZipNumParams() { @@ -56,4 +57,12 @@ public boolean isReverse() { public void setReverse(boolean reverse) { this.reverse = reverse; } + + public boolean isSequential() { + return sequential; + } + + public void setSequential(boolean sequential) { + this.sequential = sequential; + } } \ No newline at end of file diff --git a/src/main/java/org/archive/httpclient/ConfigurableX509TrustManager.java 
b/src/main/java/org/archive/httpclient/ConfigurableX509TrustManager.java new file mode 100644 index 00000000..45a89ba6 --- /dev/null +++ b/src/main/java/org/archive/httpclient/ConfigurableX509TrustManager.java @@ -0,0 +1,188 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.httpclient; + +import java.security.KeyStore; +import java.security.KeyStoreException; +import java.security.NoSuchAlgorithmException; +import java.security.cert.CertificateException; +import java.security.cert.X509Certificate; +import java.util.logging.Logger; + +import javax.net.ssl.TrustManager; +import javax.net.ssl.TrustManagerFactory; +import javax.net.ssl.X509TrustManager; + +/** + * A configurable trust manager built on X509TrustManager. + * + * If set to 'open' trust, the default, will get us into sites for whom we do + * not have the CA or any of intermediary CAs that go to make up the cert chain + * of trust. Will also get us past selfsigned and expired certs. 'loose' + * trust will get us into sites w/ valid certs even if they are just + * selfsigned. 'normal' is any valid cert not including selfsigned. 'strict' + * means cert must be valid and the cert DN must match server name. + * + *

Based on pointers in + * SSL + * Guide, + * and readings done in JSSE + * Guide. + * + *

TODO: Move to an ssl subpackage when we have other classes other than + * just this one. + * + * @author stack + * @version $Id$ + */ +public class ConfigurableX509TrustManager implements X509TrustManager +{ + /** + * Logging instance. + */ + protected static Logger logger = Logger.getLogger( + "org.archive.httpclient.ConfigurableX509TrustManager"); + + public static enum TrustLevel { + /** + * Trust anything given us. + * + * Default setting. + * + *

See + * e502. Disabling Certificate Validation in an HTTPS Connection from + * the java almanac for how to trust all. + */ + OPEN, + + /** + * Trust any valid cert including self-signed certificates. + */ + LOOSE, + + /** + * Normal jsse behavior. + * + * Seemingly any certificate that supplies valid chain of trust. + */ + NORMAL, + + /** + * Strict trust. + * + * Ensure server has same name as cert DN. + */ + STRICT, + } + + /** + * Default setting for trust level. + */ + public final static TrustLevel DEFAULT = TrustLevel.OPEN; + + /** + * Trust level. + */ + private TrustLevel trustLevel = DEFAULT; + + + /** + * An instance of the SUNX509TrustManager that we adapt variously + * depending upon passed configuration. + * + * We have it do all the work we don't want to. + */ + private X509TrustManager standardTrustManager = null; + + + public ConfigurableX509TrustManager() + throws NoSuchAlgorithmException, KeyStoreException { + this(DEFAULT); + } + + /** + * Constructor. + * + * @param level Level of trust to effect. + * + * @throws NoSuchAlgorithmException + * @throws KeyStoreException + */ + public ConfigurableX509TrustManager(TrustLevel level) + throws NoSuchAlgorithmException, KeyStoreException { + super(); + TrustManagerFactory factory = TrustManagerFactory. + getInstance(TrustManagerFactory.getDefaultAlgorithm()); + + // Pass in a null (Trust) KeyStore. Null says use the 'default' + // 'trust' keystore (KeyStore class is used to hold keys and to hold + // 'trusts' (certs)). See 'X509TrustManager Interface' in this doc: + // http://java.sun.com + // /j2se/1.4.2/docs/guide/security/jsse/JSSERefGuide.html#Introduction + factory.init((KeyStore)null); + TrustManager[] trustmanagers = factory.getTrustManagers(); + if (trustmanagers.length == 0) { + throw new NoSuchAlgorithmException(TrustManagerFactory. 
+ getDefaultAlgorithm() + " trust manager not supported"); + } + this.standardTrustManager = (X509TrustManager)trustmanagers[0]; + + this.trustLevel = level; + } + + public void checkClientTrusted(X509Certificate[] certificates, String type) + throws CertificateException { + if (this.trustLevel.equals(TrustLevel.OPEN)) { + return; + } + + this.standardTrustManager.checkClientTrusted(certificates, type); + } + + public void checkServerTrusted(X509Certificate[] certificates, String type) + throws CertificateException { + if (this.trustLevel.equals(TrustLevel.OPEN)) { + return; + } + + try { + this.standardTrustManager.checkServerTrusted(certificates, type); + if (this.trustLevel.equals(TrustLevel.STRICT)) { + logger.severe(TrustLevel.STRICT + " not implemented."); + } + } catch (CertificateException e) { + if (this.trustLevel.equals(TrustLevel.LOOSE) && + certificates != null && certificates.length == 1) + { + // If only one cert and its valid and it caused a + // CertificateException, assume its selfsigned. + X509Certificate certificate = certificates[0]; + certificate.checkValidity(); + } else { + // If we got to here, then we're probably NORMAL. Rethrow. + throw e; + } + } + } + + public X509Certificate[] getAcceptedIssuers() { + return this.standardTrustManager.getAcceptedIssuers(); + } +} diff --git a/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java new file mode 100644 index 00000000..105c4f7e --- /dev/null +++ b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java @@ -0,0 +1,120 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.httpclient; + +import java.io.IOException; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.HttpConnection; +import org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.HttpState; +import org.apache.commons.httpclient.methods.GetMethod; +import org.archive.util.Recorder; + + +/** + * Override of GetMethod that marks the passed HttpRecorder w/ the transition + * from HTTP head to body and that forces a close on the http connection. + * + * The actions done in this subclass used to be done by copying + * org.apache.commons.HttpMethodBase, overlaying our version in place of the + * one that came w/ httpclient. Here is the patch of the difference between + * shipped httpclient code and our mods: + *

+ *    -- -1338,6 +1346,12 --
+ *
+ *        public void releaseConnection() {
+ *
+ *   +        // HERITRIX always wants the streams closed.
+ *   +        if (responseConnection != null)
+ *   +        {
+ *   +            responseConnection.close();
+ *   +        }
+ *   +
+ *            if (responseStream != null) {
+ *                try {
+ *                    // FYI - this may indirectly invoke responseBodyConsumed.
+ *   -- -1959,6 +1973,11 --
+ *                        this.statusLine = null;
+ *                    }
+ *                }
+ *   +            // HERITRIX mark transition from header to content.
+ *   +            if (this.httpRecorder != null)
+ *   +            {
+ *   +                this.httpRecorder.markContentBegin();
+ *   +            }
+ *                readResponseBody(state, conn);
+ *                processResponseBody(state, conn);
+ *            } catch (IOException e) {
+ * 
+ * + *

We're not supposed to have access to the underlying connection object; + * am only violating contract because see cases where httpclient is skipping + * out w/o cleaning up after itself. + * + * @author stack + * @version $Revision$, $Date$ + */ +public class HttpRecorderGetMethod extends GetMethod { + + protected static Logger logger = + Logger.getLogger(HttpRecorderGetMethod.class.getName()); + + /** + * Instance of http recorder method. + */ + protected HttpRecorderMethod httpRecorderMethod = null; + + + public HttpRecorderGetMethod(String uri, Recorder recorder) { + super(uri); + this.httpRecorderMethod = new HttpRecorderMethod(recorder); + } + + protected void readResponseBody(HttpState state, HttpConnection connection) + throws IOException, HttpException { + // We're about to read the body. Mark transition in http recorder. + this.httpRecorderMethod.markContentBegin(connection); + super.readResponseBody(state, connection); + } + + protected boolean shouldCloseConnection(HttpConnection conn) { + // Always close connection after each request. As best I can tell, this + // is superfluous -- we've set our client to be HTTP/1.0. Doing this + // out of paranoia. + return true; + } + + public int execute(HttpState state, HttpConnection conn) + throws HttpException, IOException { + // Save off the connection so we can close it on our way out in case + // httpclient fails to (We're not supposed to have access to the + // underlying connection object; am only violating contract because + // see cases where httpclient is skipping out w/o cleaning up + // after itself). 
+ this.httpRecorderMethod.setConnection(conn); + return super.execute(state, conn); + } + + protected void addProxyConnectionHeader(HttpState state, HttpConnection conn) + throws IOException, HttpException { + super.addProxyConnectionHeader(state, conn); + this.httpRecorderMethod.handleAddProxyConnectionHeader(this); + } +} diff --git a/src/main/java/org/archive/httpclient/HttpRecorderMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java new file mode 100644 index 00000000..932e7e98 --- /dev/null +++ b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java @@ -0,0 +1,107 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.httpclient; + +import java.util.logging.Logger; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpConnection; +import org.apache.commons.httpclient.HttpMethod; +import org.archive.util.Recorder; + + +/** + * This class encapsulates the specializations supplied by the + * overrides {@link HttpRecorderGetMethod} and {@link HttpRecorderPostMethod}. + * + * It keeps instance of HttpRecorder and HttpConnection. 
+ * + * @author stack + * @version $Revision$, $Date$ + */ +public class HttpRecorderMethod { + protected static Logger logger = + Logger.getLogger(HttpRecorderMethod.class.getName()); + + /** + * Instance of http recorder we're using recording this http get. + */ + private Recorder httpRecorder = null; + + /** + * Save around so can force close. + * + * See [ 922080 ] IllegalArgumentException (size is wrong). + * https://sourceforge.net/tracker/?func=detail&aid=922080&group_id=73833&atid=539099 + */ + private HttpConnection connection = null; + + + public HttpRecorderMethod(Recorder recorder) { + this.httpRecorder = recorder; + } + + public void markContentBegin(HttpConnection c) { + if (c != this.connection) { + // We're checking that we're not being asked to work on + // a connection that is other than the one we started + // this method#execute with. + throw new IllegalArgumentException("Connections differ: " + + this.connection + " " + c + " " + + Thread.currentThread().getName()); + } + this.httpRecorder.markContentBegin(); + } + + /** + * @return Returns the connection. + */ + public HttpConnection getConnection() { + return this.connection; + } + + /** + * @param connection The connection to set. + */ + public void setConnection(HttpConnection connection) { + this.connection = connection; + } + /** + * @return Returns the httpRecorder. + */ + public Recorder getHttpRecorder() { + return httpRecorder; + } + + /** + * If a 'Proxy-Connection' header has been added to the request, + * it'll be of a 'keep-alive' type. Until we support 'keep-alives', + * override the Proxy-Connection setting and instead pass a 'close' + * (Otherwise every request has to timeout before we notice + * end-of-document). + * @param method Method to find proxy-connection header in. 
+ */ + public void handleAddProxyConnectionHeader(HttpMethod method) { + Header h = method.getRequestHeader("Proxy-Connection"); + if (h != null) { + h.setValue("close"); + method.setRequestHeader(h); + } + } +} diff --git a/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java new file mode 100644 index 00000000..20f1bfd1 --- /dev/null +++ b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java @@ -0,0 +1,82 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.httpclient; + +import java.io.IOException; + +import org.apache.commons.httpclient.HttpConnection; +import org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.HttpState; +import org.apache.commons.httpclient.methods.PostMethod; +import org.archive.util.Recorder; + + +/** + * Override of PostMethod that marks the passed HttpRecorder w/ the transition + * from HTTP head to body and that forces a close on the responseConnection. + * + * This is a copy of {@link HttpRecorderGetMethod}. Only difference is the + * parent subclass. 
+ * + * @author stack + * @version $Date$ $Revision$ + */ +public class HttpRecorderPostMethod extends PostMethod { + /** + * Instance of http recorder method. + */ + protected HttpRecorderMethod httpRecorderMethod = null; + + + public HttpRecorderPostMethod(String uri, Recorder recorder) { + super(uri); + this.httpRecorderMethod = new HttpRecorderMethod(recorder); + } + + protected void readResponseBody(HttpState state, HttpConnection connection) + throws IOException, HttpException { + // We're about to read the body. Mark transition in http recorder. + this.httpRecorderMethod.markContentBegin(connection); + super.readResponseBody(state, connection); + } + + protected boolean shouldCloseConnection(HttpConnection conn) { + // Always close connection after each request. As best I can tell, this + // is superfluous -- we've set our client to be HTTP/1.0. Doing this + // out of paranoia. + return true; + } + + public int execute(HttpState state, HttpConnection conn) + throws HttpException, IOException { + // Save off the connection so we can close it on our way out in case + // httpclient fails to (We're not supposed to have access to the + // underlying connection object; am only violating contract because + // see cases where httpclient is skipping out w/o cleaning up + // after itself). 
+ this.httpRecorderMethod.setConnection(conn); + return super.execute(state, conn); + } + + protected void addProxyConnectionHeader(HttpState state, HttpConnection conn) + throws IOException, HttpException { + super.addProxyConnectionHeader(state, conn); + this.httpRecorderMethod.handleAddProxyConnectionHeader(this); + } +} diff --git a/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java new file mode 100644 index 00000000..4ba6a837 --- /dev/null +++ b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java @@ -0,0 +1,70 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.httpclient; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.commons.httpclient.HostConfiguration; +import org.apache.commons.httpclient.HttpConnection; +import org.apache.commons.httpclient.SimpleHttpConnectionManager; + +/** + * An HttpClient-compatible HttpConnection "manager" that actually + * just gives out a new connection each time -- skipping the overhead + * of connection management, since we already throttle our crawler + * with external mechanisms. 
+ * + * @author gojomo + */ +public class SingleHttpConnectionManager extends SimpleHttpConnectionManager { + + public SingleHttpConnectionManager() { + super(); + } + + public HttpConnection getConnectionWithTimeout( + HostConfiguration hostConfiguration, long timeout) { + + HttpConnection conn = new HttpConnection(hostConfiguration); + conn.setHttpConnectionManager(this); + conn.getParams().setDefaults(this.getParams()); + return conn; + } + + public void releaseConnection(HttpConnection conn) { + // ensure connection is closed + conn.close(); + finishLast(conn); + } + + protected static void finishLast(HttpConnection conn) { + // copied from superclass because it wasn't made available to subclasses + InputStream lastResponse = conn.getLastResponseInputStream(); + if (lastResponse != null) { + conn.setLastResponseInputStream(null); + try { + lastResponse.close(); + } catch (IOException ioe) { + //FIXME: badness - close to force reconnect. + conn.close(); + } + } + } +} diff --git a/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java new file mode 100644 index 00000000..91e850ea --- /dev/null +++ b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java @@ -0,0 +1,291 @@ +/** + * ==================================================================== + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ==================================================================== + * + */ +package org.archive.httpclient; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.HostConfiguration; +import org.apache.commons.httpclient.HttpConnection; +import org.apache.commons.httpclient.HttpConnectionManager; +import org.apache.commons.httpclient.params.HttpConnectionManagerParams; + +/** + * A simple, but thread-safe HttpClient {@link HttpConnectionManager}. + * Based on {@link org.apache.commons.httpclient.SimpleHttpConnectionManager}. + * + * Java >= 1.4 is recommended. + * + * @author Christian Kohlschuetter + */ +public final class ThreadLocalHttpConnectionManager implements + HttpConnectionManager { + + private static final CloserThread closer = new CloserThread(); + private static final Logger logger = Logger + .getLogger(ThreadLocalHttpConnectionManager.class.getName()); + + private final ThreadLocal tl = new ThreadLocal() { + protected synchronized ConnectionInfo initialValue() { + return new ConnectionInfo(); + } + }; + + private ConnectionInfo getConnectionInfo() { + return (ConnectionInfo) tl.get(); + } + + private static final class ConnectionInfo { + /** The http connection */ + private HttpConnection conn = null; + + /** + * The time the connection was made idle. + */ + private long idleStartTime = Long.MAX_VALUE; + } + + public ThreadLocalHttpConnectionManager() { + } + + /** + * Since the same connection is about to be reused, make sure the + * previous request was completely processed, and if not + * consume it now. 
+ * @param conn The connection + * @return true, if the connection is reusable + */ + private static boolean finishLastResponse(final HttpConnection conn) { + InputStream lastResponse = conn.getLastResponseInputStream(); + if(lastResponse != null) { + conn.setLastResponseInputStream(null); + try { + lastResponse.close(); + return true; + } catch (IOException ioe) { + // force reconnect. + return false; + } + } else { + return false; + } + } + + /** + * Collection of parameters associated with this connection manager. + */ + private HttpConnectionManagerParams params = new HttpConnectionManagerParams(); + + /** + * @see HttpConnectionManager#getConnection(HostConfiguration) + */ + public HttpConnection getConnection( + final HostConfiguration hostConfiguration) { + return getConnection(hostConfiguration, 0); + } + + /** + * Gets the staleCheckingEnabled value to be set on HttpConnections that are created. + * + * @return true if stale checking will be enabled on HttpConections + * + * @see HttpConnection#isStaleCheckingEnabled() + * + * @deprecated Use {@link HttpConnectionManagerParams#isStaleCheckingEnabled()}, + * {@link HttpConnectionManager#getParams()}. + */ + public boolean isConnectionStaleCheckingEnabled() { + return this.params.isStaleCheckingEnabled(); + } + + /** + * Sets the staleCheckingEnabled value to be set on HttpConnections that are created. + * + * @param connectionStaleCheckingEnabled true if stale checking will be enabled + * on HttpConections + * + * @see HttpConnection#setStaleCheckingEnabled(boolean) + * + * @deprecated Use {@link HttpConnectionManagerParams#setStaleCheckingEnabled(boolean)}, + * {@link HttpConnectionManager#getParams()}. 
+ */ + public void setConnectionStaleCheckingEnabled( + final boolean connectionStaleCheckingEnabled) { + this.params.setStaleCheckingEnabled(connectionStaleCheckingEnabled); + } + + /** + * @see HttpConnectionManager#getConnectionWithTimeout(HostConfiguration, long) + * + * @since 3.0 + */ + public HttpConnection getConnectionWithTimeout( + final HostConfiguration hostConfiguration, final long timeout) { + + final ConnectionInfo ci = getConnectionInfo(); + HttpConnection httpConnection = ci.conn; + + // make sure the host and proxy are correct for this connection + // close it and set the values if they are not + if(httpConnection == null || !finishLastResponse(httpConnection) + || !hostConfiguration.hostEquals(httpConnection) + || !hostConfiguration.proxyEquals(httpConnection)) { + + if(httpConnection != null && httpConnection.isOpen()) { + closer.closeConnection(httpConnection); + } + + httpConnection = new HttpConnection(hostConfiguration); + httpConnection.setHttpConnectionManager(this); + httpConnection.getParams().setDefaults(this.params); + ci.conn = httpConnection; + + httpConnection.setHost(hostConfiguration.getHost()); + httpConnection.setPort(hostConfiguration.getPort()); + httpConnection.setProtocol(hostConfiguration.getProtocol()); + httpConnection.setLocalAddress(hostConfiguration.getLocalAddress()); + + httpConnection.setProxyHost(hostConfiguration.getProxyHost()); + httpConnection.setProxyPort(hostConfiguration.getProxyPort()); + } + + // remove the connection from the timeout handler + ci.idleStartTime = Long.MAX_VALUE; + + return httpConnection; + } + + /** + * @see HttpConnectionManager#getConnection(HostConfiguration, long) + * + * @deprecated Use #getConnectionWithTimeout(HostConfiguration, long) + */ + public HttpConnection getConnection( + final HostConfiguration hostConfiguration, final long timeout) { + return getConnectionWithTimeout(hostConfiguration, timeout); + } + + /** + * @see 
HttpConnectionManager#releaseConnection(org.apache.commons.httpclient.HttpConnection) + */ + public void releaseConnection(final HttpConnection conn) { + final ConnectionInfo ci = getConnectionInfo(); + HttpConnection httpConnection = ci.conn; + + if(conn != httpConnection) { + throw new IllegalStateException( + "Unexpected release of an unknown connection."); + } + + finishLastResponse(httpConnection); + + // track the time the connection was made idle + ci.idleStartTime = System.currentTimeMillis(); + } + + /** + * Returns {@link HttpConnectionManagerParams parameters} associated + * with this connection manager. + * + * @since 2.1 + * + * @see HttpConnectionManagerParams + */ + public HttpConnectionManagerParams getParams() { + return this.params; + } + + /** + * Assigns {@link HttpConnectionManagerParams parameters} for this + * connection manager. + * + * @since 2.1 + * + * @see HttpConnectionManagerParams + */ + public void setParams(final HttpConnectionManagerParams p) { + if(p == null) { + throw new IllegalArgumentException("Parameters may not be null"); + } + this.params = p; + } + + /** + * @since 3.0 + */ + public void closeIdleConnections(final long idleTimeout) { + long maxIdleTime = System.currentTimeMillis() - idleTimeout; + + final ConnectionInfo ci = getConnectionInfo(); + + if(ci.idleStartTime <= maxIdleTime) { + ci.conn.close(); + } + } + + private static final class CloserThread extends Thread { + private List connections + = new ArrayList(); + + private static final int SLEEP_INTERVAL = 5000; + + public CloserThread() { + super("HttpConnection closer"); + // Make this a daemon thread so it can't be responsible for the JVM + // not shutting down. 
+ setDaemon(true); + start(); + } + + public void closeConnection(final HttpConnection conn) { + synchronized (connections) { + connections.add(conn); + } + } + + public void run() { + try { + while (!Thread.interrupted()) { + Thread.sleep(SLEEP_INTERVAL); + + List s; + synchronized (connections) { + s = connections; + connections = new ArrayList(); + } + logger.log(Level.INFO, "Closing " + s.size() + + " HttpConnections"); + for(final Iterator it = s.iterator(); + it.hasNext();) { + HttpConnection conn = it.next(); + conn.close(); + conn.setHttpConnectionManager(null); + it.remove(); + } + } + } catch (InterruptedException e) { + return; + } + } + } +} diff --git a/src/main/java/org/archive/httpclient/package.html b/src/main/java/org/archive/httpclient/package.html new file mode 100644 index 00000000..87ae77ed --- /dev/null +++ b/src/main/java/org/archive/httpclient/package.html @@ -0,0 +1,24 @@ + + + +org.archive.httpclient package + +Provides specializations on + apache jakarta + commons httpclient. + +

HttpRecorderGetMethod

+

Class that marks the passed HttpRecorder w/ boundary between + HTTP header and content. Also forces a close on the response on + call to releaseConnection.

+ +

ConfigurableTrustManagerProtocolSocketFactory

+

A protocol socket factory that allows setting of trust level on + construction.

+ +

References

+

JavaTM Secure Socket Extension (JSSE): Reference Guide

+ + + diff --git a/src/main/java/org/archive/io/ArchiveFileConstants.java b/src/main/java/org/archive/io/ArchiveFileConstants.java new file mode 100644 index 00000000..b1a39194 --- /dev/null +++ b/src/main/java/org/archive/io/ArchiveFileConstants.java @@ -0,0 +1,24 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +@Deprecated +public interface ArchiveFileConstants extends org.archive.format.ArchiveFileConstants { +} diff --git a/src/main/java/org/archive/io/ArchiveReader.java b/src/main/java/org/archive/io/ArchiveReader.java new file mode 100644 index 00000000..66056d33 --- /dev/null +++ b/src/main/java/org/archive/io/ArchiveReader.java @@ -0,0 +1,761 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + + +import java.io.BufferedInputStream; +import java.io.BufferedWriter; +import java.io.Closeable; +import java.io.EOFException; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.archive.util.MimetypeUtils; +import org.archive.util.zip.GZIPMembersInputStream; + +import com.google.common.io.CountingInputStream; + + +/** + * Reader for an Archive file of Archive {@link ArchiveRecord}s. + * @author stack + * @version $Date$ $Version$ + */ +public abstract class ArchiveReader implements ArchiveFileConstants, Iterable, Closeable { + /** + * Is this Archive file compressed? + */ + private boolean compressed = false; + + /** + * Should we digest as we read? + */ + private boolean digest = true; + + /** + * Should the parse be strict? + */ + private boolean strict = false; + + /** + * Archive file input stream. + * + * Keep it around so we can close it when done. + * + *

Set in constructor. Should support at least 1 byte mark/reset. + * Make it protected so subclasses have access. + */ + protected InputStream in = null; + + /** + * Maximum amount of recoverable exceptions in a row. + * If more than this amount in a row, we'll let out the exception rather + * than go back in for yet another retry. + */ + public static final int MAX_ALLOWED_RECOVERABLES = 10; + + + /** + * The Record currently being read. + * + * Keep this ongoing reference so we'll close the record even if the caller + * doesn't. + */ + private ArchiveRecord currentRecord = null; + + /** + * Descriptive string for the Archive file we're going against: + * full path, url, etc. -- depends on context in which file was made. + */ + private String identifier = null; + + /** + * Archive file version. + */ + private String version = null; + + + protected ArchiveReader() { + super(); + } + + /** + * Convenience method used by subclass constructors. + * @param i Identifier for Archive file this reader goes against. + */ + protected void initialize(final String i) { + setReaderIdentifier(i); + } + + /** + * Convenience method for constructors. + * + * @param f File to read. + * @param offset Offset at which to start reading. + * @return InputStream to read from. + * @throws IOException If failed open or fail to get a memory + * mapped byte buffer on file. + */ + protected InputStream getInputStream(final File f, final long offset) + throws IOException { + FileInputStream fin = new FileInputStream(f); + return new BufferedInputStream(fin); + } + + public boolean isCompressed() { + return this.compressed; + } + + /** + * Get record at passed offset. + * + * @param offset Byte index into file at which a record starts. + * @return An Archive Record reference. 
+     * @throws IOException
+     */
+    public ArchiveRecord get(long offset) throws IOException {
+        // Finish off any record that is still open before measuring where
+        // the stream currently is.
+        cleanupCurrentRecord();
+        long posn = positionForRecord(in);
+        if(offset>=posn) {
+            // NOTE(review): InputStream.skip may skip fewer bytes than asked
+            // and its return value is ignored here -- confirm the wrapped
+            // streams always honour the full skip.
+            in.skip(offset-posn);
+        } else {
+            // The stream is only ever advanced; going backwards is refused.
+            throw new UnsupportedOperationException("no reverse seeking: at "+posn+" requested "+offset);
+        }
+        return createArchiveRecord(this.in, offset);
+    }
+
+    /**
+     * @return Return Archive Record created against current offset.
+     * @throws IOException
+     */
+    public ArchiveRecord get() throws IOException {
+        return createArchiveRecord(this.in, positionForRecord(in));
+    }
+
+    /**
+     * Close the underlying stream, if one is still held, and forget it.
+     * Calling close again afterwards is a no-op.
+     * @throws IOException
+     */
+    public void close() throws IOException {
+        if (this.in != null) {
+            this.in.close();
+            this.in = null;
+        }
+    }
+
+    /**
+     * Cleanout the current record if there is one.
+     * The record is closed, {@link #gotoEOR(ArchiveRecord)} is run on it and
+     * only then is the reference dropped; if either step throws, the record
+     * stays set as the current record.
+     * @throws IOException
+     */
+    protected void cleanupCurrentRecord() throws IOException {
+        if (this.currentRecord != null) {
+            this.currentRecord.close();
+            gotoEOR(this.currentRecord);
+            this.currentRecord = null;
+        }
+    }
+
+    /**
+     * Return an Archive Record homed on <code>offset</code> into
+     * <code>is</code>.
+     * @param is Stream to read Record from.
+     * @param offset Offset to find Record at.
+     * @return ArchiveRecord instance.
+     * @throws IOException
+     */
+    protected abstract ArchiveRecord createArchiveRecord(InputStream is,
+        long offset)
+    throws IOException;
+
+    /**
+     * Skip over any trailing new lines at end of the record so we're lined up
+     * ready to read the next.
+     * @param record
+     * @throws IOException
+     */
+    protected abstract void gotoEOR(ArchiveRecord record) throws IOException;
+
+    public abstract String getFileExtension();
+    public abstract String getDotFileExtension();
+
+    /**
+     * @return Version of this Archive file.
+     */
+    public String getVersion() {
+        return this.version;
+    }
+
+    /**
+     * Validate the Archive file.
+     *
+     * This method iterates over the file throwing exception if it fails
+     * to successfully parse any record.
+     *
+     * <p>Assumes the stream is at the start of the file.
+     * @return List of all read Archive Headers.
+     *
+     * @throws IOException
+     */
+    public List<ArchiveRecordHeader> validate() throws IOException {
+        // -1 means "number of records unknown": no count check is made.
+        return validate(-1);
+    }
+
+    /**
+     * Validate the Archive file.
+     *
+     * This method iterates over the file throwing exception if it fails
+     * to successfully parse.
+     *
+     * <p>We start validation from wherever we are in the stream.
+     *
+     * <p>Side effect: strict mode is switched on and is not switched back
+     * off when validation finishes.
+     *
+     * @param numRecords Number of records expected.  Pass -1 if number is
+     * unknown.
+     *
+     * @return List of all read metadatas. As we validate records, we add
+     * a reference to the read metadata.
+     *
+     * @throws IOException If a record has no content and no mimetype, or if
+     * the number of records read differs from <code>numRecords</code>.
+     */
+    public List<ArchiveRecordHeader> validate(int numRecords)
+    throws IOException {
+        // Explicit type arguments (no diamond) keep this compilable at the
+        // 1.6 source level the build is configured for.
+        List<ArchiveRecordHeader> hdrList =
+            new ArrayList<ArchiveRecordHeader>();
+        int recordCount = 0;
+        setStrict(true);
+        for (Iterator<ArchiveRecord> i = iterator(); i.hasNext();) {
+            recordCount++;
+            ArchiveRecord r = i.next();
+            if (r.getHeader().getLength() <= 0
+                && r.getHeader().getMimetype().
+                    equals(MimetypeUtils.NO_TYPE_MIMETYPE)) {
+                throw new IOException("record content is empty.");
+            }
+            // Closing reads the record through to its end.
+            r.close();
+            hdrList.add(r.getHeader());
+        }
+
+        if (numRecords != -1) {
+            if (recordCount != numRecords) {
+                throw new IOException("Count of records, "
+                    + Integer.toString(recordCount)
+                    + " is not equal to expected "
+                    + Integer.toString(numRecords));
+            }
+        }
+
+        return hdrList;
+    }
+
+    /**
+     * Test Archive file is valid.
+     * Assumes the stream is at the start of the file.  Be aware that this
+     * method makes a pass over the whole file.
+     * @return True if file can be successfully parsed.
+     */
+    public boolean isValid() {
+        boolean valid = false;
+        try {
+            validate();
+            valid = true;
+        } catch(Exception e) {
+            // File is not valid if exception thrown parsing.  Deliberately
+            // broad: the iterator reports trouble as RuntimeException.
+            valid = false;
+        }
+
+        return valid;
+    }
+
+    /**
+     * @return Returns the strict.
+     */
+    public boolean isStrict() {
+        return this.strict;
+    }
+
+    /**
+     * @param s The strict to set.
+     */
+    public void setStrict(boolean s) {
+        this.strict = s;
+    }
+
+    /**
+     * @param d True if we're to digest.
+     */
+    public void setDigest(boolean d) {
+        this.digest = d;
+    }
+
+    /**
+     * @return True if we're digesting as we read.
+     */
+    public boolean isDigest() {
+        return this.digest;
+    }
+
+    /**
+     * @return Logger named after the runtime class of this reader.
+     */
+    protected Logger getLogger() {
+        return Logger.getLogger(this.getClass().getName());
+    }
+
+    /**
+     * Returns an ArchiveRecord iterator.
+ * Of note, on IOException, especially if ZipException reading compressed + * ARCs, rather than fail the iteration, try moving to the next record. + * If {@link ArchiveReader#strict} is not set, this will usually succeed. + * @return An iterator over ARC records. + */ + public Iterator iterator() { + // Eat up any record outstanding. + try { + cleanupCurrentRecord(); + } catch (IOException e) { + throw new RuntimeException(e); + } + + return new ArchiveRecordIterator(); + } + + protected void setCompressed(boolean compressed) { + this.compressed = compressed; + } + + /** + * @return The current ARC record or null if none. + * After construction has the arcfile header record. + * @see #get() + */ + protected ArchiveRecord getCurrentRecord() { + return this.currentRecord; + } + + protected ArchiveRecord currentRecord(final ArchiveRecord r) { + this.currentRecord = r; + return r; + } + + protected InputStream getIn() { + return in; + } + + protected void setIn(InputStream in) { + this.in = in; + } + + protected void setVersion(String version) { + this.version = version; + } + + public String getReaderIdentifier() { + return this.identifier; + } + + protected void setReaderIdentifier(final String i) { + this.identifier = i; + } + + /** + * Log on stderr. + * Logging should go via the logging system. This method + * bypasses the logging system going direct to stderr. + * Should not generally be used. Its used for rare messages + * that come of cmdline usage of ARCReader ERRORs and WARNINGs. + * Override if using ARCReader in a context where no stderr or + * where you'd like to redirect stderr to other than System.err. + * @param level Level to log message at. + * @param message Message to log. + */ + public void logStdErr(Level level, String message) { + System.err.println(level.toString() + " " + message); + } + +// /** +// * Add buffering to RandomAccessInputStream. 
+// */ +// protected class RandomAccessBufferedInputStream +// extends BufferedInputStream implements RepositionableStream { +// +// public RandomAccessBufferedInputStream(RandomAccessInputStream is) +// throws IOException { +// super(is); +// } +// +// public RandomAccessBufferedInputStream(RandomAccessInputStream is, int size) +// throws IOException { +// super(is, size); +// } +// +// public long position() throws IOException { +// // Current position is the underlying files position +// // minus the amount thats in the buffer yet to be read. +// return ((RandomAccessInputStream)this.in).position() - +// (this.count - this.pos); +// } +// +// public void position(long position) throws IOException { +// // Force refill of buffer whenever there's been a seek. +// this.pos = 0; +// this.count = 0; +// ((RandomAccessInputStream)this.in).position(position); +// } +// +// public int available() throws IOException { +// // Avoid overflow on large datastreams +// long amount = (long)in.available() + (long)(count - pos); +// return (amount >= Integer.MAX_VALUE)? Integer.MAX_VALUE: (int)amount; +// } +// } + + /** + * Inner ArchiveRecord Iterator class. + * Throws RuntimeExceptions in {@link #hasNext()} and {@link #next()} if + * trouble pulling record from underlying stream. + * @author stack + */ + protected class ArchiveRecordIterator implements Iterator { + private final Logger logger = + Logger.getLogger(this.getClass().getName()); + /** + * @return True if we have more records to read. + * @exception RuntimeException Can throw an IOException wrapped in a + * RuntimeException if a problem reading underlying stream (Corrupted + * gzip, etc.). + */ + public boolean hasNext() { + // Call close on any extant record. This will scoot us past + // any content not yet read. 
+ try { + cleanupCurrentRecord(); + } catch (IOException e) { + if (isStrict()) { + throw new RuntimeException(e); + } + if (e instanceof EOFException) { + logger.warning("Premature EOF cleaning up " + + currentRecord.getHeader().toString() + ": " + + e.getMessage()); + return false; + } + // If not strict, try going again. We might be able to skip + // over the bad record. + logger.log(Level.WARNING,"Trying skip of failed record cleanup of " + + currentRecord.getHeader().toString() + ": " + + e.getMessage(), e); + } + return innerHasNext(); + } + + protected boolean innerHasNext(){ + try { + getIn().mark(1); + int c = getIn().read(); + getIn().reset(); + return c > -1; + } catch (IOException e) { + logger.log(Level.WARNING,"problem probing for more content",e); + return false; + } + } + + /** + * Tries to move to next record if we get + * {@link RecoverableIOException}. If not strict + * tries to move to next record if we get an + * {@link IOException}. + * @return Next object. + * @exception RuntimeException Throws a runtime exception, + * usually a wrapping of an IOException, if trouble getting + * a record (Throws exception rather than return null). + */ + public ArchiveRecord next() { + long offset = -1; + try { + offset = positionForRecord(getIn()); + return exceptionNext(); + } catch (IOException e) { + if (!isStrict()) { + // Retry though an IOE. Maybe we will succeed reading + // subsequent record. + try { + if (hasNext()) { + getLogger().warning("Bad Record. Trying skip " + + "(Record start " + offset + "): " + + e.getMessage()); + return exceptionNext(); + } + // Else we are at last record. Iterator#next is + // expecting value. We do not have one. Throw exception. 
+ throw new RuntimeException("Retried but no next " + + "record (Record start " + offset + ")", e); + } catch (IOException e1) { + throw new RuntimeException("After retry (Offset " + + offset + ")", e1); + } + } + throw new RuntimeException("(Record start " + offset + ")", e); + } + } + + /** + * A next that throws exceptions and has handling of + * recoverable exceptions moving us to next record. Can call + * hasNext which itself may throw exceptions. + * @return Next record. + * @throws IOException + * @throws RuntimeException Thrown when we've reached maximum + * retries. + */ + protected ArchiveRecord exceptionNext() + throws IOException, RuntimeException { + ArchiveRecord result = null; + IOException ioe = null; + for (int i = MAX_ALLOWED_RECOVERABLES; i > 0 && + result == null; i--) { + ioe = null; + try { + result = innerNext(); + } catch (RecoverableIOException e) { + ioe = e; + getLogger().warning(e.getMessage()); + if (hasNext()) { + continue; + } + // No records left. Throw exception rather than + // return null. The caller is expecting to get + // back a record since they've just called + // hasNext. + break; + } + } + if (ioe != null) { + // Then we did MAX_ALLOWED_RECOVERABLES retries. Throw + // the recoverable ioe wrapped in a RuntimeException so + // it goes out pass checks for IOE. + throw new RuntimeException("Retried " + + MAX_ALLOWED_RECOVERABLES + " times in a row", ioe); + } + return result; + } + + protected ArchiveRecord innerNext() throws IOException { + return get(positionForRecord(getIn())); + } + + public void remove() { + throw new UnsupportedOperationException(); + } + } + + protected static long positionForRecord(InputStream in) { + return (in instanceof GZIPMembersInputStream) + ? ((GZIPMembersInputStream)in).getCurrentMemberStart() + : ((CountingInputStream)in).getCount(); + } + + protected static String stripExtension(final String name, + final String ext) { + return (!name.endsWith(ext))? 
name: + name.substring(0, name.length() - ext.length()); + } + + /** + * @return short name of Archive file. + */ + public String getFileName() { + return (new File(getReaderIdentifier())).getName(); + } + + /** + * @return short name of Archive file. + */ + public String getStrippedFileName() { + return getStrippedFileName(getFileName(), + getDotFileExtension()); + } + + /** + * @param name Name of ARCFile. + * @param dotFileExtension '.arc' or '.warc', etc. + * @return short name of Archive file. + */ + public static String getStrippedFileName(String name, + final String dotFileExtension) { + name = stripExtension(name, + ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION); + return stripExtension(name, dotFileExtension); + } + + /** + * @param value Value to test. + * @return True if value is 'true', else false. + */ + protected static boolean getTrueOrFalse(final String value) { + if (value == null || value.length() <= 0) { + return false; + } + return Boolean.TRUE.toString().equals(value.toLowerCase()); + } + + /** + * @param format Format to use outputting. + * @throws IOException + * @throws java.text.ParseException + * @return True if handled. + */ + protected boolean output(final String format) + throws IOException, java.text.ParseException { + boolean result = true; + // long start = System.currentTimeMillis(); + + // Write output as pseudo-CDX file. See + // http://www.archive.org/web/researcher/cdx_legend.php + // and http://www.archive.org/web/researcher/example_cdx.php. + // Hash is hard-coded straight SHA-1 hash of content. + if (format.equals(DUMP)) { + // No point digesting dumping. + setDigest(false); + dump(false); + } else if (format.equals(GZIP_DUMP)) { + // No point digesting dumping. 
+ setDigest(false); + dump(true); + } else if (format.equals(CDX)) { + cdxOutput(false); + } else if (format.equals(CDX_FILE)) { + cdxOutput(true); + } else { + result = false; + } + return result; + } + + protected void cdxOutput(boolean toFile) + throws IOException { + BufferedWriter cdxWriter = null; + if (toFile) { + String cdxFilename = stripExtension(getReaderIdentifier(), + DOT_COMPRESSED_FILE_EXTENSION); + cdxFilename = stripExtension(cdxFilename, getDotFileExtension()); + cdxFilename += ('.' + CDX); + cdxWriter = new BufferedWriter(new FileWriter(cdxFilename)); + } + + String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v") + + " n g"; + if (toFile) { + cdxWriter.write(header); + cdxWriter.newLine(); + } else { + System.out.println(header); + } + + String strippedFileName = getStrippedFileName(); + try { + for (Iterator ii = iterator(); ii.hasNext();) { + ArchiveRecord r = ii.next(); + if (toFile) { + cdxWriter.write(r.outputCdx(strippedFileName)); + cdxWriter.newLine(); + } else { + System.out.println(r.outputCdx(strippedFileName)); + } + } + } finally { + if (toFile) { + cdxWriter.close(); + } + } + } + + /** + * Output passed record using passed format specifier. + * @param format What format to use outputting. + * @throws IOException + * @return True if handled. + */ + public boolean outputRecord(final String format) + throws IOException { + boolean result = true; + if (format.equals(CDX)) { + System.out.println(get().outputCdx(getStrippedFileName())); + } else if(format.equals(ArchiveFileConstants.DUMP)) { + // No point digesting if dumping content. + setDigest(false); + get().dump(); + } else { + result = false; + } + return result; + } + + /** + * Dump this file on STDOUT + * @throws compress True if dumped output is compressed. 
+ * @throws IOException + * @throws java.text.ParseException + */ + public abstract void dump(final boolean compress) + throws IOException, java.text.ParseException; + + /** + * @return an ArchiveReader that will delete a local file on close. Used + * when we bring Archive files local and need to clean up afterward. + */ + public abstract ArchiveReader getDeleteFileOnCloseReader(final File f); + + /** + * Output passed record using passed format specifier. + * @param r ARCReader instance to output. + * @param format What format to use outputting. + * @throws IOException + */ + protected static void outputRecord(final ArchiveReader r, + final String format) + throws IOException { + if (!r.outputRecord(format)) { + throw new IOException("Unsupported format" + + " (or unsupported on a single record): " + format); + } + } + + /** + * @return Base Options object filled out with help, digest, strict, etc. + * options. + */ + protected static Options getOptions() { + Options options = new Options(); + options.addOption(new Option("h","help", false, + "Prints this message and exits.")); + options.addOption(new Option("o","offset", true, + "Outputs record at this offset into file.")); + options.addOption(new Option("d","digest", true, + "Pass true|false. Expensive. Default: true (SHA-1).")); + options.addOption(new Option("s","strict", false, + "Strict mode. Fails parse if incorrectly formatted file.")); + options.addOption(new Option("f","format", true, + "Output options: 'cdx', cdxfile', 'dump', 'gzipdump'," + + "'or 'nohead'. Default: 'cdx'.")); + return options; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/ArchiveReaderFactory.java b/src/main/java/org/archive/io/ArchiveReaderFactory.java new file mode 100644 index 00000000..17f14d3a --- /dev/null +++ b/src/main/java/org/archive/io/ArchiveReaderFactory.java @@ -0,0 +1,301 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). 
+ * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLConnection; + +import org.archive.io.arc.ARCReaderFactory; +import org.archive.io.warc.WARCReaderFactory; +import org.archive.net.md5.Md5URLConnection; +import org.archive.net.rsync.RsyncURLConnection; +import org.archive.url.UsableURI; +import org.archive.util.FileUtils; + + +/** + * Factory that returns an Archive file Reader. + * Returns Readers for ARCs or WARCs. + * @author stack + * @version $Date$ $Revision$ + */ +public class ArchiveReaderFactory implements ArchiveFileConstants { + // Static block to enable S3 URLs + static { + if (System.getProperty("java.protocol.handler.pkgs") != null) { + System.setProperty("java.protocol.handler.pkgs", + System.getProperty("java.protocol.handler.pkgs") + + "|" + "org.archive.net"); + } else { + System.setProperty("java.protocol.handler.pkgs", "org.archive.net"); + } + } + + private static final ArchiveReaderFactory factory = + new ArchiveReaderFactory(); + + /** + * Shutdown any public access to default constructor. 
+ */ + protected ArchiveReaderFactory() { + super(); + } + + /** + * Get an Archive file Reader on passed path or url. + * Does primitive heuristic figuring if path or URL. + * @param arcFileOrUrl File path or URL pointing at an Archive file. + * @return An Archive file Reader. + * @throws IOException + * @throws MalformedURLException + * @throws IOException + */ + public static ArchiveReader get(final String arcFileOrUrl) + throws MalformedURLException, IOException { + return ArchiveReaderFactory.factory.getArchiveReader(arcFileOrUrl); + } + + protected ArchiveReader getArchiveReader(final String arcFileOrUrl) + throws MalformedURLException, IOException { + return getArchiveReader(arcFileOrUrl, 0); + } + + protected ArchiveReader getArchiveReader(final String arcFileOrUrl, + final long offset) + throws MalformedURLException, IOException { + return UsableURI.hasScheme(arcFileOrUrl) && arcFileOrUrl.indexOf(":")>1? + get(new URL(arcFileOrUrl), offset): + get(new File(arcFileOrUrl), offset); + } + + /** + * @param f An Archive file to read. + * @return An ArchiveReader + * @throws IOException + */ + public static ArchiveReader get(final File f) throws IOException { + return ArchiveReaderFactory.factory.getArchiveReader(f); + } + + protected ArchiveReader getArchiveReader(final File f) + throws IOException { + return getArchiveReader(f, 0); + } + + /** + * @param f An Archive file to read. + * @param offset Have returned Reader set to start reading at this offset. 
+ * @return An ArchiveReader + * @throws IOException + */ + public static ArchiveReader get(final File f, final long offset) + throws IOException { + return ArchiveReaderFactory.factory.getArchiveReader(f, offset); + } + + protected ArchiveReader getArchiveReader(final File f, + final long offset) + throws IOException { + if (ARCReaderFactory.isARCSuffix(f.getName())) { + return ARCReaderFactory.get(f, true, offset); + } else if (WARCReaderFactory.isWARCSuffix(f.getName())) { + return WARCReaderFactory.get(f, offset); + } + throw new IOException("Unknown file extension (Not ARC nor WARC): " + + f.getName()); + } + + /** + * Wrap a Reader around passed Stream. + * @param s Identifying String for this Stream used in error messages. + * Must be a string that ends with the name of the file we're to put + * an ArchiveReader on. This code looks at file endings to figure + * whether to return an ARC or WARC reader. + * @param is Stream. Stream will be wrapped with implementation of + * RepositionableStream unless already supported. + * @param atFirstRecord Are we at first Record? + * @return ArchiveReader. + * @throws IOException + */ + public static ArchiveReader get(final String s, final InputStream is, + final boolean atFirstRecord) + throws IOException { + return ArchiveReaderFactory.factory.getArchiveReader(s, is, + atFirstRecord); + } + + protected ArchiveReader getArchiveReader(final String id, + final InputStream is, final boolean atFirstRecord) + throws IOException { + final InputStream stream = is; + if (ARCReaderFactory.isARCSuffix(id)) { + return ARCReaderFactory.get(id, stream, atFirstRecord); + } else if (WARCReaderFactory.isWARCSuffix(id)) { + return WARCReaderFactory.get(id, stream, atFirstRecord); + } + throw new IOException("Unknown extension (Not ARC nor WARC): " + id); + } + + /** + * Get an Archive Reader aligned at offset. 
+     * This version of get will not bring the file local but will try to
+     * stream across the net making an HTTP 1.1 Range request on remote
+     * http server (RFC 2616 Section 14.35).
+     * @param u HTTP URL for an Archive file.
+     * @param offset Offset into file at which to start fetching.
+     * @return An ArchiveReader aligned at offset.
+     * @throws IOException
+     */
+    public static ArchiveReader get(final URL u, final long offset)
+    throws IOException {
+        return ArchiveReaderFactory.factory.getArchiveReader(u, offset);
+    }
+
+    /**
+     * Open a connection to <code>f</code> and wrap its input stream in an
+     * ArchiveReader.  The reader type is picked from the URL string, which
+     * is also used as the reader's identifier.
+     * @param f URL of the Archive file.
+     * @param offset Offset to start fetching at; a non-zero value adds a
+     * Range header to the request.
+     * @return An ArchiveReader over the connection's stream.
+     * @throws IOException
+     */
+    protected ArchiveReader getArchiveReader(final URL f, final long offset)
+    throws IOException {
+        // Get URL connection.
+        URLConnection connection = f.openConnection();
+        if (connection instanceof HttpURLConnection) {
+            addUserAgent((HttpURLConnection)connection);
+        }
+        if (offset != 0) {
+            // Use a Range request (Assumes HTTP 1.1 on other end).  The
+            // range is open-ended: from offset through to the end of the
+            // resource.
+            connection.addRequestProperty("Range", "bytes=" + offset + "-");
+            // TODO: should actually verify that server respected 'Range' request
+            // (spec allows them to ignore; 206 response or Content-Range header
+            // should be present if Range satisfied; multipart/byteranges could be
+            // a problem).
+        }
+
+        // We are at the first record only when no offset was requested.
+        return getArchiveReader(f.toString(), connection.getInputStream(), (offset == 0));
+    }
+
+    /**
+     * Get an ARCReader.
+     * Pulls the ARC local into whereever the System Property
+     * java.io.tmpdir points. It then hands back an ARCReader that
+     * points at this local copy.  A close on this ARCReader instance will
+     * remove the local copy.
+     * @param u An URL that points at an ARC.
+     * @return An ARCReader.
+     * @throws IOException
+     */
+    public static ArchiveReader get(final URL u)
+    throws IOException {
+        return ArchiveReaderFactory.factory.getArchiveReader(u);
+    }
+
+    /**
+     * Pick how to read the Archive file behind <code>u</code>: directly from
+     * disk if the URL path names an existing local file, streamed if the
+     * scheme starts with "http" or is "s3", otherwise copied local first via
+     * {@link #makeARCLocal(URLConnection)}.
+     * @param u URL that points at an Archive file.
+     * @return An ArchiveReader.
+     * @throws IOException
+     */
+    protected ArchiveReader getArchiveReader(final URL u)
+    throws IOException {
+        // If url represents a local file then return file it points to.
+        if (u.getPath() != null) {
+            // TODO: Add scheme check and host check.
+            File f = new File(u.getPath());
+            if (f.exists()) {
+                return get(f, 0);
+            }
+        }
+
+        String scheme = u.getProtocol();
+        if (scheme.startsWith("http") || scheme.equals("s3")) {
+            // Try streaming if http or s3 URLs rather than copying local
+            // and then reading (Passing an offset will get us an Reader
+            // that wraps a Stream).
+            return get(u, 0);
+        }
+
+        return makeARCLocal(u.openConnection());
+    }
+
+    /**
+     * Bring the resource behind <code>connection</code> onto local disk and
+     * return a reader over the local copy.  The returned reader is the
+     * delete-file-on-close variant, so closing it removes the local file.
+     * @param connection Connection to the remote Archive file.  Must be an
+     * http, rsync or md5 connection.
+     * @return An ArchiveReader over the local copy.
+     * @throws IOException If the download or opening the local copy fails;
+     * the local file is deleted before the exception is rethrown.
+     * @throws UnsupportedOperationException For any other connection type.
+     */
+    protected ArchiveReader makeARCLocal(final URLConnection connection)
+    throws IOException {
+        File localFile = null;
+        if (connection instanceof HttpURLConnection) {
+            // If http url connection, bring down the resource local.
+            String p = connection.getURL().getPath();
+            int index = p.lastIndexOf('/');
+            if (index >= 0) {
+                // Name file for the file we're making local.  The remote file
+                // name goes in as the suffix so the local name still ends in
+                // the ARC/WARC extension the reader lookup keys on.  The
+                // prefix must be at least three characters long: an empty
+                // prefix makes File.createTempFile throw
+                // IllegalArgumentException.
+                localFile = File.createTempFile(
+                    ArchiveReader.class.getName(), p.substring(index + 1));
+                if (localFile.exists()) {
+                    // If file of same name already exists in TMPDIR, then
+                    // clean it up (Assuming only reason a file of same name in
+                    // TMPDIR is because we failed a previous download).
+                    localFile.delete();
+                }
+            } else {
+                localFile = File.createTempFile(ArchiveReader.class.getName(),
+                    ".tmp");
+            }
+            addUserAgent((HttpURLConnection)connection);
+            connection.connect();
+            try {
+                FileUtils.readFullyToFile(connection.getInputStream(), localFile);
+            } catch (IOException ioe) {
+                // Do not leave a partial download behind.
+                localFile.delete();
+                throw ioe;
+            }
+        } else if (connection instanceof RsyncURLConnection) {
+            // Then, connect and this will create a local file.
+            // See implementation of the rsync handler.
+            connection.connect();
+            localFile = ((RsyncURLConnection)connection).getFile();
+        } else if (connection instanceof Md5URLConnection) {
+            // Then, connect and this will create a local file.
+            // See implementation of the md5 handler.
+            connection.connect();
+            localFile = ((Md5URLConnection)connection).getFile();
+        } else {
+            throw new UnsupportedOperationException("No support for " +
+                connection);
+        }
+
+        ArchiveReader reader = null;
+        try {
+            reader = get(localFile, 0);
+        } catch (IOException e) {
+            localFile.delete();
+            throw e;
+        }
+
+        // Return a delegate that does cleanup of downloaded file on close.
+        return reader.getDeleteFileOnCloseReader(localFile);
+    }
+
+    /**
+     * Add a User-Agent request header carrying this factory's class name.
+     * @param connection Connection to add the header to.
+     */
+    protected void addUserAgent(final HttpURLConnection connection) {
+        connection.addRequestProperty("User-Agent", this.getClass().getName());
+    }
+
+    /**
+     * @param f File to test.
+     * @return True if <code>f</code> is compressed.
+     * @throws IOException
+     */
+    protected boolean isCompressed(final File f) throws IOException {
+        // Decided on the (lower-cased) file name only; content is not read.
+        return f.getName().toLowerCase().
+            endsWith(DOT_COMPRESSED_FILE_EXTENSION);
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/ArchiveRecord.java b/src/main/java/org/archive/io/ArchiveRecord.java
new file mode 100644
index 00000000..63bfe628
--- /dev/null
+++ b/src/main/java/org/archive/io/ArchiveRecord.java
@@ -0,0 +1,409 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual
+ *  contributors.
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.logging.Level; + +import org.archive.util.Base32; + +/** + * Archive file Record. + * @author stack + * @version $Date$ $Version$ + */ +public abstract class ArchiveRecord extends InputStream { + + /** + * Minimal http response or request header length. + * + * I've seen in arcs content length of 1 with no header. + */ + protected static final long MIN_HTTP_HEADER_LENGTH = + Math.min("HTTP/1.1 200 OK\r\n".length(), "GET / HTTP/1.0\n\r".length()); + + protected ArchiveRecordHeader header = null; + + /** + * Stream to read this record from. + * + * Stream can only be read sequentially. Will only return this records' + * content returning a -1 if you try to read beyond the end of the current + * record. + * + *

Streams can be markable or not. If they are, we'll be able to roll + * back when we've read too far. If not markable, assumption is that + * the underlying stream is managing our not reading too much (This pertains + * to the skipping over the end of the ARCRecord. See {@link #skip()}. + */ + protected InputStream in = null; + + /** + * Position w/i the Record content, within in. + * This position is relative within this Record. Its not same as the + * Archive file position. + */ + protected long position = 0; + + /** + * Set flag when we've reached the end-of-record. + */ + protected boolean eor = false; + + /** + * Compute digest on what we read and add to metadata when done. + * + * Currently hardcoded as sha-1. TODO: Remove when archive records + * digest or else, add a facility that allows the arc reader to + * compare the calculated digest to that which is recorded in + * the arc. + * + *

Protected instead of private so subclasses can update and complete + * the digest. + */ + protected MessageDigest digest = null; + private String digestStr = null; + + protected boolean strict = false; + + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the record this instance + * is to represent. + * @throws IOException + */ + public ArchiveRecord(InputStream in) + throws IOException { + this(in, null, 0, true, false); + } + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the record this instance + * is to represent. + * @param header Header data. + * @throws IOException + */ + public ArchiveRecord(InputStream in, ArchiveRecordHeader header) + throws IOException { + this(in, header, 0, true, false); + } + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the record this instance + * is to represent. + * @param header Header data. + * @param bodyOffset Offset into the body. Usually 0. + * @param digest True if we're to calculate digest for this record. Not + * digesting saves about ~15% of cpu during an ARC parse. + * @param strict Be strict parsing (Parsing stops if ARC inproperly + * formatted). + * @throws IOException + */ + public ArchiveRecord(InputStream in, ArchiveRecordHeader header, + int bodyOffset, boolean digest, boolean strict) + throws IOException { + this.in = in; + this.header = header; + this.position = bodyOffset; + if (digest) { + try { + this.digest = MessageDigest.getInstance("SHA1"); + } catch (NoSuchAlgorithmException e) { + // Convert to IOE because thats more amenable to callers + // -- they are dealing with it anyways. + throw new IOException(e.getMessage()); + } + } + this.strict = strict; + } + + public boolean markSupported() { + return false; + } + + /** + * @return Header data for this record. 
+     */
+    public ArchiveRecordHeader getHeader() {
+        return this.header;
+    }
+
+    protected void setHeader(ArchiveRecordHeader header) {
+        this.header = header;
+    }
+
+    /**
+     * Calling close on a record skips us past this record to the next record
+     * in the stream.
+     *
+     * It does not actually close the stream.  The underlying steam is probably
+     * being used by the next arc record.
+     *
+     * If a digest is being computed it is finalised here and becomes
+     * available from {@link #getDigestStr()}.  Calling close on an already
+     * closed record does nothing.
+     *
+     * @throws IOException
+     */
+    public void close() throws IOException {
+        if (this.in != null) {
+            skip();
+            this.in = null;
+            if (this.digest != null) {
+                this.digestStr = Base32.encode(this.digest.digest());
+            }
+        }
+    }
+
+    /**
+     * @return Next character in this Record content else -1 if at EOR.
+     * @throws IOException If the underlying stream ends before the record's
+     * stated length has been read (thrown whether or not strict is set).
+     */
+    public int read() throws IOException {
+        int c = -1;
+        if (available() > 0) {
+            c = this.in.read();
+            if (c == -1) {
+                throw new IOException("Premature EOF before end-of-record.");
+            }
+            if (this.digest != null) {
+                this.digest.update((byte) c);
+            }
+            incrementPosition();
+        }
+        return c;
+    }
+
+    /**
+     * Read up to <code>length</code> bytes of this record's content, never
+     * reading past the record's stated length.
+     *
+     * @param b Buffer to read into.
+     * @param offset Offset into <code>b</code> to start writing at.
+     * @param length Maximum number of bytes to read.
+     * @return Number of bytes read, or -1 if nothing can be read: the record
+     * is exhausted, <code>length</code> is not positive, or the underlying
+     * stream ended early while not in strict mode.
+     * @throws IOException If the underlying stream ends early in strict mode.
+     */
+    public int read(byte[] b, int offset, int length) throws IOException {
+        int read = Math.min(length, available());
+        if (read <= 0) {
+            // Nothing left in this record (or nothing asked for).  Never hand
+            // a non-positive length down to the underlying stream.
+            return -1;
+        }
+        read = this.in.read(b, offset, read);
+        if (read == -1) {
+            String msg = "Premature EOF before end-of-record: "
+                + getHeader().getHeaderFields();
+            if (isStrict()) {
+                throw new IOException(msg);
+            }
+            setEor(true);
+            System.err.println(Level.WARNING.toString() + " " + msg);
+            // Return without touching the position: adding the -1 sentinel
+            // to it would move the record position backwards.
+            return -1;
+        }
+        if (this.digest != null) {
+            this.digest.update(b, offset, read);
+        }
+        incrementPosition(read);
+        return read;
+    }
+
+    /**
+     * This available is not the stream's available. Its an available based on
+     * what the stated Archive record length is minus what we've read to date.
+     *
+     * @return Number of bytes remaining in record content, capped at
+     * <code>Integer.MAX_VALUE</code> and never negative.
+     */
+    public int available() {
+        long amount = getHeader().getLength() - getPosition();
+        if (amount <= 0) {
+            // Position at or beyond the stated length: report nothing left
+            // rather than a negative (or int-wrapped) count.
+            return 0;
+        }
+        return (amount > Integer.MAX_VALUE? Integer.MAX_VALUE: (int)amount);
+    }
+
+    /**
+     * Skip over this records content.
+     *
+     * @throws IOException
+     */
+    protected void skip() throws IOException {
+        if (this.eor) {
+            return;
+        }
+
+        // Read to the end of the body of the record.  Exhaust the stream.
+        // Can't skip direct to end because underlying stream may be compressed
+        // and we're calculating the digest for the record.
+        int r = available();
+        while (r > 0 && !this.eor) {
+            skip(r);
+            r = available();
+        }
+    }
+
+    /**
+     * Skip by reading, so skipped bytes still pass through
+     * {@link #read(byte[], int, int)} (position and digest are updated).
+     *
+     * @param n Number of bytes to skip.
+     * @return Number of bytes actually skipped; may be fewer than
+     * <code>n</code> if the record runs out first.
+     * @throws IOException
+     */
+    public long skip(long n) throws IOException {
+        final int SKIP_BUFFERSIZE = 1024 * 4;
+        byte[] b = new byte[SKIP_BUFFERSIZE];
+        long total = 0;
+        for (int read = 0; (total < n) && (read != -1);) {
+            // Take the minimum as a long and narrow afterwards: narrowing
+            // (n - total) first overflows once 2^31 or more bytes remain.
+            read = (int) Math.min(SKIP_BUFFERSIZE, n - total);
+            // TODO: Interesting is that reading from compressed stream, we only
+            // read about 500 characters at a time though we ask for 4k.
+            // Look at this sometime.
+            read = read(b, 0, read);
+            if (read <= 0) {
+                read = -1;
+            } else {
+                total += read;
+            }
+        }
+        return total;
+    }
+
+    /**
+     * @return Returns the strict.
+     */
+    public boolean isStrict() {
+        return this.strict;
+    }
+
+    /**
+     * @param strict The strict to set.
+     */
+    public void setStrict(boolean strict) {
+        this.strict = strict;
+    }
+
+    protected InputStream getIn() {
+        return this.in;
+    }
+
+    /**
+     * @return Base32 digest string, or null until {@link #close()} has
+     * finalised a digest.
+     */
+    public String getDigestStr() {
+        return this.digestStr;
+    }
+
+    protected void incrementPosition() {
+        this.position++;
+    }
+
+    protected void incrementPosition(final long incr) {
+        this.position += incr;
+    }
+
+    public long getPosition() {
+        return this.position;
+    }
+
+    protected boolean isEor() {
+        return eor;
+    }
+
+    protected void setEor(boolean eor) {
+        this.eor = eor;
+    }
+
+    // The *4Cdx hooks supply individual CDX columns; "-" marks a column for
+    // which this base class has no value.
+    protected String getStatusCode4Cdx(final ArchiveRecordHeader h) {
+        return "-";
+    }
+
+    protected String getIp4Cdx(final ArchiveRecordHeader h) {
+        return "-";
+    }
+
+    protected String getDigest4Cdx(final ArchiveRecordHeader h) {
+        return getDigestStr() == null? "-": getDigestStr();
+    }
+
+    protected String getMimetype4Cdx(final ArchiveRecordHeader h) {
+        return h.getMimetype();
+    }
+
+    /**
+     * Build one space-separated CDX line for this record: date, ip, url,
+     * mimetype, status code, digest, offset, length, file name.
+     * The record is closed first so the digest is complete.
+     * @param strippedFileName File name to report; '-' is written if null.
+     * @return The CDX line, without a line terminator.
+     * @throws IOException
+     */
+    protected String outputCdx(final String strippedFileName)
+    throws IOException {
+        // Read the whole record so we get out a hash. Should be safe calling
+        // close on already closed Record.
+        close();
+        ArchiveRecordHeader h = getHeader();
+        StringBuilder buffer =
+            new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE);
+        buffer.append(h.getDate());
+        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
+        buffer.append(getIp4Cdx(h));
+        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
+        buffer.append(h.getUrl());
+        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
+        buffer.append(getMimetype4Cdx(h));
+        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
+        buffer.append(getStatusCode4Cdx(h));
+        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
+        buffer.append(getDigest4Cdx(h));
+        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
+        buffer.append(h.getOffset());
+        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
+        buffer.append(h.getLength());
+        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
+        buffer.append(strippedFileName != null? strippedFileName: '-');
+        return buffer.toString();
+    }
+
+    /**
+     * Writes output on STDOUT.
+     * @throws IOException
+     */
+    public void dump()
+    throws IOException {
+        dump(System.out);
+    }
+
+    /**
+     * Writes output on passed <code>os</code>.
+     * Copies whatever remains of this record's content and flushes; the
+     * stream is not closed.
+     * @throws IOException
+     */
+    public void dump(final OutputStream os)
+    throws IOException {
+        final byte [] outputBuffer = new byte [16*1024];
+        int read;
+        while ((read = read(outputBuffer, 0, outputBuffer.length)) != -1) {
+            os.write(outputBuffer, 0, read);
+        }
+        os.flush();
+    }
+
+    /**
+     * Is it likely that this record contains headers?
+     * This method will return true if the body is a http response that includes
+     * http response headers or the body is a http request that includes request
+     * headers, etc.
Be aware that headers in content are distinct from + * {@link ArchiveRecordHeader} 'headers'. + * @return True if this Record's content has headers: + */ + public boolean hasContentHeaders() { + final String url = getHeader().getUrl(); + if (url == null) { + return false; + } + + if (!url.toLowerCase().startsWith("http")) { + return false; + } + + if (getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) { + return false; + } + + return true; + } + + protected void setBodyOffset(int bodyOffset) { + this.position = bodyOffset; + } +} diff --git a/src/main/java/org/archive/io/ArchiveRecordHeader.java b/src/main/java/org/archive/io/ArchiveRecordHeader.java new file mode 100644 index 00000000..953537b1 --- /dev/null +++ b/src/main/java/org/archive/io/ArchiveRecordHeader.java @@ -0,0 +1,111 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.util.Map; +import java.util.Set; + +/** + * Archive Record Header. + * @author stack + * @version $Date$ $Version$ + */ +public interface ArchiveRecordHeader { + /** + * Get the time when the record was created. + * @return Date in 14 digit time format (UTC). 
+ * @see org.archive.util.ArchiveUtils#parse14DigitDate(String) + */ + public abstract String getDate(); + + /** + * @return Return length of record. + */ + public abstract long getLength(); + + /** + * @return Return Content-Length of the contents of the record + */ + public abstract long getContentLength(); + + + /** + * @return Record subject-url. + */ + public abstract String getUrl(); + + /** + * @return Record mimetype. + */ + public abstract String getMimetype(); + + /** + * @return Record version. + */ + public abstract String getVersion(); + + /** + * @return Offset into Archive file at which this record begins. + */ + public abstract long getOffset(); + + /** + * @param key Key to use looking up field value. + * @return value for passed key of null if no such entry. + */ + public abstract Object getHeaderValue(final String key); + + /** + * @return Header field name keys. + */ + public abstract Set getHeaderFieldKeys(); + + /** + * @return Map of header fields. + */ + public abstract Map getHeaderFields(); + + /** + * @return Returns identifier for current Archive file. Be aware this + * may not be a file name or file path. It may just be an URL. Depends + * on how Archive file was made. + */ + public abstract String getReaderIdentifier(); + + /** + * @return Identifier for the record. If ARC, the URL + date. If WARC, + * the GUID assigned. + */ + public abstract String getRecordIdentifier(); + + /** + * @return Returns digest as String for this record. Only available after + * the record has been read in totality. + */ + public abstract String getDigest(); + + /** + * Offset at which the content begins. + * For ARCs, its used to delimit where http headers end and content begins. + * For WARCs, its end of Named Fields before payload starts. 
+ */ + public int getContentBegin(); + + public abstract String toString(); +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/ArraySeekInputStream.java b/src/main/java/org/archive/io/ArraySeekInputStream.java new file mode 100644 index 00000000..5b30747e --- /dev/null +++ b/src/main/java/org/archive/io/ArraySeekInputStream.java @@ -0,0 +1,106 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.IOException; + + +/** + * A repositionable stream backed by an array. + * + * @author pjack + */ +public class ArraySeekInputStream extends SeekInputStream { + + + /** + * The array of bytes to read from. + */ + private byte[] array; + + + /** + * The offset in the array of the next byte to read. + */ + private int offset; + + + /** + * Constructor. Note that changes to the given array will be reflected + * in the stream. + * + * @param array The array to read bytes from. 
+ */ + public ArraySeekInputStream(byte[] array) { + this.array = array; + this.offset = 0; + } + + + @Override + public int read() { + if (offset >= array.length) { + return -1; + } + int r = array[offset] & 0xFF; + offset++; + return r; + } + + + @Override + public int read(byte[] buf, int ofs, int len) { + if (offset >= array.length) { + return 0; + } + len = Math.min(len, array.length - offset); + System.arraycopy(array, offset, buf, ofs, len); + offset += len; + return len; + } + + + @Override + public int read(byte[] buf) { + return read(buf, 0, buf.length); + } + + + /** + * Returns the position of the stream. + */ + public long position() { + return offset; + } + + + /** + * Repositions the stream. + * + * @param p the new position for the stream + * @throws IOException if the given position is out of bounds + */ + public void position(long p) throws IOException { + if ((p < 0) || (p > array.length)) { + throw new IOException("Invalid position: " + p); + } + offset = (int)p; + } + +} diff --git a/src/main/java/org/archive/io/BufferedSeekInputStream.java b/src/main/java/org/archive/io/BufferedSeekInputStream.java new file mode 100644 index 00000000..2fdc72b7 --- /dev/null +++ b/src/main/java/org/archive/io/BufferedSeekInputStream.java @@ -0,0 +1,217 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +import java.io.IOException; + + +/** + * Buffers data from some other SeekInputStream. + * + * @author pjack + */ +public class BufferedSeekInputStream extends SeekInputStream { + + + /** + * The underlying input stream. + */ + final private SeekInputStream input; + + + /** + * The buffered data. + */ + final private byte[] buffer; + + + /** + * The maximum offset of valid data in the buffer. Usually the same + * as buffer.length, but may be shorter if we're in the last region + * of the stream. + */ + private int maxOffset; + + + /** + * The offset of within the buffer of the next byte to read. + */ + private int offset; + + + /** + * Constructor. + * + * @param input the underlying input stream + * @param capacity the size of the buffer + * @throws IOException if an IO occurs filling the first buffer + */ + public BufferedSeekInputStream(SeekInputStream input, int capacity) + throws IOException { + this.input = input; + this.buffer = new byte[capacity]; + buffer(); + } + + /** + * Fills the buffer. + * + * @throws IOException if an IO error occurs + */ + private void buffer() throws IOException { + int remaining = buffer.length; + while (remaining > 0) { + int r = input.read(buffer, buffer.length - remaining, remaining); + if (r <= 0) { + // Not enough information to fill the buffer + offset = 0; + maxOffset = buffer.length - remaining; + return; + } + remaining -= r; + } + maxOffset = buffer.length; + offset = 0; + } + + + /** + * Ensures that the buffer is valid. + * + * @throws IOException if an IO error occurs + */ + private void ensureBuffer() throws IOException { + if (offset >= maxOffset) { + buffer(); + } + } + + + /** + * Returns the number of unread bytes in the current buffer. 
+ * + * @return the remaining bytes + */ + private int remaining() { + return maxOffset - offset; + } + + + @Override + public int read() throws IOException { + ensureBuffer(); + if (maxOffset == 0) { + return -1; + } + int ch = buffer[offset] & 0xFF; + offset++; + return ch; + } + + + @Override + public int read(byte[] buf, int ofs, int len) throws IOException { + ensureBuffer(); + if (maxOffset == 0) { + return 0; + } + len = Math.min(len, remaining()); + System.arraycopy(buffer, offset, buf, ofs, len); + offset += len; + return len; + } + + + @Override + public int read(byte[] buf) throws IOException { + return read(buf, 0, buf.length); + } + + + @Override + public long skip(long c) throws IOException { + ensureBuffer(); + if (maxOffset == 0) { + return 0; + } + int count = (c > Integer.MAX_VALUE) ? Integer.MAX_VALUE : (int)c; + int skip = Math.min(count, remaining()); + offset += skip; + return skip; + } + + + /** + * Returns the stream's current position. + * + * @return the current position + */ + public long position() throws IOException { + return input.position() - buffer.length + offset; + } + + + /** + * Seeks to the given position. This method avoids re-filling the buffer + * if at all possible. + * + * @param p the position to set + * @throws IOException if an IO error occurs + */ + public void position(long p) throws IOException { + long blockStart = (input.position() - maxOffset) + / buffer.length * buffer.length; + long blockEnd = blockStart + maxOffset; + if ((p >= blockStart) && (p < blockEnd)) { + // Desired position is somewhere inside current buffer + long adj = p - blockStart; + offset = (int)adj; + return; + } + positionDirect(p); + } + + + /** + * Positions the underlying stream at the given position, then refills + * the buffer. 
+ * + * @param p the position to set + * @throws IOException if an IO error occurs + */ + private void positionDirect(long p) throws IOException { + long newBlockStart = p / buffer.length * buffer.length; + input.position(newBlockStart); + buffer(); + offset = (int)(p % buffer.length); + } + + /** + * Close the stream, including the wrapped input stream. + */ + public void close() throws IOException { + super.close(); + if(this.input!=null) { + this.input.close(); + } + } + + +} diff --git a/src/main/java/org/archive/io/CharSubSequence.java b/src/main/java/org/archive/io/CharSubSequence.java new file mode 100644 index 00000000..1e89da56 --- /dev/null +++ b/src/main/java/org/archive/io/CharSubSequence.java @@ -0,0 +1,90 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +/** + * Provides a subsequence view onto a CharSequence. 
/**
 * Provides a subsequence view onto a CharSequence.
 *
 * The view shares the wrapped sequence; no characters are copied until
 * {@link #toString()} is called.
 *
 * @author gojomo
 * @version $Revision$, $Date$
 */
public class CharSubSequence implements CharSequence {

    protected CharSequence inner;
    protected int start;
    protected int end;

    /**
     * @param inner sequence to view
     * @param start index in {@code inner} of the first character of the view
     * @param end index in {@code inner} one past the last character of the view
     * @throws IllegalArgumentException if {@code end < start} or either
     * index is negative
     * @throws NullPointerException if {@code inner} is null
     */
    public CharSubSequence(CharSequence inner, int start, int end) {
        if (end < start) {
            throw new IllegalArgumentException("Start " + start + " is > " +
                " than end " + end);
        }

        if (end < 0 || start < 0) {
            throw new IllegalArgumentException("Start " + start + " or end " +
                end + " is < 0.");
        }

        if (inner == null) {
            throw new NullPointerException("Passed charsequence is null.");
        }

        this.inner = inner;
        this.start = start;
        this.end = end;
    }

    /*
     * (non-Javadoc)
     * @see java.lang.CharSequence#length()
     */
    public int length() {
        return this.end - this.start;
    }

    /*
     * (non-Javadoc)
     * @see java.lang.CharSequence#charAt(int)
     */
    public char charAt(int index) {
        return this.inner.charAt(this.start + index);
    }

    /*
     * (non-Javadoc)
     * @see java.lang.CharSequence#subSequence(int, int)
     */
    public CharSequence subSequence(int begin, int finish) {
        // Indices are relative to this view, so the new view wraps this one.
        return new CharSubSequence(this, begin, finish);
    }

    /*
     * (non-Javadoc)
     * @see java.lang.CharSequence#toString()
     */
    public String toString() {
        // Copy exactly the viewed range [start, end) of the wrapped sequence.
        return new StringBuilder(length())
            .append(this.inner, this.start, this.end)
            .toString();
    }
}
read(byte[] b) throws IOException { + int c = super.read(b); + if( c == -1 && filenames.hasNext() ) { + cueStream(); + return read(b); + } + return c; + } + + /* (non-Javadoc) + * @see java.io.InputStream#skip(long) + */ + public long skip(long n) throws IOException { + long s = super.skip(n); + if( s files) throws IOException { + super(null); + filenames = files.iterator(); + cueStream(); + } + + private void cueStream() throws IOException { + if(filenames.hasNext()) { + this.in = new FileInputStream(filenames.next()); + } + } + +} diff --git a/src/main/java/org/archive/io/CompositeFileReader.java b/src/main/java/org/archive/io/CompositeFileReader.java new file mode 100644 index 00000000..14b56219 --- /dev/null +++ b/src/main/java/org/archive/io/CompositeFileReader.java @@ -0,0 +1,40 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Reads integers stored in big or little endian streams.
 *
 * @author pjack
 */
public class Endian {


    /**
     * Static utility class; not instantiable.
     */
    private Endian() {
    }


    /**
     * Reads one byte, treating end of stream as an error.
     *
     * @param input the input stream to read from
     * @return the byte read, as a value in 0-255
     * @throws EOFException if the stream is already at its end
     * @throws IOException if an IO error occurs
     */
    private static int nextByte(InputStream input) throws IOException {
        int b = input.read();
        if (b < 0) {
            throw new EOFException();
        }
        return b;
    }


    /**
     * Reads the next little-endian unsigned 16 bit integer from the
     * given stream.
     *
     * @param input the input stream to read from
     * @return the next 16-bit little-endian integer
     * @throws IOException if an IO error occurs
     */
    public static char littleChar(InputStream input) throws IOException {
        int low = nextByte(input);
        int high = nextByte(input);
        return (char) (low | (high << 8));
    }


    /**
     * Reads the next little-endian signed 16-bit integer from the
     * given stream.
     *
     * @param input the input stream to read from
     * @return the next 16-bit little-endian integer
     * @throws IOException if an IO error occurs
     */
    public static short littleShort(InputStream input) throws IOException {
        char unsigned = littleChar(input);
        return (short) unsigned;
    }


    /**
     * Reads the next little-endian signed 32-bit integer from the
     * given stream.
     *
     * @param input the input stream to read from
     * @return the next 32-bit little-endian integer
     * @throws IOException if an IO error occurs
     */
    public static int littleInt(InputStream input) throws IOException {
        int lowWord = littleChar(input);
        int highWord = littleChar(input);
        return lowWord | (highWord << 16);
    }


    /**
     * Reads the next big-endian unsigned 16 bit integer from the
     * given stream.
     *
     * @param input the input stream to read from
     * @return the next 16-bit big-endian integer
     * @throws IOException if an IO error occurs
     */
    public static char bigChar(InputStream input) throws IOException {
        int high = nextByte(input);
        int low = nextByte(input);
        return (char) (low | (high << 8));
    }


    /**
     * Reads the next big-endian signed 32-bit integer from the
     * given stream.
     *
     * @param input the input stream to read from
     * @return the next 32-bit big-endian integer
     * @throws IOException if an IO error occurs
     */
    public static int bigInt(InputStream input) throws IOException {
        int highWord = bigChar(input);
        int lowWord = bigChar(input);
        return lowWord | (highWord << 16);
    }
}
/**
 * Compatibility shim that keeps the old {@code org.archive.io} class name
 * available. It adds no behavior of its own; both constructors pass their
 * arguments straight to the superclass.
 *
 * @deprecated use {@link org.archive.util.zip.GZIPMembersInputStream}
 */
@Deprecated
public class GZIPMembersInputStream extends org.archive.util.zip.GZIPMembersInputStream {

    /**
     * @param in stream handed to the superclass constructor
     * @throws IOException if the superclass constructor throws
     */
    public GZIPMembersInputStream(InputStream in) throws IOException {
        super(in);
    }

    /**
     * @param in stream handed to the superclass constructor
     * @param size handed to the superclass constructor unchanged
     * (presumably a buffer size - confirm against the superclass)
     * @throws IOException if the superclass constructor throws
     */
    public GZIPMembersInputStream(InputStream in, int size) throws IOException {
        super(in, size);
    }

}
/**
 * FileHandler with support for rotating the current file to
 * an archival name with a specified integer suffix, and
 * provision of a new replacement FileHandler with the current
 * filename.
 *
 * @author gojomo
 */
public class GenerationFileHandler extends FileHandler {
    // The active filename is kept first; rotate() inserts each stored
    // filename at index 1.
    // NOTE(review): raw types as shown here; the uncast uses below
    // (new File(filenameSeries.getLast())) imply the elements are Strings,
    // so generic parameters may have been lost - confirm.
    private LinkedList filenameSeries = new LinkedList();
    private boolean shouldManifest = false;

    /**
     * @return Returns the filenameSeries. This is the live internal list,
     * not a copy.
     */
    public List getFilenameSeries() {
        return filenameSeries;
    }

    /**
     * Constructor. Starts a new filename series containing only
     * <code>pattern</code>.
     *
     * @param pattern passed to the FileHandler constructor and recorded as
     * the active filename
     * @param append passed to the FileHandler constructor
     * @param shouldManifest value later returned by {@link #shouldManifest()}
     * @throws IOException
     * @throws SecurityException
     */
    public GenerationFileHandler(String pattern, boolean append,
            boolean shouldManifest)
    throws IOException, SecurityException {
        super(pattern, append);
        filenameSeries.addFirst(pattern);
        this.shouldManifest = shouldManifest;
    }

    /**
     * Opens a handler on the first filename of an existing series. The
     * passed list is adopted as-is, not copied.
     *
     * @param filenameSeries series whose first entry is the file to open
     * @param shouldManifest value later returned by {@link #shouldManifest()}
     * @throws IOException
     */
    public GenerationFileHandler(LinkedList filenameSeries,
            boolean shouldManifest)
    throws IOException {
        super((String)filenameSeries.getFirst(), false); // Never append in this case
        this.filenameSeries = filenameSeries;
        this.shouldManifest = shouldManifest;
    }

    /**
     * Move the current file to a new filename with the storeSuffix in place
     * of the activeSuffix; continuing logging to a new file under the
     * original filename. Equivalent to the three-argument form with
     * <code>mergeOld</code> false.
     *
     * @param storeSuffix Suffix to put in place of activeSuffix
     * @param activeSuffix Suffix to replace with storeSuffix.
     * @return GenerationFileHandler instance.
     * @throws IOException
     */
    public GenerationFileHandler rotate(String storeSuffix,
            String activeSuffix)
    throws IOException {
        return rotate(storeSuffix, activeSuffix, false);
    }

    /**
     * Closes this handler, moves log content to the store filename, and
     * returns a new handler on the original filename that shares this
     * handler's filename series and formatter.
     *
     * @param storeSuffix Suffix to put in place of activeSuffix
     * @param activeSuffix Suffix to replace with storeSuffix.
     * @param mergeOld if true, every file in the series is first appended
     * onto the last file of the series, which then becomes the store file;
     * if false only the active file is renamed
     * @return GenerationFileHandler instance.
     * @throws FileNotFoundException if the active filename does not end
     * with <code>activeSuffix</code>
     * @throws IOException if a rename fails
     */
    public GenerationFileHandler rotate(String storeSuffix,
            String activeSuffix, boolean mergeOld) throws IOException {
        // This handler is closed before any file is moved.
        close();
        String filename = (String) filenameSeries.getFirst();
        if (!filename.endsWith(activeSuffix)) {
            throw new FileNotFoundException("Active file does not have" +
                " expected suffix");
        }
        String storeFilename = filename.substring(0,
             filename.length() - activeSuffix.length()) +
             storeSuffix;
        File activeFile = new File(filename);
        File storeFile = new File(storeFilename);
        FileUtils.moveAsideIfExists(storeFile);

        if (mergeOld) {
            File fileToAppendTo = new File(filenameSeries.getLast());
            // Walk from the second-to-last entry down to the active file
            // (index 0), appending each file and then deleting it.
            for (int i = filenameSeries.size() - 2; i >= 0; i--) {
                File f = new File(filenameSeries.get(i));
                FileUtils.appendTo(fileToAppendTo, f);
                // NOTE(review): the result of delete() is ignored.
                f.delete();
            }
            // The series restarts with just the active filename.
            filenameSeries.clear();
            filenameSeries.add(filename);
            if (!fileToAppendTo.renameTo(storeFile)) {
                throw new IOException("Unable to move " + fileToAppendTo + " to " +
                    storeFilename);
            }
        } else {
            if (!activeFile.renameTo(storeFile)) {
                throw new IOException("Unable to move " + filename + " to " +
                    storeFilename);
            }
        }
        // Index 0 stays the active filename; the newest stored file follows.
        filenameSeries.add(1, storeFilename);
        GenerationFileHandler newGfh = new GenerationFileHandler(
                filenameSeries, shouldManifest);
        newGfh.setFormatter(this.getFormatter());
        return newGfh;
    }

    /**
     * @return True if should manifest.
     */
    public boolean shouldManifest() {
        return this.shouldManifest;
    }

    /**
     * Constructor-helper that rather than clobbering any existing
     * file, moves it aside with a timestamp suffix.
     *
     * @param filename file to log to; moved aside first if it exists
     * @param append passed to the constructor
     * @param shouldManifest passed to the constructor
     * @return a new handler on <code>filename</code>
     * @throws SecurityException
     * @throws IOException
     */
    public static GenerationFileHandler makeNew(String filename, boolean append, boolean shouldManifest) throws SecurityException, IOException {
        FileUtils.moveAsideIfExists(new File(filename));
        return new GenerationFileHandler(filename, append, shouldManifest);
    }

    /**
     * Publishes the record. A Preformatter formatter is asked to preformat
     * the record first and is always cleared afterwards.
     */
    @Override
    public void publish(LogRecord record) {
        // when possible preformat outside synchronized superclass method
        // (our most involved UriProcessingFormatter can cache result)
        Formatter f = getFormatter();
        if(!(f instanceof Preformatter)) {
            super.publish(record);
        } else {
            try {
                ((Preformatter)f).preformat(record);
                super.publish(record);
            } finally {
                // clear() runs even if publishing throws.
                ((Preformatter)f).clear();
            }
        }
    }
//
// TODO: determine if there's another way to have this optimization without
// negative impact on log-following (esp. in web UI)
//    /**
//     * Flush only 1/100th of the usual once-per-record, to reduce the time
//     * spent holding the synchronization lock. (Flush is primarily called in
//     * a superclass's synchronized publish()).
//     *
//     * The eventual close calls a direct flush on the target writer, so all
//     * rotates/ends will ultimately be fully flushed.
//     *
//     * @see java.util.logging.StreamHandler#flush()
//     */
//    @Override
//    public synchronized void flush() {
//        flushCount++;
//        if(flushCount==100) {
//            super.flush();
//            flushCount=0;
//        }
//    }
//    int flushCount;

}
+ * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.nio.CharBuffer; +import java.nio.channels.FileChannel; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.text.NumberFormat; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.io.IOUtils; +import org.archive.util.DevUtils; + +import com.google.common.base.Charsets; +import com.google.common.primitives.Ints; + +/** + * (Replay)CharSequence view on recorded streams. + * + * For small streams, use {@link InMemoryReplayCharSequence}. + * + *

Call {@link close()} on this class when done to clean up resources. + * + * @contributor stack + * @contributor nlevitt + * @version $Revision$, $Date$ + */ +public class GenericReplayCharSequence implements ReplayCharSequence { + + protected static Logger logger = Logger + .getLogger(GenericReplayCharSequence.class.getName()); + + /** + * Name of the encoding we use writing out concatenated decoded prefix + * buffer and decoded backing file. + * + *

This define is also used as suffix for the file that holds the + * decodings. The name of the file that holds the decoding is the name + * of the backing file w/ this encoding for a suffix. + * + *

See Encoding. + */ + public static final Charset WRITE_ENCODING = Charsets.UTF_16BE; + + private static final long MAP_MAX_BYTES = 64 * 1024 * 1024; // 64M + + /** + * When the memory map moves away from the beginning of the file + * (to the "right") in order to reach a certain index, it will + * map up to this many bytes preceding (to the left of) the target character. + * Consequently it will map up to + * MAP_MAX_BYTES - MAP_TARGET_LEFT_PADDING + * bytes to the right of the target. + */ + private static final long MAP_TARGET_LEFT_PADDING_BYTES = (long) (MAP_MAX_BYTES * 0.01); + + /** + * Total length of character stream to replay minus the HTTP headers + * if present. + * + * If the backing file is larger than Integer.MAX_VALUE (i.e. 2gb), + * only the first Integer.MAX_VALUE characters are available through this API. + * We're overriding java.lang.CharSequence so that we can use + * java.util.regex directly on the data, and the CharSequence + * API uses int for the length and index. + */ + protected int length; + + /** counter of decoding exceptions for report at end */ + protected long decodingExceptions = 0; + protected CharacterCodingException codingException = null; + + /** + * Byte offset into the file where the memory mapped portion begins. + */ + private long mapByteOffset; + + // XXX do we need to keep the input stream around? + private FileInputStream backingFileIn = null; + + private FileChannel backingFileChannel = null; + + private long bytesPerChar; + + private CharBuffer mappedBuffer = null; + + /** + * File that has decoded content. + * + * Keep it around so we can remove on close. + */ + private File decodedFile = null; + + /* + * This portion of the CharSequence precedes what's in the backing file. In + * cases where we decodeToFile(), this is always empty, because we decode + * the entire input stream. 
+ */ + private CharBuffer prefixBuffer = null; + + private boolean isOpen = true; + + protected Charset charset = null; + + /** + * Constructor. + * + * @param contentReplayInputStream inputStream of content + * @param charset Encoding to use reading the passed prefix + * buffer and backing file. Must not be null. + * @param backingFilename Path to backing file with content in excess of + * whats in buffer. + * + * @throws IOException + */ + public GenericReplayCharSequence(InputStream contentReplayInputStream, + int prefixMax, + String backingFilename, + Charset charset) throws IOException { + super(); + logger.fine("characterEncoding=" + charset + " backingFilename=" + + backingFilename); + + if(charset==null) { + charset = ReplayCharSequence.FALLBACK_CHARSET; + } + // decodes only up to Integer.MAX_VALUE characters + decode(contentReplayInputStream, prefixMax, backingFilename, charset); + + this.bytesPerChar = 2; + + if(length>prefixBuffer.position()) { + this.backingFileIn = new FileInputStream(decodedFile); + this.backingFileChannel = backingFileIn.getChannel(); + this.mapByteOffset = 0; + updateMemoryMappedBuffer(); + } + } + + private void updateMemoryMappedBuffer() { + long charLength = (long) this.length() - (long) prefixBuffer.limit(); // in characters + long mapSize = Math.min((charLength * bytesPerChar) - mapByteOffset, MAP_MAX_BYTES); + logger.fine("updateMemoryMappedBuffer: mapOffset=" + + NumberFormat.getInstance().format(mapByteOffset) + + " mapSize=" + NumberFormat.getInstance().format(mapSize)); + try { + // TODO: stress-test without these possibly-costly requests! +// System.gc(); +// System.runFinalization(); + // TODO: Confirm the READ_ONLY works. I recall it not working. + // The buffers seem to always say that the buffer is writable. + mappedBuffer = backingFileChannel.map( + FileChannel.MapMode.READ_ONLY, mapByteOffset, mapSize) + .asReadOnlyBuffer().asCharBuffer(); + } catch (IOException e) { + // TODO convert this to a runtime error? 
+ DevUtils.logger.log(Level.SEVERE, + " backingFileChannel.map() mapByteOffset=" + mapByteOffset + + " mapSize=" + mapSize + "\n" + "decodedFile=" + + decodedFile + " length=" + length + "\n" + + DevUtils.extraInfo(), e); + throw new RuntimeException(e); + } + } + + /** + * Converts the first Integer.MAX_VALUE characters from the + * file backingFilename from encoding encoding to + * encoding WRITE_ENCODING and saves as + * this.decodedFile, which is named backingFilename + * + "." + WRITE_ENCODING. + * + * @throws IOException + */ + protected void decode(InputStream inStream, int prefixMax, + String backingFilename, Charset charset) throws IOException { + + this.charset = charset; + + // TODO: consider if BufferedReader is helping any + // TODO: consider adding TBW 'LimitReader' to stop reading at + // Integer.MAX_VALUE characters because of charAt(int) limit + BufferedReader reader = new BufferedReader(new InputStreamReader( + inStream, charset)); + + logger.fine("backingFilename=" + backingFilename + " encoding=" + + charset + " decodedFile=" + decodedFile); + + this.prefixBuffer = CharBuffer.allocate(prefixMax); + + long count = 0; + while(count < prefixMax) { + int read = reader.read(prefixBuffer); + if(read<0) { + break; + } + count += read; + } + + int ch = reader.read(); + if(ch >= 0) { + count++; + + // more to decode to file overflow + this.decodedFile = new File(backingFilename + "." 
+ WRITE_ENCODING); + + FileOutputStream fos; + try { + fos = new FileOutputStream(this.decodedFile); + } catch (FileNotFoundException e) { + // Windows workaround attempt + System.gc(); + System.runFinalization(); + this.decodedFile = new File(decodedFile.getAbsolutePath()+".win"); + logger.info("Windows 'file with a user-mapped section open' " + + "workaround gc/finalization/name-extension performed."); + // try again + fos = new FileOutputStream(this.decodedFile); + } + + Writer writer = new OutputStreamWriter(fos,WRITE_ENCODING); + writer.write(ch); + count += IOUtils.copyLarge(reader, writer); + writer.close(); + reader.close(); + } + + this.length = Ints.saturatedCast(count); + if(count>Integer.MAX_VALUE) { + logger.warning("input stream is longer than Integer.MAX_VALUE=" + + NumberFormat.getInstance().format(Integer.MAX_VALUE) + + " characters -- only first " + + NumberFormat.getInstance().format(Integer.MAX_VALUE) + + " are accessible through this GenericReplayCharSequence"); + } + + logger.fine("decode: decoded " + count + " characters" + + ((decodedFile==null) ? "" + : " ("+(count-prefixBuffer.length())+" to "+decodedFile+")")); + } + + /** + * Get character at passed absolute position. + * @param index Index into content + * @return Character at offset index. 
+ */ + public char charAt(int index) { + if (index < 0 || index >= this.length()) { + throw new IndexOutOfBoundsException("index=" + index + + " - should be between 0 and length()=" + this.length()); + } + + // is it in the buffer + if (index < prefixBuffer.limit()) { + return prefixBuffer.get(index); + } + + // otherwise we gotta get it from disk via memory map + long charFileIndex = (long) index - (long) prefixBuffer.limit(); + long charFileLength = (long) this.length() - (long) prefixBuffer.limit(); // in characters + if (charFileIndex * bytesPerChar < mapByteOffset) { + logger.log(Level.WARNING,"left-fault; probably don't want to use CharSequence that far backward"); + } + if (charFileIndex * bytesPerChar < mapByteOffset + || charFileIndex - (mapByteOffset / bytesPerChar) >= mappedBuffer.limit()) { + // fault + /* + * mapByteOffset is bounded by 0 and file size +/- size of the map, + * and starts as close to fileIndex - + * MAP_TARGET_LEFT_PADDING_BYTES as it can while also not + * being smaller than it needs to be. + */ + mapByteOffset = Math.min(charFileIndex * bytesPerChar - MAP_TARGET_LEFT_PADDING_BYTES, + charFileLength * bytesPerChar - MAP_MAX_BYTES); + mapByteOffset = Math.max(0, mapByteOffset); + updateMemoryMappedBuffer(); + } + + return mappedBuffer.get((int)(charFileIndex-(mapByteOffset/bytesPerChar))); + } + + public CharSequence subSequence(int start, int end) { + return new CharSubSequence(this, start, end); + } + + private void deleteFile(File fileToDelete) { + deleteFile(fileToDelete, null); + } + + private void deleteFile(File fileToDelete, final Exception e) { + if (e != null) { + // Log why the delete to help with debug of + // java.io.FileNotFoundException: + // ....tt53http.ris.UTF-16BE. 
+ logger.severe("Deleting " + fileToDelete + " because of " + + e.toString()); + } + if (fileToDelete != null && fileToDelete.exists()) { + logger.fine("deleting file: " + fileToDelete); + fileToDelete.delete(); + } + } + + + @Override + public boolean isOpen() { + return this.isOpen; + } + + public void close() throws IOException { + this.isOpen = false; + + logger.fine("closing"); + + if (this.backingFileChannel != null && this.backingFileChannel.isOpen()) { + this.backingFileChannel.close(); + } + if (backingFileIn != null) { + backingFileIn.close(); + } + + deleteFile(this.decodedFile); + + // clear decodedFile -- so that double-close (as in finalize()) won't + // delete a later instance with same name see bug [ 1218961 ] + // "failed get of replay" in ExtractorHTML... usu: UTF-16BE + this.decodedFile = null; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#finalize() + */ + protected void finalize() throws Throwable { + super.finalize(); + logger.fine("finalizing"); + close(); + } + + /** + * Convenience method for getting a substring. 
+ * + * @deprecated please use subSequence() and then toString() directly + */ + public String substring(int offset, int len) { + return subSequence(offset, offset + len).toString(); + } + + public String toString() { + StringBuilder sb = new StringBuilder(this.length()); + sb.append(this); + return sb.toString(); + } + + public int length() { + return length; + } + + /* (non-Javadoc) + * @see org.archive.io.ReplayCharSequence#getDecodeExceptionCount() + */ + @Override + public long getDecodeExceptionCount() { + return decodingExceptions; + } + + + /* (non-Javadoc) + * @see org.archive.io.ReplayCharSequence#getCodingException() + */ + @Override + public CharacterCodingException getCodingException() { + return codingException; + } + + /* (non-Javadoc) + * @see org.archive.io.ReplayCharSequence#getCharset() + */ + public Charset getCharset() { + return charset; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/GzipHeader.java b/src/main/java/org/archive/io/GzipHeader.java new file mode 100644 index 00000000..6b8263bc --- /dev/null +++ b/src/main/java/org/archive/io/GzipHeader.java @@ -0,0 +1,26 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.io; + +/** + * Empty subclass of {@link org.archive.util.zip.GzipHeader}; it declares no + * members of its own. + * + * @deprecated use {@link org.archive.util.zip.GzipHeader} + */ +@Deprecated +public class GzipHeader extends org.archive.util.zip.GzipHeader { +} diff --git a/src/main/java/org/archive/io/HeaderedArchiveRecord.java b/src/main/java/org/archive/io/HeaderedArchiveRecord.java new file mode 100644 index 00000000..3cce595b --- /dev/null +++ b/src/main/java/org/archive/io/HeaderedArchiveRecord.java @@ -0,0 +1,423 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.PrintStream; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpParser; +import org.apache.commons.httpclient.StatusLine; +import org.apache.commons.httpclient.util.EncodingUtil; +import org.archive.io.arc.ARCConstants; +import org.archive.util.LaxHttpParser; + +/** + * An ArchiveRecord whose content has a preamble of RFC822-like headers: e.g. + * The ArchiveRecord is a http response that leads off with http response + * headers. Use this ArchiveRecord Decorator to get at the content headers and + * the header/content demarcation. 
+ * + * @author stack + * @author Olaf Freyer + */ +public class HeaderedArchiveRecord extends ArchiveRecord { + private int contentHeadersLength = -1; + private int statusCode = -1; + + /** + * Http header bytes. + * + * If non-null and bytes available, give out its contents before we + * go back to the underlying stream. + */ + private InputStream contentHeaderStream = null; + + /** + * Content headers. + * + * Only available after the reading of headers. + */ + private Header [] contentHeaders = null; + + + public HeaderedArchiveRecord(final ArchiveRecord ar) throws IOException { + super(ar); + } + + public HeaderedArchiveRecord(final ArchiveRecord ar, + final boolean readContentHeader) throws IOException { + super(ar); + if (readContentHeader) { + this.contentHeaderStream = readContentHeaders(); + } + } + + /** + * Skip over the the content headers if present. + * + * Subsequent reads will get the body. + * + *

Calling this method in the midst of reading the header + * will make for strange results. Otherwise, safe to call + * at any time though before reading any of the record + * content is only time that it makes sense. + * + *

After calling this method, you can call + * {@link #getContentHeaders()} to get the read http header. + * + * @throws IOException + */ + public void skipHttpHeader() throws IOException { + if (this.contentHeaderStream == null) { + return; + } + // Empty the contentHeaderStream + for (int available = this.contentHeaderStream.available(); + this.contentHeaderStream != null + && (available = this.contentHeaderStream.available()) > 0;) { + // We should be in this loop once only we should only do this + // buffer allocation once. + byte[] buffer = new byte[available]; + // The read nulls out httpHeaderStream when done with it so + // need check for null in the loop control line. + read(buffer, 0, available); + } + } + + public void dumpHttpHeader() throws IOException { + dumpHttpHeader(System.out); + } + + public void dumpHttpHeader(final PrintStream stream) throws IOException { + if (this.contentHeaderStream == null) { + return; + } + // Dump the httpHeaderStream to STDOUT + for (int available = this.contentHeaderStream.available(); + this.contentHeaderStream != null + && (available = this.contentHeaderStream.available()) > 0;) { + // We should be in this loop only once and should do this + // buffer allocation once. + byte[] buffer = new byte[available]; + // The read nulls out httpHeaderStream when done with it so + // need check for null in the loop control line. + int read = read(buffer, 0, available); + stream.write(buffer, 0, read); + } + } + + /** + * Read header if present. Technique borrowed from HttpClient HttpParse + * class. Using http parser code for now. Later move to more generic header + * parsing code if there proves a need. + * + * @return ByteArrayInputStream with the http header in it or null if no + * http header. + * @throws IOException + */ + private InputStream readContentHeaders() throws IOException { + // If judged a record that doesn't have an http header, return + // immediately. 
+ if (!hasContentHeaders()) { + return null; + } + byte [] statusBytes = LaxHttpParser.readRawLine(getIn()); + int eolCharCount = getEolCharsCount(statusBytes); + if (eolCharCount <= 0) { + throw new IOException("Failed to read raw lie where one " + + " was expected: " + new String(statusBytes)); + } + String statusLine = EncodingUtil.getString(statusBytes, 0, + statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); + if (statusLine == null) { + throw new NullPointerException("Expected status line is null"); + } + // TODO: Tighten up this test. + boolean isHttpResponse = StatusLine.startsWithHTTP(statusLine); + boolean isHttpRequest = false; + if (!isHttpResponse) { + isHttpRequest = statusLine.toUpperCase().startsWith("GET") || + !statusLine.toUpperCase().startsWith("POST"); + } + if (!isHttpResponse && !isHttpRequest) { + throw new UnexpectedStartLineIOException("Failed parse of " + + "status line: " + statusLine); + } + this.statusCode = isHttpResponse? + (new StatusLine(statusLine)).getStatusCode(): -1; + + // Save off all bytes read. Keep them as bytes rather than + // convert to strings so we don't have to worry about encodings + // though this should never be a problem doing http headers since + // its all supposed to be ascii. + ByteArrayOutputStream baos = + new ByteArrayOutputStream(statusBytes.length + 4 * 1024); + baos.write(statusBytes); + + // Now read rest of the header lines looking for the separation + // between header and body. + for (byte [] lineBytes = null; true;) { + lineBytes = LaxHttpParser.readRawLine(getIn()); + eolCharCount = getEolCharsCount(lineBytes); + if (eolCharCount <= 0) { + throw new IOException("Failed reading headers: " + + ((lineBytes != null)? new String(lineBytes): null)); + } + // Save the bytes read. + baos.write(lineBytes); + if ((lineBytes.length - eolCharCount) <= 0) { + // We've finished reading the http header. 
+ break; + } + } + + byte [] headerBytes = baos.toByteArray(); + // Save off where content body, post content headers, starts. + this.contentHeadersLength = headerBytes.length; + ByteArrayInputStream bais = + new ByteArrayInputStream(headerBytes); + if (!bais.markSupported()) { + throw new IOException("ByteArrayInputStream does not support mark"); + } + bais.mark(headerBytes.length); + // Read the status line. Don't let it into the parseHeaders function. + // It doesn't know what to do with it. + bais.read(statusBytes, 0, statusBytes.length); + this.contentHeaders = LaxHttpParser.parseHeaders(bais, + ARCConstants.DEFAULT_ENCODING); + bais.reset(); + return bais; + } + + public static class UnexpectedStartLineIOException + extends RecoverableIOException { + private static final long serialVersionUID = 1L; + + public UnexpectedStartLineIOException(final String reason) { + super(reason); + } + } + + /** + * @param bytes Array of bytes to examine for an EOL. + * @return Count of end-of-line characters or zero if none. + */ + private int getEolCharsCount(byte [] bytes) { + int count = 0; + if (bytes != null && bytes.length >=1 && + bytes[bytes.length - 1] == '\n') { + count++; + if (bytes.length >=2 && bytes[bytes.length -2] == '\r') { + count++; + } + } + return count; + } + + /** + * @return If headers are for a http response AND the headers have been + * read, return status code. Else return -1. + */ + public int getStatusCode() { + return this.statusCode; + } + + /** + * @return Returns length of content headers or -1 if headers have + * not yet been read. + */ + public int getContentHeadersLength() { + return this.contentHeadersLength; + } + + public Header[] getContentHeaders() { + return contentHeaders; + } + + /** + * @return Next character in this ARCRecord's content else -1 if at end of + * this record. 
+ * @throws IOException + */ + public int read() throws IOException { + int c = -1; + if (this.contentHeaderStream != null && + (this.contentHeaderStream.available() > 0)) { + // If http header, return bytes from it before we go to underlying + // stream. + c = this.contentHeaderStream.read(); + // If done with the header stream, null it out. + if (this.contentHeaderStream.available() <= 0) { + this.contentHeaderStream = null; + } + // do not increment position - + // the underlying ArchiveRecord stream allready did this + // incrementPosition(); + } else { + c = super.read(); + } + return c; + } + + public int read(byte [] b, int offset, int length) throws IOException { + int read = -1; + if (this.contentHeaderStream != null && + (this.contentHeaderStream.available() > 0)) { + // If http header, return bytes from it before we go to underlying + // stream. + read = Math.min(length, this.contentHeaderStream.available()); + if (read == 0) { + read = -1; + } else { + read = this.contentHeaderStream.read(b, offset, read); + } + // If done with the header stream, null it out. 
+ if (this.contentHeaderStream.available() <= 0) { + this.contentHeaderStream = null; + } + // do not increment position - + // the underlying ArchiveRecord stream allready did this + //incrementPosition(); + } else { + read = super.read(b, offset, length); + } + return read; + } + + @Override + public int available() { + return ((ArchiveRecord)this.in).available(); + } + + @Override + public void close() throws IOException { + ((ArchiveRecord)this.in).close(); + } + + @Override + public void dump() throws IOException { + ((ArchiveRecord)this.in).dump(); + } + + @Override + public void dump(OutputStream os) throws IOException { + ((ArchiveRecord)this.in).dump(os); + } + + @Override + protected String getDigest4Cdx(ArchiveRecordHeader h) { + return ((ArchiveRecord)this.in).getDigest4Cdx(h); + } + + @Override + public String getDigestStr() { + return ((ArchiveRecord)this.in).getDigestStr(); + } + + @Override + public ArchiveRecordHeader getHeader() { + return ((ArchiveRecord)this.in).getHeader(); + } + + @Override + protected String getIp4Cdx(ArchiveRecordHeader h) { + return ((ArchiveRecord)this.in).getIp4Cdx(h); + } + + @Override + protected String getMimetype4Cdx(ArchiveRecordHeader h) { + return ((ArchiveRecord)this.in).getMimetype4Cdx(h); + } + + @Override + public long getPosition() { + return ((ArchiveRecord)this.in).getPosition(); + } + + @Override + protected String getStatusCode4Cdx(ArchiveRecordHeader h) { + return ((ArchiveRecord)this.in).getStatusCode4Cdx(h); + } + + @Override + public boolean hasContentHeaders() { + return ((ArchiveRecord)this.in).hasContentHeaders(); + } + + @Override + protected void incrementPosition() { + ((ArchiveRecord)this.in).incrementPosition(); + } + + @Override + protected void incrementPosition(long incr) { + ((ArchiveRecord)this.in).incrementPosition(incr); + } + + @Override + protected boolean isEor() { + return ((ArchiveRecord)this.in).isEor(); + } + + @Override + public boolean isStrict() { + return 
((ArchiveRecord)this.in).isStrict(); + } + + @Override + public boolean markSupported() { + return ((ArchiveRecord)this.in).markSupported(); + } + + @Override + protected String outputCdx(String strippedFileName) throws IOException { + return ((ArchiveRecord)this.in).outputCdx(strippedFileName); + } + + @Override + protected void setEor(boolean eor) { + ((ArchiveRecord)this.in).setEor(eor); + } + + @Override + protected void setHeader(ArchiveRecordHeader header) { + ((ArchiveRecord)this.in).setHeader(header); + } + + @Override + public void setStrict(boolean strict) { + ((ArchiveRecord)this.in).setStrict(strict); + } + + @Override + protected void skip() throws IOException { + ((ArchiveRecord)this.in).skip(); + } + + @Override + public long skip(long n) throws IOException { + return ((ArchiveRecord)this.in).skip(n); + } +} diff --git a/src/main/java/org/archive/io/LoudObjectOutputStream.java b/src/main/java/org/archive/io/LoudObjectOutputStream.java new file mode 100644 index 00000000..959c2620 --- /dev/null +++ b/src/main/java/org/archive/io/LoudObjectOutputStream.java @@ -0,0 +1,63 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * ObjectOutputStream that logs the class name of each object that is written
 * to the stream. Useful for tracking down sources of NotSerializableException.
 *
 * Each distinct class name is logged once per stream instance, at INFO level.
 *
 * @author pjack
 */
public class LoudObjectOutputStream extends ObjectOutputStream {

    private static final Logger LOGGER = Logger.getLogger(
            LoudObjectOutputStream.class.getName());

    /** Class names already reported, so that each is logged only once. */
    private final Set<String> alreadyLogged = new HashSet<String>();

    /**
     * @param out stream to serialize to
     * @throws IOException if the superclass constructor fails
     */
    public LoudObjectOutputStream(OutputStream out) throws IOException {
        super(out);
        // Turn on the replaceObject() hook; it is used here purely to
        // observe objects, never to substitute them.
        this.enableReplaceObject(true);
    }

    /**
     * Logs the class of <code>obj</code> the first time that class is seen.
     *
     * @param obj object about to be written; may be null
     * @return <code>obj</code> itself, unchanged
     */
    @Override
    protected Object replaceObject(Object obj) throws IOException {
        if (obj != null) {
            String name = obj.getClass().getName();
            if (alreadyLogged.add(name)) {
                LOGGER.info("WROTE: " + name);
            }
        }
        return obj;
    }
}
/**
 * Filter stream that keeps a running total of the bytes written through it
 * and can be told to ignore flush() calls.
 *
 * @contributor gojomo
 */
public class MiserOutputStream extends FilterOutputStream {
    /** Bytes successfully handed to the wrapped stream so far. */
    protected long count;
    /** Whether flush() is forwarded to the wrapped stream. */
    protected boolean passFlushes;

    /**
     * Wraps another output stream, counting the number of bytes written.
     * Flushes are forwarded.
     *
     * @param out the output stream to be wrapped
     */
    public MiserOutputStream(OutputStream out) {
        super(out);
        this.passFlushes = true;
    }

    /**
     * Wraps another output stream, counting the number of bytes written.
     *
     * @param out the output stream to be wrapped
     * @param passFlushes true to forward flush() calls, false to drop them
     */
    public MiserOutputStream(OutputStream out, boolean passFlushes) {
        super(out);
        this.passFlushes = passFlushes;
    }

    /** Returns the number of bytes written. */
    public long getCount() {
        return this.count;
    }

    @Override
    public void write(byte[] b, int off, int len) throws IOException {
        // Count only after the wrapped stream accepted the bytes.
        this.out.write(b, off, len);
        this.count = this.count + len;
    }

    @Override
    public void write(int b) throws IOException {
        this.out.write(b);
        this.count = this.count + 1;
    }

    /**
     * Re-enables flush forwarding before delegating to the superclass close.
     */
    @Override
    public void close() throws IOException {
        this.passFlushes = true;
        super.close();
    }

    @Override
    public void flush() throws IOException {
        if (!this.passFlushes) {
            return;
        }
        super.flush();
    }
}
+ * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +/** + * Empty subclass of {@link org.archive.util.zip.NoGzipMagicException}; it + * declares no members of its own. + * + * @deprecated use {@link org.archive.util.zip.NoGzipMagicException} + */ +@Deprecated +public class NoGzipMagicException extends org.archive.util.zip.NoGzipMagicException { +} diff --git a/src/main/java/org/archive/io/ObjectPlusFilesInputStream.java b/src/main/java/org/archive/io/ObjectPlusFilesInputStream.java new file mode 100644 index 00000000..892860ed --- /dev/null +++ b/src/main/java/org/archive/io/ObjectPlusFilesInputStream.java @@ -0,0 +1,143 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.io; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.ObjectInputStream; +import java.util.Iterator; +import java.util.LinkedList; + +import org.archive.util.FileUtils; + + +/** + * Enhanced ObjectOutputStream with support for restoring + * files that had been saved, in parallel with object + * serialization. + * + * @author gojomo + * + */ +public class ObjectPlusFilesInputStream extends ObjectInputStream { + protected LinkedList auxiliaryDirectoryStack = new LinkedList(); + protected LinkedList postRestoreTasks = new LinkedList(); + + /** + * Instantiate over the given stream and using the supplied + * auxiliary storage directory. + * + * @param in + * @param storeDir + * @throws IOException + */ + public ObjectPlusFilesInputStream(InputStream in, File storeDir) + throws IOException { + super(in); + auxiliaryDirectoryStack.addFirst(storeDir); + } + + /** + * Push another default storage directory for use + * until popped. + * + * @param dir + */ + public void pushAuxiliaryDirectory(String dir) { + auxiliaryDirectoryStack. + addFirst(new File(getAuxiliaryDirectory(), dir)); + } + + /** + * Discard the top auxiliary directory. + */ + public void popAuxiliaryDirectory() { + auxiliaryDirectoryStack.removeFirst(); + } + + /** + * Return the top auxiliary directory, from + * which saved files are restored. + * + * @return Auxillary directory. + */ + public File getAuxiliaryDirectory() { + return (File)auxiliaryDirectoryStack.getFirst(); + } + + /** + * Restore a file from storage, using the name and length + * info on the serialization stream and the file from the + * current auxiliary directory, to the given File. 
+ * + * @param destination + * @throws IOException + */ + public void restoreFile(File destination) throws IOException { + String nameAsStored = readUTF(); + long lengthAtStoreTime = readLong(); + File storedFile = new File(getAuxiliaryDirectory(),nameAsStored); + FileUtils.copyFile(storedFile, destination, lengthAtStoreTime); + } + + /** + * Restore a file from storage, using the name and length + * info on the serialization stream and the file from the + * current auxiliary directory, to the given File. + * + * @param directory + * @throws IOException + */ + public void restoreFileTo(File directory) throws IOException { + String nameAsStored = readUTF(); + long lengthAtStoreTime = readLong(); + File storedFile = new File(getAuxiliaryDirectory(),nameAsStored); + File destination = new File(directory,nameAsStored); + FileUtils.copyFile(storedFile, destination, lengthAtStoreTime); + } + + /** + * Register a task to be done when the ObjectPlusFilesInputStream + * is closed. + * + * @param task + */ + public void registerFinishTask(Runnable task) { + postRestoreTasks.addFirst(task); + } + + private void doFinishTasks() { + Iterator iter = postRestoreTasks.iterator(); + while(iter.hasNext()) { + ((Runnable)iter.next()).run(); + } + } + + /** + * In addition to default, do any registered cleanup tasks. + * + * @see java.io.InputStream#close() + */ + public void close() throws IOException { + super.close(); + doFinishTasks(); + } +} diff --git a/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java b/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java new file mode 100644 index 00000000..224f24e7 --- /dev/null +++ b/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java @@ -0,0 +1,134 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. 
+ * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.File; +import java.io.IOException; +import java.io.ObjectOutputStream; +import java.io.OutputStream; +import java.util.LinkedList; + +import org.archive.util.FileUtils; + + +/** + * Enhanced ObjectOutputStream which maintains (a stack of) auxiliary + * directories and offers convenience methods for serialized objects + * to save their related disk files alongside their serialized version. + * + * @author gojomo + */ +public class ObjectPlusFilesOutputStream extends ObjectOutputStream { + protected LinkedList auxiliaryDirectoryStack = new LinkedList(); + + /** + * Constructor + * + * @param out + * @param topDirectory + * @throws java.io.IOException + */ + public ObjectPlusFilesOutputStream(OutputStream out, File topDirectory) throws IOException { + super(out); + auxiliaryDirectoryStack.addFirst(topDirectory); + } + + /** + * Add another subdirectory for any file-capture needs during the + * current serialization. + * + * @param dir + */ + public void pushAuxiliaryDirectory(String dir) { + auxiliaryDirectoryStack.addFirst(new File(getAuxiliaryDirectory(),dir)); + } + + /** + * Remove the top subdirectory. + * + */ + public void popAuxiliaryDirectory() { + auxiliaryDirectoryStack.removeFirst(); + } + + /** + * Return the current auxiliary directory for storing + * files associated with serialized objects. + * + * @return Auxillary directory. 
+ */ + public File getAuxiliaryDirectory() { + return (File)auxiliaryDirectoryStack.getFirst(); + } + + /** + * Store a snapshot of an object's supporting file to the + * current auxiliary directory. Should only be used for + * files which are strictly appended-to, because it tries + * to use a "hard link" where possible (meaning that + * future edits to the original file's contents will + * also affect the snapshot). + * + * Remembers current file extent to allow a future restore + * to ignore subsequent appended data. + * + * @param file + * @throws IOException + */ + public void snapshotAppendOnlyFile(File file) throws IOException { + // write filename + String name = file.getName(); + writeUTF(name); + // write current file length + writeLong(file.length()); + File auxDir = getAuxiliaryDirectory(); + if(!auxDir.exists()) { + FileUtils.ensureWriteableDirectory(auxDir); + } + File destination = new File(auxDir,name); + hardlinkOrCopy(file, destination); + } + + /** + * Create a backup of this given file, first by trying a "hard + * link", then by using a copy if hard linking is unavailable + * (either because it is unsupported or the origin and checkpoint + * directories are on different volumes). + * + * @param file + * @param destination + * @throws IOException + */ + private void hardlinkOrCopy(File file, File destination) throws IOException { + // For Linux/UNIX, try a hard link first. 
+ Process link = Runtime.getRuntime().exec("ln "+file.getAbsolutePath()+" "+destination.getAbsolutePath()); + // TODO NTFS also supports hard links; add appropriate try + try { + link.waitFor(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if(link.exitValue()!=0) { + // hard link failed + FileUtils.copyFile(file,destination); + } + } + +} diff --git a/src/main/java/org/archive/io/OriginSeekInputStream.java b/src/main/java/org/archive/io/OriginSeekInputStream.java new file mode 100644 index 00000000..00605d82 --- /dev/null +++ b/src/main/java/org/archive/io/OriginSeekInputStream.java @@ -0,0 +1,121 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +import java.io.IOException; + + +/** + * Alters the origin of some other SeekInputStream. This class allows you + * to completely ignore everything in the underlying stream before a specified + * position, the origin position. + * + *

With the exception of {@link #position()} and {@link position(long)}, + * all of the methods in this class simply delegate to the underlying input + * stream. The position methods adjust the position of the + * underlying stream relative to the origin specified at construction time. + * + * @author pjack + */ +public class OriginSeekInputStream extends SeekInputStream { + + + /** + * The underlying stream. + */ + final private SeekInputStream input; + + + /** + * The origin position. In other words, this.position(0) + * resolves to input.position(start). + */ + final private long origin; + + + /** + * Constructor. + * + * @param input the underlying stream + * @param origin the origin position + * @throws IOException if an IO error occurs + */ + public OriginSeekInputStream(SeekInputStream input, long origin) + throws IOException { + this.input = input; + this.origin = origin; + input.position(origin); + } + + + @Override + public int available() throws IOException { + return input.available(); + } + + + @Override + public int read() throws IOException { + return input.read(); + } + + + @Override + public int read(byte[] buf, int ofs, int len) throws IOException { + return input.read(buf, ofs, len); + } + + + @Override + public int read(byte[] buf) throws IOException { + return input.read(buf); + } + + + @Override + public long skip(long count) throws IOException { + return input.skip(count); + } + + + /** + * Returns the position of the underlying stream relative to the origin. + * + * @return the relative position + * @throws IOException if an IO error occurs + */ + public long position() throws IOException { + return input.position() - origin; + } + + + /** + * Positions the underlying stream relative to the origin. + * In other words, this.position(0) resolves to input.position(origin), + * where input is underlying stream and origin is the origin specified + * at construction time. 
+ * + * @param p the new position for this stream + * @throws IOException if an IO error occurs + */ + public void position(long p) throws IOException { + input.position(p + origin); + } +} diff --git a/src/main/java/org/archive/io/Preformatter.java b/src/main/java/org/archive/io/Preformatter.java new file mode 100644 index 00000000..dcd31bb6 --- /dev/null +++ b/src/main/java/org/archive/io/Preformatter.java @@ -0,0 +1,32 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.util.logging.LogRecord; + +/** + * Interface indicating a logging Formatter can preformat a record (outside + * the standard-implementation synchronized block) and cache it, returning it + * for the next request for formatting from the same thread. + * @contributor gojomo + */ +public interface Preformatter { + public void preformat(LogRecord record); + public void clear(); +} diff --git a/src/main/java/org/archive/io/RandomAccessInputStream.java b/src/main/java/org/archive/io/RandomAccessInputStream.java new file mode 100644 index 00000000..d8dd260b --- /dev/null +++ b/src/main/java/org/archive/io/RandomAccessInputStream.java @@ -0,0 +1,180 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). 
+ * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; + + +/** + * Wraps a RandomAccessFile with an InputStream interface. + * + * @author gojomo + */ +public class RandomAccessInputStream extends SeekInputStream { + + /** + * Reference to the random access file this stream is reading from. + */ + private RandomAccessFile raf = null; + + /** + * When mark is called, save here the current position so we can go back + * on reset. + */ + private long markpos = -1; + + /** + * True if we are to close the underlying random access file when this + * stream is closed. + */ + private boolean sympathyClose; + + /** + * Constructor. + * + * If using this constructor, caller created the RAF and therefore + * its assumed wants to control close of the RAF. The RAF.close + * is not called if this constructor is used on close of this stream. + * + * @param raf RandomAccessFile to wrap. + * @throws IOException + */ + public RandomAccessInputStream(RandomAccessFile raf) + throws IOException { + this(raf, false, 0); + } + + /** + * Constructor. + * + * @param file File to get RAFIS on. Creates an RAF from passed file. + * Closes the created RAF when this stream is closed. 
+ * @throws IOException + */ + public RandomAccessInputStream(final File file) + throws IOException { + this(new RandomAccessFile(file, "r"), true, 0); + } + + /** + * Constructor. + * + * @param file File to get RAFIS on. Creates an RAF from passed file. + * Closes the created RAF when this stream is closed. + * @param offset + * @throws IOException + */ + public RandomAccessInputStream(final File file, final long offset) + throws IOException { + this(new RandomAccessFile(file, "r"), true, offset); + } + + /** + * @param raf RandomAccessFile to wrap. + * @param sympathyClose Set to true if we are to close the RAF + * file when this stream is closed. + * @param offset + * @throws IOException + */ + public RandomAccessInputStream(final RandomAccessFile raf, + final boolean sympathyClose, final long offset) + throws IOException { + super(); + this.sympathyClose = sympathyClose; + this.raf = raf; + if (offset > 0) { + this.raf.seek(offset); + } + } + + /* (non-Javadoc) + * @see java.io.InputStream#read() + */ + public int read() throws IOException { + return this.raf.read(); + } + + /* (non-Javadoc) + * @see java.io.InputStream#read(byte[], int, int) + */ + public int read(byte[] b, int off, int len) throws IOException { + return this.raf.read(b, off, len); + } + + /* (non-Javadoc) + * @see java.io.InputStream#read(byte[]) + */ + public int read(byte[] b) throws IOException { + return this.raf.read(b); + } + + /* (non-Javadoc) + * @see java.io.InputStream#skip(long) + */ + public long skip(long n) throws IOException { + this.raf.seek(this.raf.getFilePointer() + n); + return n; + } + + public long position() throws IOException { + return this.raf.getFilePointer(); + } + + public void position(long position) throws IOException { + this.raf.seek(position); + } + + public int available() throws IOException { + long amount = this.raf.length() - this.position(); + return (amount >= Integer.MAX_VALUE)? 
Integer.MAX_VALUE: (int)amount; + } + + public boolean markSupported() { + return true; + } + + public synchronized void mark(int readlimit) { + try { + this.markpos = position(); + } catch (IOException e) { + // Set markpos to -1. Will cause exception reset. + this.markpos = -1; + } + } + + public synchronized void reset() throws IOException { + if (this.markpos == -1) { + throw new IOException("Mark has not been set."); + } + position(this.markpos); + } + + public void close() throws IOException { + try { + super.close(); + } finally { + if (this.sympathyClose) { + this.raf.close(); + } + } + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/RandomAccessOutputStream.java b/src/main/java/org/archive/io/RandomAccessOutputStream.java new file mode 100644 index 00000000..225f995f --- /dev/null +++ b/src/main/java/org/archive/io/RandomAccessOutputStream.java @@ -0,0 +1,69 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.RandomAccessFile; + + +/** + * Wraps a RandomAccessFile with OutputStream interface. 
/**
 * Wraps a RandomAccessFile with OutputStream interface.
 *
 * <p>Every write goes straight to the wrapped file at its current file
 * pointer; closing this stream closes the wrapped file.
 *
 * @author gojomo
 */
public class RandomAccessOutputStream extends OutputStream {
    /** The file all writes are forwarded to. */
    protected RandomAccessFile raf;

    /**
     * Wrap the given RandomAccessFile
     *
     * @param raf file to write to
     */
    public RandomAccessOutputStream(RandomAccessFile raf) {
        this.raf = raf;
    }

    /**
     * Writes a single byte to the wrapped file.
     *
     * @see java.io.OutputStream#write(int)
     */
    @Override
    public void write(int b) throws IOException {
        raf.write(b);
    }

    /**
     * Writes the whole array to the wrapped file.
     *
     * @see java.io.OutputStream#write(byte[])
     */
    @Override
    public void write(byte[] b) throws IOException {
        raf.write(b);
    }

    /**
     * Writes {@code len} bytes starting at {@code off} to the wrapped file.
     *
     * @see java.io.OutputStream#write(byte[], int, int)
     */
    @Override
    public void write(byte[] b, int off, int len) throws IOException {
        raf.write(b, off, len);
    }

    /**
     * Closes the wrapped file.
     *
     * @see java.io.OutputStream#close()
     */
    @Override
    public void close() throws IOException {
        raf.close();
    }
}
/**
 * Interface for objects that can provide a Reader view of their
 * contents.
 */
public interface ReadSource {
    /**
     * Obtain a Reader. Not named 'getReader' so that it is not
     * considered a simple costless read-only property by
     * bean-convention introspection tools.
     *
     * @return a Reader on this object
     */
    Reader obtainReader();
}
/**
 * IOException subtype used for recorder failures; the length, timeout and
 * header-size recorder exceptions in this package extend it.
 *
 * @author Gordon Mohr
 */
public class RecorderIOException extends IOException {

    private static final long serialVersionUID = 5907470275350314277L;

    /** Creates an exception without a detail message. */
    public RecorderIOException() {
    }

    /**
     * Creates an exception with the given detail message.
     *
     * @param msg detail message
     */
    public RecorderIOException(String msg) {
        super(msg);
    }
}
+ * + * @author Gordon Mohr + */ +public class RecorderLengthExceededException +extends RecorderIOException { + + private static final long serialVersionUID = 6655419033414648444L; + + public RecorderLengthExceededException() { + super(); + } + + public RecorderLengthExceededException(String msg) { + super(msg); + } +} diff --git a/src/main/java/org/archive/io/RecorderTimeoutException.java b/src/main/java/org/archive/io/RecorderTimeoutException.java new file mode 100644 index 00000000..32be5b5d --- /dev/null +++ b/src/main/java/org/archive/io/RecorderTimeoutException.java @@ -0,0 +1,37 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +/** + * Indicates a timeout thrown by the RecordingInputStream. 
+ * + * @author Gordon Mohr + */ +public class RecorderTimeoutException extends RecorderIOException { + + private static final long serialVersionUID = 7433214063765078269L; + + public RecorderTimeoutException() { + super(); + } + + public RecorderTimeoutException(String msg) { + super(msg); + } +} diff --git a/src/main/java/org/archive/io/RecorderTooMuchHeaderException.java b/src/main/java/org/archive/io/RecorderTooMuchHeaderException.java new file mode 100644 index 00000000..23f5d264 --- /dev/null +++ b/src/main/java/org/archive/io/RecorderTooMuchHeaderException.java @@ -0,0 +1,40 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.io; + + +/** + * Indicates a too much header material exception thrown by the Recorder + * (specificially the RecordingOutputStream) + * + * @author Gordon Mohr + */ +public class RecorderTooMuchHeaderException +extends RecorderIOException { + + private static final long serialVersionUID = 3528516034898129150L; + + public RecorderTooMuchHeaderException() { + super(); + } + + public RecorderTooMuchHeaderException(String msg) { + super(msg); + } +} diff --git a/src/main/java/org/archive/io/RecordingInputStream.java b/src/main/java/org/archive/io/RecordingInputStream.java new file mode 100644 index 00000000..b46905ed --- /dev/null +++ b/src/main/java/org/archive/io/RecordingInputStream.java @@ -0,0 +1,355 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.IOException; +import java.io.InputStream; +import java.net.SocketException; +import java.net.SocketTimeoutException; +import java.security.MessageDigest; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.io.IOUtils; + + +/** + * Stream which records all data read from it, which it acquires from a wrapped + * input stream. 
+ * + * Makes use of a RecordingOutputStream for recording because of its being + * file backed so we can write massive amounts of data w/o worrying about + * overflowing memory. + * + * @author gojomo + * + */ +public class RecordingInputStream + extends InputStream { + + protected static Logger logger = + Logger.getLogger("org.archive.io.RecordingInputStream"); + + /** + * Where we are recording to. + */ + private RecordingOutputStream recordingOutputStream; + + /** + * Stream to record. + */ + private InputStream in = null; + + /** + * Reusable buffer to avoid reallocation on each readFullyUntil + */ + protected byte[] drainBuffer = new byte[16*1024]; + + /** + * Create a new RecordingInputStream. + * + * @param bufferSize Size of buffer to use. + * @param backingFilename Name of backing file. + */ + public RecordingInputStream(int bufferSize, String backingFilename) + { + this.recordingOutputStream = new RecordingOutputStream(bufferSize, + backingFilename); + } + + public void open(InputStream wrappedStream) throws IOException { + logger.fine(Thread.currentThread().getName() + " opening " + + wrappedStream + ", " + Thread.currentThread().getName()); + if(isOpen()) { + // error; should not be opening/wrapping in an unclosed + // stream remains open + throw new IOException("RIS already open for " + +Thread.currentThread().getName()); + } + try { + this.in = wrappedStream; + this.recordingOutputStream.open(); + } catch (IOException ioe) { + close(); // ...and rethrow... 
+ throw ioe; + } + } + + public int read() throws IOException { + if (!isOpen()) { + throw new IOException("Stream closed " + + Thread.currentThread().getName()); + } + int b = this.in.read(); + if (b != -1) { + assert this.recordingOutputStream != null: "ROS is null " + + Thread.currentThread().getName(); + this.recordingOutputStream.write(b); + } + return b; + } + + public int read(byte[] b, int off, int len) throws IOException { + if (!isOpen()) { + throw new IOException("Stream closed " + + Thread.currentThread().getName()); + } + int count = this.in.read(b,off,len); + if (count > 0) { + assert this.recordingOutputStream != null: "ROS is null " + + Thread.currentThread().getName(); + this.recordingOutputStream.write(b,off,count); + } + return count; + } + + public int read(byte[] b) throws IOException { + if (!isOpen()) { + throw new IOException("Stream closed " + + Thread.currentThread().getName()); + } + int count = this.in.read(b); + if (count > 0) { + assert this.recordingOutputStream != null: "ROS is null " + + Thread.currentThread().getName(); + this.recordingOutputStream.write(b,0,count); + } + return count; + } + + public void close() throws IOException { + if (logger.isLoggable(Level.FINE)) { + logger.fine(Thread.currentThread().getName() + " closing " + + this.in + ", " + Thread.currentThread().getName()); + } + IOUtils.closeQuietly(this.in); + this.in = null; + IOUtils.closeQuietly(this.recordingOutputStream); + } + + public ReplayInputStream getReplayInputStream() throws IOException { + return this.recordingOutputStream.getReplayInputStream(); + } + + public ReplayInputStream getMessageBodyReplayInputStream() throws IOException { + return this.recordingOutputStream.getMessageBodyReplayInputStream(); + } + + public long readFully() throws IOException { + while(read(drainBuffer) != -1) { + // Empty out stream. 
+ continue; + } + return this.recordingOutputStream.getSize(); + } + + /** + * Read all of a stream (Or read until we timeout or have read to the max). + * @param softMaxLength Maximum length to read; if zero or < 0, then no + * limit. If met, return normally. + * @param hardMaxLength Maximum length to read; if zero or < 0, then no + * limit. If exceeded, throw RecorderLengthExceededException + * @param timeout Timeout in milliseconds for total read; if zero or + * negative, timeout is Long.MAX_VALUE. If exceeded, throw + * RecorderTimeoutException + * @param maxBytesPerMs How many bytes per millisecond. + * @throws IOException failed read. + * @throws RecorderLengthExceededException + * @throws RecorderTimeoutException + * @throws InterruptedException + */ + public void readFullyOrUntil(long softMaxLength) + throws IOException, RecorderLengthExceededException, + RecorderTimeoutException, InterruptedException { + // Check we're open before proceeding. + if (!isOpen()) { + // TODO: should this be a noisier exception-raising error? + return; + } + + long totalBytes = 0L; + long bytesRead = -1L; + long maxToRead = -1; + while (true) { + try { + // read no more than soft max + maxToRead = (softMaxLength <= 0) + ? drainBuffer.length + : Math.min(drainBuffer.length, softMaxLength - totalBytes); + // nor more than hard max + maxToRead = Math.min(maxToRead, recordingOutputStream.getRemainingLength()); + // but always at least 1 (to trigger hard max exception + maxToRead = Math.max(maxToRead, 1); + + bytesRead = read(drainBuffer,0,(int)maxToRead); + if (bytesRead == -1) { + break; + } + totalBytes += bytesRead; + + if (Thread.interrupted()) { + throw new InterruptedException("Interrupted during IO"); + } + } catch (SocketTimeoutException e) { + // A socket timeout is just a transient problem, meaning + // nothing was available in the configured timeout period, + // but something else might become available later. 
+ // Take this opportunity to check the overall + // timeout (below). One reason for this timeout is + // servers that keep up the connection, 'keep-alive', even + // though we asked them to not keep the connection open. + if (logger.isLoggable(Level.FINE)) { + logger.log(Level.FINE, "socket timeout", e); + } + // check for interrupt + if (Thread.interrupted()) { + throw new InterruptedException("Interrupted during IO"); + } + // check for overall timeout + recordingOutputStream.checkLimits(); + } catch (SocketException se) { + throw se; + } catch (NullPointerException e) { + // [ 896757 ] NPEs in Andy's Th-Fri Crawl. + // A crawl was showing NPE's in this part of the code but can + // not reproduce. Adding this rethrowing catch block w/ + // diagnostics to help should we come across the problem in the + // future. + throw new NullPointerException("Stream " + this.in + ", " + + e.getMessage() + " " + Thread.currentThread().getName()); + } + + // if have read 'enough', just finish + if (softMaxLength > 0 && totalBytes >= softMaxLength) { + break; // return + } + } + } + + public long getSize() { + return this.recordingOutputStream.getSize(); + } + + public void markContentBegin() { + this.recordingOutputStream.markMessageBodyBegin(); + } + + public long getContentBegin() { + return this.recordingOutputStream.getMessageBodyBegin(); + } + + public void startDigest() { + this.recordingOutputStream.startDigest(); + } + + /** + * Convenience method for setting SHA1 digest. + */ + public void setSha1Digest() { + this.recordingOutputStream.setSha1Digest(); + } + + /** + * Sets a digest algorithm which may be applied to recorded data. + * As usually only a subset of the recorded data should + * be fed to the digest, you must also call startDigest() + * to begin digesting. + * + * @param algorithm + */ + public void setDigest(String algorithm) { + this.recordingOutputStream.setDigest(algorithm); + } + + /** + * Sets a digest function which may be applied to recorded data. 
+ * As usually only a subset of the recorded data should + * be fed to the digest, you must also call startDigest() + * to begin digesting. + * + * @param md + */ + public void setDigest(MessageDigest md) { + this.recordingOutputStream.setDigest(md); + } + + /** + * Return the digest value for any recorded, digested data. Call + * only after all data has been recorded; otherwise, the running + * digest state is ruined. + * + * @return the digest final value + */ + public byte[] getDigestValue() { + return this.recordingOutputStream.getDigestValue(); + } + + public long getResponseContentLength() { + return this.recordingOutputStream.getResponseContentLength(); + } + + public void closeRecorder() throws IOException { + this.recordingOutputStream.closeRecorder(); + } + + /** + * @return True if we've been opened. + */ + public boolean isOpen() + { + return this.in != null; + } + + @Override + public synchronized void mark(int readlimit) { + this.in.mark(readlimit); + this.recordingOutputStream.mark(); + } + + @Override + public boolean markSupported() { + return this.in.markSupported(); + } + + @Override + public synchronized void reset() throws IOException { + this.in.reset(); + this.recordingOutputStream.reset(); + } + + /** + * Set limits to be enforced by internal recording-out + */ + public void setLimits(long hardMax, long timeoutMs, long maxRateKBps) { + recordingOutputStream.setLimits(hardMax, timeoutMs, maxRateKBps); + } + + /** + * Expose the amount of in-memory buffering used by the internal + * recording stream. 
+ * @return int buffer size + */ + public int getRecordedBufferLength() { + return recordingOutputStream.getBufferLength(); + } + + public void clearForReuse() throws IOException { + recordingOutputStream.clearForReuse(); + } +} diff --git a/src/main/java/org/archive/io/RecordingOutputStream.java b/src/main/java/org/archive/io/RecordingOutputStream.java new file mode 100644 index 00000000..4d0713da --- /dev/null +++ b/src/main/java/org/archive/io/RecordingOutputStream.java @@ -0,0 +1,576 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import it.unimi.dsi.fastutil.io.FastBufferedOutputStream; + +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.logging.Level; +import java.util.logging.Logger; + + +/** + * An output stream that records all writes to wrapped output + * stream. + * + * A RecordingOutputStream can be wrapped around any other + * OutputStream to record all bytes written to it. You can + * then request a ReplayInputStream to read those bytes. + * + *

The RecordingOutputStream uses an in-memory buffer and + * backing disk file to allow it to record streams of + * arbitrary length limited only by available disk space. + * + *

As long as the stream recorded is smaller than the + * in-memory buffer, no disk access will occur. + * + *

Recorded content can be recovered as a ReplayInputStream + * (via getReplayInputStream() or, for only the content after + * the content-begin-mark is set, getContentReplayInputStream() ) + * or as a ReplayCharSequence (via getReplayCharSequence()). + * + *

This class is also used as a straight output stream + * by {@link RecordingInputStream} to which it records all reads. + * {@link RecordingInputStream} is exploiting the file backed buffer + * facility of this class passing null for the stream + * to wrap. TODO: Make a FileBackedOutputStream class that is + * subclassed by RecordingInputStream. + * + * @author gojomo + * + */ +public class RecordingOutputStream extends OutputStream { + protected static Logger logger = + Logger.getLogger(RecordingOutputStream.class.getName()); + + /** + * Size of recording. + * + * Later passed to ReplayInputStream on creation. It uses it to know when + * EOS. + */ + protected long size = 0; + + protected String backingFilename; + protected OutputStream diskStream = null; + + /** + * Buffer we write recordings to. + * + * We write all recordings here first till its full. Thereafter we + * write the backing file. + */ + private byte[] buffer; + + /** current virtual position in the recording */ + private long position; + + /** flag to disable recording */ + private boolean recording; + + /** + * Reusable buffer for FastBufferedOutputStream + */ + protected byte[] bufStreamBuf = + new byte [ FastBufferedOutputStream.DEFAULT_BUFFER_SIZE ]; + + /** + * True if we're to digest content. + */ + private boolean shouldDigest = false; + + /** + * Digest instance. + */ + private MessageDigest digest = null; + + /** + * Define for SHA1 algarithm. + */ + private static final String SHA1 = "SHA1"; + + /** + * Maximum amount of header material to accept without the content + * body beginning -- if more, throw a RecorderTooMuchHeaderException. + * TODO: make configurable? make smaller? 
+ */ + protected static final long MAX_HEADER_MATERIAL = 1024*1024; // 1MB + + // configurable max length, max time limits + /** maximum length of material to record before throwing exception */ + protected long maxLength = Long.MAX_VALUE; + /** maximum time to record before throwing exception */ + protected long timeoutMs = Long.MAX_VALUE; + /** maximum rate to record (adds delays to hit target rate) */ + protected long maxRateBytesPerMs = Long.MAX_VALUE; + /** time recording begins for timeout, rate calculations */ + protected long startTime = Long.MAX_VALUE; + + /** + * When recording HTTP, where the content-body starts. + */ + protected long messageBodyBeginMark; + + /** + * Stream to record. + */ + private OutputStream out = null; + + // mark/reset support + /** furthest position reached before any reset()s */ + private long maxPosition = 0; + /** remembered position to reset() to */ + private long markPosition = 0; + + /** + * Create a new RecordingOutputStream. + * + * @param bufferSize Buffer size to use. + * @param backingFilename Name of backing file to use. + */ + public RecordingOutputStream(int bufferSize, String backingFilename) { + this.buffer = new byte[bufferSize]; + this.backingFilename = backingFilename; + recording = true; + } + + /** + * Wrap the given stream, both recording and passing along any data written + * to this RecordingOutputStream. + * + * @throws IOException If failed creation of backing file. + */ + public void open() throws IOException { + this.open(null); + } + + /** + * Wrap the given stream, both recording and passing along any data written + * to this RecordingOutputStream. + * + * @param wrappedStream Stream to wrap. May be null for case where we + * want to write to a file backed stream only. + * + * @throws IOException If failed creation of backing file. 
+ */ + public void open(OutputStream wrappedStream) throws IOException { + if(isOpen()) { + // error; should not be opening/wrapping in an unclosed + // stream remains open + throw new IOException("ROS already open for " + +Thread.currentThread().getName()); + } + clearForReuse(); + this.out = wrappedStream; + if (this.diskStream == null) { + // TODO: Fix so we only make file when its actually needed. + FileOutputStream fis = new FileOutputStream(this.backingFilename); + + this.diskStream = new RecyclingFastBufferedOutputStream(fis, bufStreamBuf); + } + startTime = System.currentTimeMillis(); + } + + public void write(int b) throws IOException { + if(position< maxPosition) { + if(position+len<=maxPosition) { + // revisiting; do nothing but advance position + position += len; + return; + } + // consume part of the array doing nothing but advancing position + long consumeRange = maxPosition - position; + position += consumeRange; + off += consumeRange; + len -= consumeRange; + } + if(recording) { + record(b, off, len); + } + if (this.out != null) { + this.out.write(b, off, len); + } + checkLimits(); + } + + /** + * Check any enforced limits. + */ + protected void checkLimits() throws RecorderIOException { + // too much material before finding end of headers? + if (messageBodyBeginMark<0) { + // no mark yet + if(position>MAX_HEADER_MATERIAL) { + throw new RecorderTooMuchHeaderException(); + } + } + // overlong? + if(position>maxLength) { + throw new RecorderLengthExceededException(); + } + // taking too long? + long duration = System.currentTimeMillis() - startTime; + duration = Math.max(duration,1); // !divzero + if(duration>timeoutMs) { + throw new RecorderTimeoutException(); + } + // need to throttle reading to hit max configured rate? 
+ if(position/duration > maxRateBytesPerMs) { + long desiredDuration = position / maxRateBytesPerMs; + try { + Thread.sleep(desiredDuration-duration); + } catch (InterruptedException e) { + logger.log(Level.WARNING, + "bandwidth throttling sleep interrupted", e); + // restore the interrupt flag so callers polling + // Thread.interrupted() still see the interruption + Thread.currentThread().interrupt(); + } + } + } + + /** + * Record the given byte for later recovery + * + * @param b Int to record. + * + * @exception IOException Failed write to backing file, or the in-memory + * buffer is full and no backing disk stream has been opened. + */ + private void record(int b) throws IOException { + if (this.shouldDigest) { + this.digest.update((byte)b); + } + if (this.position >= this.buffer.length) { + // Its possible to call write w/o having first opened a + // stream. Fail explicitly (as tailRecord does) rather than + // relying on an assert that is skipped when assertions are off. + if (this.diskStream == null) { + throw new IOException("diskstream is null"); + } + this.diskStream.write(b); + } else { + this.buffer[(int) this.position] = (byte) b; + } + this.position++; + } + + /** + * Record the given byte-array range for recovery later + * + * @param b Buffer to record. + * @param off Offset into buffer at which to start recording. + * @param len Length of buffer to record. + * + * @exception IOException Failed write to backing file. + */ + private void record(byte[] b, int off, int len) throws IOException { + if(this.shouldDigest) { + assert this.digest != null: "Digest is null."; + this.digest.update(b, off, len); + } + tailRecord(b, off, len); + } + + /** + * Record without digesting. + * + * @param b Buffer to record. + * @param off Offset into buffer at which to start recording. + * @param len Length of buffer to record. + * + * @exception IOException Failed write to backing file. + */ + private void tailRecord(byte[] b, int off, int len) throws IOException { + if(this.position >= this.buffer.length){ + // TODO: Its possible to call write w/o having first opened a + // stream. Lets protect ourselves against this. 
+ if (this.diskStream == null) { + throw new IOException("diskstream is null"); + } + this.diskStream.write(b, off, len); + this.position += len; + } else { + assert this.buffer != null: "Buffer is null"; + int toCopy = (int)Math.min(this.buffer.length - this.position, len); + assert b != null: "Passed buffer is null"; + System.arraycopy(b, off, this.buffer, (int)this.position, toCopy); + this.position += toCopy; + // TODO verify these are +1 -1 right + if (toCopy < len) { + tailRecord(b, off + toCopy, len - toCopy); + } + } + } + + public void close() throws IOException { + if(messageBodyBeginMark<0) { + // if unset, consider 0 posn as content-start + // (so that a -1 never survives to replay step) + messageBodyBeginMark = 0; + } + if (this.out != null) { + this.out.close(); + this.out = null; + } + closeRecorder(); + } + + protected synchronized void closeDiskStream() + throws IOException { + if (this.diskStream != null) { + this.diskStream.close(); + this.diskStream = null; + } + } + + public void closeRecorder() throws IOException { + recording = false; + closeDiskStream(); // if any + // This setting of size is important. Its passed to ReplayInputStream + // on creation. It uses it to know EOS. + if (this.size == 0) { + this.size = this.position; + } + } + + /* (non-Javadoc) + * @see java.io.OutputStream#flush() + */ + public void flush() throws IOException { + if (this.out != null) { + this.out.flush(); + } + if (this.diskStream != null) { + this.diskStream.flush(); + } + } + + public ReplayInputStream getReplayInputStream() throws IOException { + return getReplayInputStream(0); + } + + public ReplayInputStream getReplayInputStream(long skip) throws IOException { + // If this method is being called, then assumption must be that the + // stream is closed. If it ain't, then the stream gotten won't work + // -- the size will zero so any attempt at a read will get back EOF. 
+ assert this.out == null: "Stream is still open."; + ReplayInputStream replay = new ReplayInputStream(this.buffer, + this.size, this.messageBodyBeginMark, this.backingFilename); + replay.skip(skip); + return replay; + } + + /** + * Return a replay stream, cued up to begining of content + * + * @throws IOException + * @return An RIS. + */ + public ReplayInputStream getMessageBodyReplayInputStream() throws IOException { + return getReplayInputStream(this.messageBodyBeginMark); + } + + public long getSize() { + return this.size; + } + + /** + * Remember the current position as the start of the "message + * body". Useful when recording HTTP traffic as a way to start + * replays after the headers. + */ + public void markMessageBodyBegin() { + this.messageBodyBeginMark = this.position; + startDigest(); + } + + /** + * Return stored message-body-begin-mark (which is also end-of-headers) + */ + public long getMessageBodyBegin() { + return this.messageBodyBeginMark; + } + + /** + * Starts digesting recorded data, if a MessageDigest has been + * set. + */ + public void startDigest() { + if (this.digest != null) { + this.digest.reset(); + this.shouldDigest = true; + } + } + + /** + * Convenience method for setting SHA1 digest. + * @see #setDigest(String) + */ + public void setSha1Digest() { + setDigest(SHA1); + } + + + /** + * Sets a digest function which may be applied to recorded data. + * The difference between calling this method and {@link #setDigest(MessageDigest)} + * is that this method tries to reuse MethodDigest instance if already allocated + * and of appropriate algorithm. + * @param algorithm Message digest algorithm to use. + * @see #setDigest(MessageDigest) + */ + public void setDigest(String algorithm) { + try { + // Reuse extant digest if its sha1 algorithm. 
+ if (this.digest == null || + !this.digest.getAlgorithm().equals(algorithm)) { + setDigest(MessageDigest.getInstance(algorithm)); + } + } catch (NoSuchAlgorithmException e) { + e.printStackTrace(); + } + } + + /** + * Sets a digest function which may be applied to recorded data. + * + * As usually only a subset of the recorded data should + * be fed to the digest, you must also call startDigest() + * to begin digesting. + * + * @param md Message digest function to use. + */ + public void setDigest(MessageDigest md) { + this.digest = md; + } + + /** + * Return the digest value for any recorded, digested data. Call + * only after all data has been recorded; otherwise, the running + * digest state is ruined. + * + * @return the digest final value + */ + public byte[] getDigestValue() { + if(this.digest == null) { + return null; + } + return this.digest.digest(); + } + + public long getResponseContentLength() { + return this.size - this.messageBodyBeginMark; + } + + /** + * @return True if this ROS is open. + */ + public boolean isOpen() { + return this.out != null; + } + + public int getBufferLength() { + return this.buffer.length; + } + + /** + * When used alongside a mark-supporting RecordingInputStream, remember + * a position reachable by a future reset(). + */ + public void mark() { + // remember this position for subsequent reset() + this.markPosition = position; + } + + /** + * When used alongside a mark-supporting RecordingInputStream, reset + * the position to that saved by previous mark(). Until the position + * again reached "new" material, none of the bytes pushed to this + * stream will be digested or recorded. + */ + public void reset() { + // take note of furthest-position-reached to avoid double-recording + maxPosition = Math.max(maxPosition, position); + // reset to previous position + position = markPosition; + } + + /** + * Set limits on length, time, and rate to enforce. 
+ * + * @param length + * @param milliseconds + * @param rateKBps + */ + public void setLimits(long length, long milliseconds, long rateKBps) { + maxLength = (length>0) ? length : Long.MAX_VALUE; + timeoutMs = (milliseconds>0) ? milliseconds : Long.MAX_VALUE; + maxRateBytesPerMs = (rateKBps>0) ? rateKBps*1024/1000 : Long.MAX_VALUE; + } + + /** + * Reset limits to effectively-unlimited defaults + */ + public void resetLimits() { + maxLength = Long.MAX_VALUE; + timeoutMs = Long.MAX_VALUE; + maxRateBytesPerMs = Long.MAX_VALUE; + } + + /** + * Return number of bytes that could be recorded without hitting + * length limit + * + * @return long byte count + */ + public long getRemainingLength() { + return maxLength - position; + } + + public void clearForReuse() throws IOException { + this.out = null; + this.position = 0; + this.markPosition = 0; + this.maxPosition = 0; + this.size = 0; + this.messageBodyBeginMark = -1; + // ensure recording turned on + this.recording = true; + // Always begins false; must use startDigest() to begin + this.shouldDigest = false; + if (this.diskStream != null) { + closeDiskStream(); + } + } +} + diff --git a/src/main/java/org/archive/io/RecoverableIOException.java b/src/main/java/org/archive/io/RecoverableIOException.java new file mode 100644 index 00000000..5ce2251a --- /dev/null +++ b/src/main/java/org/archive/io/RecoverableIOException.java @@ -0,0 +1,83 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.IOException; +import java.io.PrintStream; +import java.io.PrintWriter; + +/** + * A decorator on IOException for IOEs that are likely not fatal or at least + * merit retry. + * @author stack + * @version $Date$, $Revision$ + */ +public class RecoverableIOException extends IOException { + private static final long serialVersionUID = 6194776587381865451L; + private final IOException decoratedIOException; + + public RecoverableIOException(final String message) { + this(new IOException(message)); + } + + public RecoverableIOException(final IOException ioe) { + super(); + this.decoratedIOException = ioe; + } + + public Throwable getCause() { + return this.decoratedIOException.getCause(); + } + + public String getLocalizedMessage() { + return this.decoratedIOException.getLocalizedMessage(); + } + + public String getMessage() { + return this.decoratedIOException.getMessage(); + } + + public StackTraceElement[] getStackTrace() { + return this.decoratedIOException.getStackTrace(); + } + + public synchronized Throwable initCause(Throwable cause) { + return this.decoratedIOException.initCause(cause); + } + + public void printStackTrace() { + this.decoratedIOException.printStackTrace(); + } + + public void printStackTrace(PrintStream s) { + this.decoratedIOException.printStackTrace(s); + } + + public void printStackTrace(PrintWriter s) { + this.decoratedIOException.printStackTrace(s); + } + + public void setStackTrace(StackTraceElement[] stackTrace) { + this.decoratedIOException.setStackTrace(stackTrace); + } + + 
public String toString() { + return this.decoratedIOException.toString(); + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/RecyclingFastBufferedOutputStream.java b/src/main/java/org/archive/io/RecyclingFastBufferedOutputStream.java new file mode 100644 index 00000000..a3b76e46 --- /dev/null +++ b/src/main/java/org/archive/io/RecyclingFastBufferedOutputStream.java @@ -0,0 +1,37 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import it.unimi.dsi.fastutil.io.FastBufferedOutputStream; + +import java.io.OutputStream; + +/** + * FastBufferedOutputStream that accepts a passed-in buffer (avoiding + * reallocation). + */ +public class RecyclingFastBufferedOutputStream extends FastBufferedOutputStream { + public RecyclingFastBufferedOutputStream( final OutputStream os, final byte[] buffer ) { + super(os); + this.buffer = buffer; + avail = buffer.length; + } +} + + diff --git a/src/main/java/org/archive/io/ReplayCharSequence.java b/src/main/java/org/archive/io/ReplayCharSequence.java new file mode 100644 index 00000000..aa9b9587 --- /dev/null +++ b/src/main/java/org/archive/io/ReplayCharSequence.java @@ -0,0 +1,77 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). 
+ * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; + +import com.google.common.base.Charsets; + + +/** + * CharSequence interface with addition of a {@link #close()} method. + * + * Users of implementations of this interface must call {@link #close()} so + * implementations get a chance at cleaning up after themselves. + * + * @author stack + * @version $Revision$, $Date$ + */ +public interface ReplayCharSequence extends CharSequence, Closeable { + + /** charset to use in replay when declared value + * is absent/illegal/unavailable */ + public Charset FALLBACK_CHARSET = Charsets.ISO_8859_1; // TODO: should this be UTF-8? + + /** + * Call this method when done so implementation has chance to clean up + * resources. + * + * @throws IOException Problem cleaning up file system resources. + */ + public void close() throws IOException; + + /** + * Report count of decoder errors silently eaten during ReplayCharSequence + * use. May be less than the number of individual decoding anomalies in + * underlying content (if decoding method doesn't allow counting individual + * errors). 
+ */ + public long getDecodeExceptionCount(); + + /** + * Return the first coding-exception encountered, if the count > 0. + * @return CharacterCodingException + */ + public CharacterCodingException getCodingException(); + + /** + * @return false if {@link #close()} has been called + */ + public boolean isOpen(); + + /** + * Return the effective Charset used to create this CharSequence from + * (raw byte) source material. + */ + public Charset getCharset(); +} diff --git a/src/main/java/org/archive/io/ReplayInputStream.java b/src/main/java/org/archive/io/ReplayInputStream.java new file mode 100644 index 00000000..fccf5fd3 --- /dev/null +++ b/src/main/java/org/archive/io/ReplayInputStream.java @@ -0,0 +1,325 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +import org.apache.commons.io.IOUtils; +import org.archive.util.ArchiveUtils; +import org.archive.util.FileUtils; + + +/** + * Replays the bytes recorded from a RecordingInputStream or + * RecordingOutputStream. + * + * This InputStream supports mark and reset. 
+ * + * @author gojomo + */ +public class ReplayInputStream extends SeekInputStream +{ + private static final int DEFAULT_BUFFER_SIZE = 256*1024; // 256KiB + private BufferedSeekInputStream diskStream; + private byte[] buffer; + private long position; + + /** + * Total size of stream content. + * + * Size of data to replay. + */ + private long size = -1; + + /** + * Where the response body starts, if marked + */ + protected long responseBodyStart = -1; + + + /** + * Constructor. + * + * @param buffer Buffer to read from. + * @param size Size of data to replay. + * @param responseBodyStart Start of the response body. + * @param backingFilename Backing file that sits behind the buffer. If + * size > than buffer then we go to backing file to read + * data that is beyond buffer.length. + * + * @throws IOException If we fail to open an input stream on + * backing file. + */ + public ReplayInputStream(byte[] buffer, long size, long responseBodyStart, + String backingFilename) + throws IOException + { + this(buffer, size, backingFilename); + this.responseBodyStart = responseBodyStart; + } + + /** + * Constructor. + * + * @param buffer Buffer to read from. + * @param size Size of data to replay. + * @param backingFilename Backing file that sits behind the buffer. If + * size > than buffer then we go to backing file to read + * data that is beyond buffer.length. + * @throws IOException If we fail to open an input stream on + * backing file. 
+ */ + public ReplayInputStream(byte[] buffer, long size, String backingFilename) + throws IOException + { + this.buffer = buffer; + this.size = size; + if (size > buffer.length) { + setupDiskStream(new File(backingFilename)); + } + } + + protected void setupDiskStream(File backingFile) throws IOException { + RandomAccessInputStream rais = new RandomAccessInputStream(backingFile); + diskStream = new BufferedSeekInputStream(rais, 4096); + } + + protected File backingFile; + + /** + * Create a ReplayInputStream from the given source stream. Requires + * reading the entire stream (and possibly overflowing to a temporary + * file). Primary reason for doing so would be to have a repositionable + * version of the original stream's contents. + * + * If created via this constructor, use the destroy() method to ensure + * prompt deletion of any associated tmp file when done. + * + * @param fillStream + * @throws IOException + */ + public ReplayInputStream(InputStream fillStream) throws IOException { + this.buffer = new byte[DEFAULT_BUFFER_SIZE]; + long count = ArchiveUtils.readFully(fillStream, buffer); + if(fillStream.available()>0) { + this.backingFile = File.createTempFile("tid"+Thread.currentThread().getId(), "ris"); + count += FileUtils.readFullyToFile(fillStream, backingFile); + setupDiskStream(backingFile); + } + this.size = count; + } + + /** + * Close & destroy any internally-generated temporary files. + */ + public void destroy() { + IOUtils.closeQuietly(this); + if(backingFile!=null) { + FileUtils.deleteSoonerOrLater(backingFile); + } + } + + public long setToResponseBodyStart() throws IOException { + position(responseBodyStart); + return this.position; + } + + + /* (non-Javadoc) + * @see java.io.InputStream#read() + */ + public int read() throws IOException { + if (position == size) { + return -1; // EOF + } + if (position < buffer.length) { + // Convert to unsigned int. 
+ int c = buffer[(int) position] & 0xFF; + position++; + return c; + } + int c = diskStream.read(); + if (c >= 0) { + position++; + } + return c; + } + + /* + * (non-Javadoc) + * + * @see java.io.InputStream#read(byte[], int, int) + */ + public int read(byte[] b, int off, int len) throws IOException { + if (position == size) { + return -1; // EOF + } + if (position < buffer.length) { + int toCopy = (int)Math.min(size - position, + Math.min(len, buffer.length - position)); + System.arraycopy(buffer, (int)position, b, off, toCopy); + if (toCopy > 0) { + position += toCopy; + } + return toCopy; + } + // into disk zone + int read = diskStream.read(b,off,len); + if(read>0) { + position += read; + } + return read; + } + + public void readFullyTo(OutputStream os) throws IOException { + byte[] buf = new byte[4096]; + int c = read(buf); + while (c != -1) { + os.write(buf,0,c); + c = read(buf); + } + } + + /* + * Like 'readFullyTo', but only reads the header-part. + * Starts from the beginning each time it is called. + */ + public void readHeaderTo(OutputStream os) throws IOException { + position = 0; + byte[] buf = new byte[(int)responseBodyStart]; + int c = read(buf,0,buf.length); + if(c != -1) { + os.write(buf,0,c); + } + } + + /* + * Like 'readFullyTo', but only reads the content-part. + */ + public void readContentTo(OutputStream os) throws IOException { + setToResponseBodyStart(); + byte[] buf = new byte[4096]; + int c = read(buf); + while (c != -1) { + os.write(buf,0,c); + c = read(buf); + } + } + + /** + * Convenience method to copy content out to target stream. 
+ * @param os stream to write content to + * @param maxSize maximum count of bytes to copy; nothing is copied when + * zero or negative + * @throws IOException if reading the recorded content or writing to os fails + */ + public void readContentTo(OutputStream os, long maxSize) throws IOException { + setToResponseBodyStart(); + byte[] buf = new byte[4096]; + long left = maxSize; + while (left > 0) { + // never request more than the remaining allowance, so the + // copy cannot overshoot maxSize + int c = read(buf, 0, (int) Math.min(buf.length, left)); + if (c == -1) { + break; // end of recorded content + } + os.write(buf,0,c); + left -= c; + } + } + + /* (non-Javadoc) + * @see java.io.InputStream#close() + */ + public void close() throws IOException { + super.close(); + if(diskStream != null) { + diskStream.close(); + } + } + + /** + * Total size of stream content. + * @return Returns the size. + */ + public long getSize() + { + return size; + } + + /** + * Total size of header. + * @return the size of the header. + */ + public long getHeaderSize() + { + return responseBodyStart; + } + + /** + * Total size of content. + * @return the size of the content. + */ + public long getContentSize() + { + return size - responseBodyStart; + } + + /** + * @return Amount THEORETICALLY remaining (TODO: It's not theoretical + * seemingly. The class implementation depends on it being exact). + */ + public long remaining() { + return size - position; + } + + + /** + * Reposition the stream. 
+ * + * @param p the new position for this stream + * @throws IOException if an IO error occurs + */ + public void position(long p) throws IOException { + if (p < 0) { + throw new IOException("Negative seek offset."); + } + if (p > size) { + throw new IOException("Desired position exceeds size."); + } + if (p < buffer.length) { + // Only seek file if necessary + if (position > buffer.length) { + diskStream.position(0); + } + } else { + diskStream.position(p - buffer.length); + } + this.position = p; + } + + + public long position() throws IOException { + return position; + } + + protected byte[] getBuffer() { + return buffer; + } +} diff --git a/src/main/java/org/archive/io/RepositionableInputStream.java b/src/main/java/org/archive/io/RepositionableInputStream.java new file mode 100644 index 00000000..6f885130 --- /dev/null +++ b/src/main/java/org/archive/io/RepositionableInputStream.java @@ -0,0 +1,133 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import it.unimi.dsi.fastutil.io.RepositionableStream; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; + +/** + * Wrapper around an {@link InputStream} to make a primitive Repositionable + * stream. Uses a {@link BufferedInputStream}. 
Calls mark on every read so + * we'll remember at least the last thing read (You can only backup on the + * last thing read -- not last 2 or 3 things read). Used by + * {@link GzippedInputStream} when reading streams over a network. Wraps a + * HTTP, etc., stream so we can back it up if needs be after the + * GZIP inflater has done a fill of its full buffer though it only needed + * the first few bytes to finish decompressing the current GZIP member. + * + *

TODO: More robust implementation. Tried to use the it.unimi.dsi.io + * FastBufferdInputStream but relies on FileChannel ByteBuffers and if not + * present -- as would be the case reading from a network stream, the main + * application for this instance -- then it expects the underlying stream + * implements RepositionableStream interface so chicken or egg problem. + * @author stack + */ +public class RepositionableInputStream extends BufferedInputStream implements + RepositionableStream { + private long position = 0; + private long markPosition = -1; + + public RepositionableInputStream(InputStream in) { + super(in); + } + + public RepositionableInputStream(InputStream in, int size) { + super(in, size); + } + + public int read(byte[] b) throws IOException { + int read = super.read(b); + if (read != -1) { + position += read; + } + return read; + } + + public synchronized int read(byte[] b, int offset, int ct) + throws IOException { + // Mark the underlying stream so that we'll remember what we are about + // to read unless a mark has been set in this RepositionableStream + // (We have two levels of mark). In this latter case we want the + // underlying stream to preserve its mark position so aligns with + // this RS when eset is called. + if (!isMarked()) { + super.mark((ct > offset)? ct - offset: ct); + } + int read = super.read(b, offset, ct); + if (read != -1) { + position += read; + } + return read; + } + + public int read() throws IOException { + // Mark the underlying stream so that we'll remember what we are about + // to read unless a mark has been set in this RepositionableStream + // (We have two levels of mark). In this latter case we want the + // underlying stream to preserve its mark position so aligns with + // this RS when eset is called. 
+ if (!isMarked()) { + super.mark(1); + } + int c = super.read(); + if (c != -1) { + position++; + } + return c; + } + + public void position(final long offset) { + if (this.position == offset) { + return; + } + int diff = (int)(offset - this.position); + long lowerBound = this.position - this.pos; + long upperBound = lowerBound + this.count; + if (offset < lowerBound || offset >= upperBound) { + throw new IllegalAccessError("Offset goes outside " + + "current this.buf (TODO: Do buffer fills if positive)"); + } + this.position = offset; + this.pos += diff; + // Clear any mark. + this.markPosition = -1; + } + + public void mark(int readlimit) { + this.markPosition = this.position; + super.mark(readlimit); + } + + public void reset() throws IOException { + super.reset(); + this.position = this.markPosition; + this.markPosition = -1; + } + + protected boolean isMarked() { + return this.markPosition != -1; + } + + public long position() { + return this.position; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/SafeSeekInputStream.java b/src/main/java/org/archive/io/SafeSeekInputStream.java new file mode 100644 index 00000000..0d8f83b1 --- /dev/null +++ b/src/main/java/org/archive/io/SafeSeekInputStream.java @@ -0,0 +1,124 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +import java.io.IOException; + + +/** + * Enables multiple concurrent streams based on the same underlying stream. + * + * @author pjack + */ +public class SafeSeekInputStream extends SeekInputStream { + + + /** + * The underlying stream. + */ + private SeekInputStream input; + + + /** + * The expected position of the underlying stream. + */ + private long expected; + + + /** + * Constructor. The given stream will be positioned to 0 so that an + * accurate position can be tracked. + * + * @param input the underlying input stream + * @throws IOException if an IO error occurs + */ + public SafeSeekInputStream(SeekInputStream input) throws IOException { + this.input = input; + this.expected = input.position(); + } + + + /** + * Ensures that the underlying stream's position is what we expect to be. + * + * @throws IOException if an IO error occurs + */ + private void ensure() throws IOException { + if (expected != input.position()) { + input.position(expected); + } + } + + + @Override + public int read() throws IOException { + ensure(); + int c = input.read(); + if (c >= 0) { + expected++; + } + return c; + } + + + @Override + public int read(byte[] buf, int ofs, int len) throws IOException { + ensure(); + int r = input.read(buf, ofs, len); + if (r > 0) { + expected += r; + } + return r; + } + + + @Override + public int read(byte[] buf) throws IOException { + ensure(); + int r = input.read(buf); + if (r > 0) { + expected += r; + } + return r; + } + + + @Override + public long skip(long c) throws IOException { + ensure(); + long r = input.skip(c); + if (r > 0) { + expected += r; + } + return r; + } + + + public void position(long p) throws IOException { + input.position(p); + expected = p; + } + + + public long position() throws IOException { + return expected; + } + +} diff --git a/src/main/java/org/archive/io/SeekInputStream.java 
b/src/main/java/org/archive/io/SeekInputStream.java new file mode 100644 index 00000000..177724ec --- /dev/null +++ b/src/main/java/org/archive/io/SeekInputStream.java @@ -0,0 +1,81 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +import it.unimi.dsi.fastutil.io.RepositionableStream; + +import java.io.IOException; +import java.io.InputStream; + + +/** + * Base class for repositionable input streams. + * + * @author pjack + */ +public abstract class SeekInputStream extends InputStream +implements RepositionableStream { + + + /** + * The marked file position. A value less than zero + * indicates that no mark has been set. + */ + private long mark = -1; + + + /** + * Marks the current position of the stream. The limit parameter is + * ignored; the mark will remain valid until reset is called or the + * stream is closed. + * + * @param limit ignored + */ + public void mark(int limit) { + try { + this.mark = position(); + } catch (IOException e) { + mark = -1; + } + } + + + /** + * Resets this stream to its marked position. 
+ * + * @throws IOException if there is no mark, or if an IO error occurs + */ + public void reset() throws IOException { + if (mark < 0) { + throw new IOException("No mark."); + } + position(mark); + } + + + /** + * Returns true, since SeekInputStreams support mark/reset by default. + * + * @return true + */ + public boolean markSupported() { + return true; + } +} diff --git a/src/main/java/org/archive/io/SeekReader.java b/src/main/java/org/archive/io/SeekReader.java new file mode 100644 index 00000000..4abf7847 --- /dev/null +++ b/src/main/java/org/archive/io/SeekReader.java @@ -0,0 +1,84 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +import java.io.IOException; +import java.io.Reader; + +import it.unimi.dsi.fastutil.io.RepositionableStream; + + +/** + * Base class for repositionable readers. + * + * @author pjack + */ +public abstract class SeekReader extends Reader +implements RepositionableStream { + + + /** + * The marked file position. A value less than zero + * indicates that no mark has been set. + */ + private long mark = -1; + + + /** + * Marks the current position of the stream. The limit parameter is + * ignored; the mark will remain valid until reset is called or the + * stream is closed. 
+ * + * @param limit ignored + */ + @Override + public void mark(int limit) { + try { + this.mark = position(); + } catch (IOException e) { + mark = -1; + } + } + + + /** + * Resets this stream to its marked position. + * + * @throws IOException if there is no mark, or if an IO error occurs + */ + @Override + public void reset() throws IOException { + if (mark < 0) { + throw new IOException("No mark."); + } + position(mark); + } + + + /** + * Returns true, since SeekInputStreams support mark/reset by default. + * + * @return true + */ + @Override + public boolean markSupported() { + return true; + } +} diff --git a/src/main/java/org/archive/io/SeekReaderCharSequence.java b/src/main/java/org/archive/io/SeekReaderCharSequence.java new file mode 100644 index 00000000..a9b4880f --- /dev/null +++ b/src/main/java/org/archive/io/SeekReaderCharSequence.java @@ -0,0 +1,56 @@ +package org.archive.io; + +import java.io.IOException; + +public class SeekReaderCharSequence implements CharSequence { + + + final private SeekReader reader; + final private int size; + + + public SeekReaderCharSequence(SeekReader reader, int size) { + this.reader = reader; + this.size = size; + } + + + public int length() { + return size; + } + + + public char charAt(int index) { + if ((index < 0) || (index >= length())) { + throw new IndexOutOfBoundsException(Integer.toString(index)); + } + try { + reader.position(index); + int r = reader.read(); + if (r < 0) { + throw new IllegalStateException("EOF"); + } + return (char)reader.read(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + + public CharSequence subSequence(int start, int end) { + return new CharSubSequence(this, start, end); + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + try { + reader.position(0); + for (int ch = reader.read(); ch >= 0; ch = reader.read()) { + sb.append((char)ch); + } + return sb.toString(); + } catch (IOException e) { + throw new IllegalStateException(e); + } + } 
+} diff --git a/src/main/java/org/archive/io/SinkHandlerLogThread.java b/src/main/java/org/archive/io/SinkHandlerLogThread.java new file mode 100644 index 00000000..0070785e --- /dev/null +++ b/src/main/java/org/archive/io/SinkHandlerLogThread.java @@ -0,0 +1,34 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + + +/** + * Implemented by threads that provide extra information. + * + * TODO: rename class, rename getCurrentProcessorName() + */ +public interface SinkHandlerLogThread { + + String getName(); + String getCurrentProcessorName(); + int getSerialNumber(); + +} diff --git a/src/main/java/org/archive/io/UTF8Bytes.java b/src/main/java/org/archive/io/UTF8Bytes.java new file mode 100644 index 00000000..c280b08d --- /dev/null +++ b/src/main/java/org/archive/io/UTF8Bytes.java @@ -0,0 +1,37 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.UnsupportedEncodingException; + +/** + * Marker Interface for instances that can be serialized as UTF8 bytes. + * TODO: Do we need a UTF8Stream Marker Interface? + * @author stack + * @version $Date$ $Version$ + */ +public interface UTF8Bytes { + public static final String UTF8 = "UTF-8"; + + /** + * @return Instance as UTF-8 bytes. + * @throws UnsupportedEncodingException + */ + public byte [] getUTF8Bytes() throws UnsupportedEncodingException; +} diff --git a/src/main/java/org/archive/io/WriterPool.java b/src/main/java/org/archive/io/WriterPool.java new file mode 100644 index 00000000..2dc385a1 --- /dev/null +++ b/src/main/java/org/archive/io/WriterPool.java @@ -0,0 +1,343 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.io; + +import java.io.File; +import java.io.IOException; +import java.util.Collection; +import java.util.LinkedList; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; + +/** + * Pool of Writers. + * + * Abstract. Override and pass in the Constructor a factory that creates + * {@link WriterPoolMember} implementations. + * + * @author stack + */ +public abstract class WriterPool { + private final Logger logger = Logger.getLogger(this.getClass().getName()); + + /** + * Used to generate unique filename sequences. + */ + final protected AtomicInteger serialNo; + + /** + * Default maximum active number of files in the pool. + */ + public static final int DEFAULT_MAX_ACTIVE = 1; + + /** Assumed largest possible value of maxActive; pool will have this + * maximum capacity, so dynamic changes beyond this number won't work. */ + protected static final int LARGEST_MAX_ACTIVE = 255; + + /** + * Maximum time to wait on a free file before considering + * making a new one (if not already at max) + */ + public static final int DEFAULT_MAX_WAIT_FOR_IDLE = 500; + + /** + * File settings. + * Keep in data structure rather than as individual values. 
+ */ + protected final WriterPoolSettings settings; + + /** maximum number of writers to create at a time*/ + protected int maxActive; + /** maximum ms to wait before considering creation of a writer */ + protected int maxWait; + /** current count of active writers; only read/mutated in synchronized blocks */ + protected int currentActive = 0; + /** round-robin queue of available writers */ + protected BlockingQueue availableWriters; + + /** system time when writer was last wanted (because one was not ready in time) */ + protected long lastWriterNeededTime; + /** system time when writer was last 'rolled over' (imminent creation of new file) */ + protected long lastWriterRolloverTime; + + /** + * Constructor + * @param serial Used to generate unique filename sequences + * @param factory Factory that knows how to make a {@link WriterPoolMember}. + * @param settings Settings for this pool. + * @param poolMaximumActive + * @param poolMaximumWait + */ + public WriterPool(final AtomicInteger serial, + final WriterPoolSettings settings, + final int poolMaximumActive, final int poolMaximumWait) { + logger.info("Initial configuration:" + + " prefix=" + settings.getPrefix() + + ", template=" + settings.getTemplate() + + ", compress=" + settings.getCompress() + + ", maxSize=" + settings.getMaxFileSizeBytes() + + ", maxActive=" + poolMaximumActive + + ", maxWait=" + poolMaximumWait); + this.settings = settings; + this.maxActive = poolMaximumActive; + this.maxWait = poolMaximumWait; + availableWriters = new ArrayBlockingQueue(LARGEST_MAX_ACTIVE, true); + this.serialNo = serial; + } + + /** + * Check out a {@link WriterPoolMember}. + * + * This method should be followed by a call to + * {@link #returnFile(WriterPoolMember)} or + * {@link #invalidateFile(WriterPoolMember)} else pool starts leaking. 
+ * + * @return Writer checked out of a pool of files or created + * @throws IOException Problem getting Writer from pool (Converted + * from Exception to IOException so this pool can live as a good citizen + * down in depths of ARCSocketFactory). + */ + public WriterPoolMember borrowFile() + throws IOException { + WriterPoolMember writer = null; + while(writer == null) { + try { + writer = availableWriters.poll(maxWait,TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + // nothing to do but proceed + } + if(writer==null) { + writer = makeNewWriterIfAppropriate(); + } + } + return writer; + } + + /** + * Create a new writer instance, if still below maxActive count. + * Remember times to help make later decision when writer should + * be discarded. + * + * @return WriterPoolMember or null if already at max + */ + protected synchronized WriterPoolMember makeNewWriterIfAppropriate() { + long now = System.currentTimeMillis(); + lastWriterNeededTime = now; + if(currentActive < maxActive) { + currentActive++; + lastWriterRolloverTime = now; + return makeWriter(); + } + return null; + } + + /** + * @return new WriterPoolMember of appropriate type + */ + protected abstract WriterPoolMember makeWriter(); + + /** + * Discard a previously-used writer, cleanly closing it and leaving it out + * of the pool. + * @param writer + * @throws IOException + */ + public synchronized void destroyWriter(WriterPoolMember writer) throws IOException { + currentActive--; + writer.close(); + } + /** + * Return a writer, for likely reuse unless (1) writer's current file has + * reached its target size; and (2) there's been no demand for additional + * writers since the last time a new writer-file was rolled-over. In that + * case, the possibly-superfluous writer instance is discarded. + * @param writer Writer to return to the pool. + * @throws IOException Problem returning File to pool. 
+ */ + public void returnFile(WriterPoolMember writer) + throws IOException { + synchronized(this) { + if(writer.isOversize()) { + // maybe retire writer rather than recycle + if(lastWriterNeededTime<=lastWriterRolloverTime) { + // no timeouts waiting for recycled writer since last writer rollover + destroyWriter(writer); + return; + } else { + // reuse writer instance, causing new file to be created + lastWriterRolloverTime = System.currentTimeMillis(); + } + } + } + if(!availableWriters.offer(writer)) { + logger.log(Level.WARNING, "writer unreturnable to available pool; closing early"); + destroyWriter(writer); + } + } + + /** + * Close and discard a writer that experienced a potentially-corrupting + * error. + * @param f writer with problem + * @throws IOException + */ + public synchronized void invalidateFile(WriterPoolMember f) + throws IOException { + try { + destroyWriter(f); + } catch (Exception e) { + // Convert exception. + throw new IOException(e.getMessage()); + } + // It'll have been closed. Rename with an '.invalid' suffix so it + // gets attention. + File file = f.getFile(); + file.renameTo(new File(file.getAbsoluteFile() + + WriterPoolMember.INVALID_SUFFIX)); + } + + /** + * @return Number of {@link WriterPoolMember}s checked out of pool. + * @throws java.lang.UnsupportedOperationException + */ + public synchronized int getNumActive() + throws UnsupportedOperationException { + return currentActive - getNumIdle(); + } + + /** + * @return Number of {@link WriterPoolMember} instances still in the pool. + * @throws java.lang.UnsupportedOperationException + */ + public int getNumIdle() + throws UnsupportedOperationException { + return availableWriters.size(); + } + + /** + * Close all {@link WriterPoolMember}s in pool. 
+ */ + public void close() { + Collection writers = drainAllWriters(); + for (WriterPoolMember writer: writers) { + try { + destroyWriter(writer); + } catch (IOException e) { + logger.log(Level.WARNING,"problem closing writer",e); + } + } + } + + /** + * @return Returns settings. + */ + public WriterPoolSettings getSettings() { + return this.settings; + } + + /** + * @return State of the pool string + */ + protected String getPoolState() { + StringBuffer buffer = new StringBuffer("Active "); + buffer.append(getNumActive()); + buffer.append(" of max "); + buffer.append(maxActive); + buffer.append(", idle "); + buffer.append(getNumIdle()); + return buffer.toString(); + } + + /** + * Returns the atomic integer used to generate serial numbers + * for files. + * + * @return the serial number generator + */ + public AtomicInteger getSerialNo() { + return serialNo; + } + + /** + * Drains all the active writers from {@link #availableWriters}, blocking to + * wait for any writers currently in use to become available. + * + *

+ * When finished with writers, call availableWriters.addAll(...) to put them + * back into the rotation. + * + * @return all the active writers + */ + protected synchronized Collection drainAllWriters() { + LinkedList writers = new LinkedList(); + availableWriters.drainTo(writers); + + while (writers.size() < currentActive) { + try { + WriterPoolMember w = availableWriters.take(); + writers.add(w); + } catch (InterruptedException e) { + logger.severe("caught " + e + " while waiting for writers to free up; returning only " + + writers.size() + " of " + currentActive + " active writers"); + break; + } + } + + return writers; + } + + public void flush() { + Collection writers = drainAllWriters(); + + for (WriterPoolMember writer: writers) { + try { + writer.flush(); + } catch (IOException e) { + logger.log(Level.WARNING, "problem flushing writer " + writer, e); + } + } + + availableWriters.addAll(writers); + } + + public JSONArray jsonStatus() throws JSONException { + Collection writers = drainAllWriters(); + + JSONArray ja = new JSONArray(); + for (WriterPoolMember w: writers) { + JSONObject jo = new JSONObject(); + jo.put("file", w.getFile()); + jo.put("position", w.getPosition()); + ja.put(jo); + } + + availableWriters.addAll(writers); + + return ja; + } +} diff --git a/src/main/java/org/archive/io/WriterPoolMember.java b/src/main/java/org/archive/io/WriterPoolMember.java new file mode 100644 index 00000000..6ea6b295 --- /dev/null +++ b/src/main/java/org/archive/io/WriterPoolMember.java @@ -0,0 +1,487 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.Iterator; +import java.util.List; +import java.util.Properties; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.logging.Logger; +import java.util.zip.GZIPOutputStream; + +import org.archive.util.ArchiveUtils; +import org.archive.util.FileUtils; +import org.archive.util.PropertyUtils; + + + +/** + * Member of {@link WriterPool}. + * Implements rotating off files, file naming with some guarantee of + * uniqueness, and position in file. Subclass to pick up functionality for a + * particular Writer type. + * @author stack + * @version $Date$ $Revision$ + */ +public abstract class WriterPoolMember implements ArchiveFileConstants { + private final Logger logger = Logger.getLogger(this.getClass().getName()); + + public static final String UTF8 = "UTF-8"; + + /** + * Default archival-aggregate filename template. + * + * Under usual assumptions -- hostnames aren't shared among crawling hosts; + * processes have unique PIDs and admin ports; timestamps inside one process + * don't repeat (see UniqueTimestampService); clocks are generally + * accurate -- will generate a unique name. + * + * Stands for Internet Archive Heritrix. 
+ */ + public static final String DEFAULT_TEMPLATE = + "${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}"; + + /** + * Default for file prefix. + */ + public static final String DEFAULT_PREFIX = "WEB"; + + /** + * Reference to file we're currently writing. + */ + protected File f = null; + + /** Output stream for file. */ + protected OutputStream out = null; + /** Counting stream for metering */ + protected MiserOutputStream countOut = null; + + /** reusable buffer for recycling scenarios */ + protected byte[] rebuf; + + protected WriterPoolSettings settings; + private final String extension; + + /** + * Creation date for the current file. + * Set by {@link #createFile()}. + */ + protected String currentTimestamp = "UNSET!!!"; + + protected String currentBasename; + + /** + * A running sequence used making unique file names. + */ + final private AtomicInteger serialNo; + + /** + * Directories round-robin index. + */ + protected static int roundRobinIndex = 0; + + /** + * NumberFormat instance for formatting serial number. + * + * Pads serial number with zeros. + */ + protected static NumberFormat serialNoFormatter = new DecimalFormat("00000"); + + + /** + * Buffer to reuse writing streams. + */ + protected final byte [] scratchbuffer = new byte[4 * 1024]; + + + /** + * Constructor. + * Takes a stream. Use with caution. There is no upperbound check on size. + * Will just keep writing. + * + * @param serialNo used to create unique filename sequences + * @param out Where to write. + * @param file File the out is connected to. + * @param cmprs Compress the content written. + * @param a14DigitDate If null, we'll write current time. + * @throws IOException + */ + protected WriterPoolMember(AtomicInteger serialNo, + final OutputStream out, final File file, + final WriterPoolSettings settings) + throws IOException { + this(serialNo, settings, null); + this.countOut = (out instanceof MiserOutputStream) + ? 
(MiserOutputStream)out + : new MiserOutputStream(out, settings.getFrequentFlushes()); + this.out = this.countOut; + this.f = file; + } + + /** + * Constructor. + * + * @param serialNo used to create unique filename sequences + * @param dirs Where to drop files. + * @param prefix File prefix to use. + * @param cmprs Compress the records written. + * @param maxSize Maximum size for ARC files written. + * @param template filenaming template to use + * @param extension Extension to give file. + */ + public WriterPoolMember(AtomicInteger serialNo, + final WriterPoolSettings settings, final String extension) { + this.settings = settings; + this.extension = extension; + this.serialNo = serialNo; + } + + /** + * Call this method just before/after any significant write. + * + * Call at the end of the writing of a record or just before we start + * writing a new record. Will close current file and open a new file + * if file size has passed out maxSize. + * + *

Creates and opens a file if none already open. One use of this method + * then is after construction, call this method to add the metadata, then + * call {@link #getPosition()} to find offset of first record. + * + * TODO: perhaps this should be called checkForNewOpen? because it also + * handles initial open, even when not rolling oversize + * + * @exception IOException + */ + public void checkSize() throws IOException { + if (this.out == null || isOversize()) { + createFile(); + } + } + + /** Check if underlying file has already reached its target size. + * @return boolean true if file has reached target size and due to be closed + */ + public boolean isOversize() { + return settings.getMaxFileSizeBytes() != -1 && (this.getPosition() > settings.getMaxFileSizeBytes()); + } + + /** + * Create a new file. + * Rotates off the current Writer and creates a new in its place + * to take subsequent writes. Usually called from {@link #checkSize()}. + * @return Name of file created. + * @throws IOException + */ + protected String createFile() throws IOException { + generateNewBasename(); + String name = currentBasename + '.' + this.extension + + ((settings.getCompress())? DOT_COMPRESSED_FILE_EXTENSION: "") + + OCCUPIED_SUFFIX; + File dir = getNextDirectory(settings.calcOutputDirs()); + return createFile(new File(dir, name)); + } + + protected String createFile(final File file) throws IOException { + close(); + this.f = file; + FileOutputStream fos = new FileOutputStream(this.f); + if(rebuf==null) { + rebuf = new byte[settings.getWriteBufferSize()]; + } + this.countOut = new MiserOutputStream(new RecyclingFastBufferedOutputStream(fos,rebuf),settings.getFrequentFlushes()); + this.out = this.countOut; + logger.fine("Opened " + this.f.getAbsolutePath()); + return this.f.getName(); + } + + /** + * @param dirs List of File objects that point at directories. + * @return Find next directory to write an arc too. If more + * than one, it tries to round-robin through each in turn. 
+ * @throws IOException + */ + protected File getNextDirectory(List dirs) + throws IOException { + if (WriterPoolMember.roundRobinIndex >= dirs.size()) { + WriterPoolMember.roundRobinIndex = 0; + } + File d = null; + try { + d = checkWriteable((File)dirs. + get(WriterPoolMember.roundRobinIndex)); + } catch (IndexOutOfBoundsException e) { + // Dirs list might be altered underneath us. + // If so, we get this exception -- just keep on going. + } + if (d == null && dirs.size() > 1) { + for (Iterator i = dirs.iterator(); d == null && i.hasNext();) { + d = checkWriteable((File)i.next()); + } + } else { + WriterPoolMember.roundRobinIndex++; + } + if (d == null) { + throw new IOException("Directories unusable."); + } + return d; + } + + protected File checkWriteable(File d) { + if (d == null) { + return d; + } + + try { + FileUtils.ensureWriteableDirectory(d); + } catch(IOException e) { + logger.warning("Directory " + d.getPath() + " is not" + + " writeable or cannot be created: " + e.getMessage()); + d = null; + } + return d; + } + + /** + * Generate a new basename by interpolating values in the configured + * template. Values come from local state, other configured values, and + * global system properties. The recommended default template will + * generate a unique basename under reasonable assumptions. 
+ */ + protected void generateNewBasename() { + Properties localProps = new Properties(); + localProps.setProperty("prefix", settings.getPrefix()); + synchronized(this.getClass()) { + // ensure that serialNo and timestamp are minted together (never inverted sort order) + String paddedSerialNumber = WriterPoolMember.serialNoFormatter.format(serialNo.getAndIncrement()); + String timestamp17 = ArchiveUtils.getUnique17DigitDate(); + String timestamp14 = ArchiveUtils.getUnique14DigitDate(); + currentTimestamp = timestamp17; + localProps.setProperty("serialno", paddedSerialNumber); + localProps.setProperty("timestamp17", timestamp17); + localProps.setProperty("timestamp14", timestamp14); + } + currentBasename = PropertyUtils.interpolateWithProperties(settings.getTemplate(), + localProps, System.getProperties()); + } + + + /** + * Get the file name + * + * @return the filename, as if uncompressed + */ + protected String getBaseFilename() { + String name = this.f.getName(); + if (settings.getCompress() && name.endsWith(DOT_COMPRESSED_FILE_EXTENSION)) { + return name.substring(0,name.length() - 3); + } else if(settings.getCompress() && + name.endsWith(DOT_COMPRESSED_FILE_EXTENSION + + OCCUPIED_SUFFIX)) { + return name.substring(0, name.length() - + (3 + OCCUPIED_SUFFIX.length())); + } else { + return name; + } + } + + /** + * Get this file. + * + * Used by junit test to test for creation and when {@link WriterPool} wants + * to invalidate a file. + * + * @return The current file. + */ + public File getFile() { + return this.f; + } + + /** + * Post write tasks. + * + * Has side effects. Will open new file if we're at the upper bound. + * If we're writing compressed files, it will wrap output stream with a + * GZIP writer with side effect that GZIP header is written out on the + * stream. 
+ * + * @exception IOException + */ + protected void preWriteRecordTasks() + throws IOException { + if (this.out == null) { + createFile(); + } + if (settings.getCompress()) { + // Wrap stream in GZIP Writer. + // The below construction immediately writes the GZIP 'default' + // header out on the underlying stream. + this.out = new CompressedStream(this.out); + } + } + + /** + * Post file write tasks. + * If compressed, finishes up compression and flushes stream so any + * subsequent checks get good reading. + * + * @exception IOException + */ + protected void postWriteRecordTasks() + throws IOException { + if (settings.getCompress()) { + CompressedStream o = (CompressedStream)this.out; + o.finish(); + o.flush(); + o.end(); + this.out = o.getWrappedStream(); + } + } + + /** + * Position in raw output (typically, physical file). + * Used making accounting of bytes written. + * @return Position in final media (assuming all flushing completes) + * @throws IOException + */ + public long getPosition() { + return (countOut==null)? 0L : this.countOut.getCount(); + } + + public boolean isCompressed() { + return settings.getCompress(); + } + + protected void write(final byte [] b) throws IOException { + this.out.write(b); + } + + protected void flush() throws IOException { + this.out.flush(); + } + + protected void write(byte[] b, int off, int len) throws IOException { + this.out.write(b, off, len); + } + + protected void write(int b) throws IOException { + this.out.write(b); + } + + /** + * Copy bytes from the provided InputStream to the target file/stream being + * written. 
+ * + * @return number of bytes written (normally equal to {@code enforceLength}) + * @param is + * InputStream to copy bytes from + * @param recordLength + * expected number of bytes to copy + * @param enforceLength + * whether to throw an exception if too many/too few bytes are + * available from stream + * @throws IOException + */ + protected long copyFrom(final InputStream is, final long recordLength, + boolean enforceLength) throws IOException { + int read = scratchbuffer.length; + long tot = 0; + while ((tot < recordLength) + && (read = is.read(scratchbuffer)) != -1) { + int write = read; + // never write more than enforced length + write = (int) Math.min(write, recordLength - tot); + tot += read; + write(scratchbuffer, 0, write); + } + if (enforceLength && tot != recordLength) { + // throw exception if desired for read vs. declared mismatches + throw new IOException("Read " + tot + " but expected " + + recordLength); + } + + return tot; + } + + public void close() throws IOException { + if (this.out == null) { + return; + } + this.out.close(); + this.out = null; + if (this.f != null && this.f.exists()) { + String path = this.f.getAbsolutePath(); + if (path.endsWith(OCCUPIED_SUFFIX)) { + File f = new File(path.substring(0, + path.length() - OCCUPIED_SUFFIX.length())); + if (f.exists() & !f.delete()) { + logger.warning("Failed delete of " + f); + } + if (!this.f.renameTo(f)) { + logger.warning("Failed rename of " + path); + } + this.f = f; + } + + logger.fine("Closed " + this.f.getAbsolutePath() + + ", size " + this.f.length()); + } + } + + protected OutputStream getOutputStream() { + return this.out; + } + + /** + * An override so we get access to underlying output stream. + * and offer an end() that does not accompany closing underlying + * stream. 
+ * @author stack + */ + private class CompressedStream extends GZIPOutputStream { + public CompressedStream(OutputStream out) + throws IOException { + super(out); + } + + /** + * @return Reference to stream being compressed. + */ + OutputStream getWrappedStream() { + return this.out; + } + + /** + * Release the deflater's native process resources, + * which otherwise would not occur until either + * finalization or DeflaterOutputStream.close() + * (which would also close underlying stream). + */ + public void end() { + def.end(); + } + } +} diff --git a/src/main/java/org/archive/io/WriterPoolSettings.java b/src/main/java/org/archive/io/WriterPoolSettings.java new file mode 100644 index 00000000..d0805cdc --- /dev/null +++ b/src/main/java/org/archive/io/WriterPoolSettings.java @@ -0,0 +1,39 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.File; +import java.util.List; + +/** + * Settings object for a {@link WriterPool}. + * Used creating {@link WriterPoolMember}s. 
+ * @author stack + * @version $Date$, $Revision$ + */ +public interface WriterPoolSettings { + public long getMaxFileSizeBytes(); + public String getPrefix(); + public String getTemplate(); + public List calcOutputDirs(); + public boolean getCompress(); + public List getMetadata(); + public boolean getFrequentFlushes(); + public int getWriteBufferSize(); +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/arc/ARC2WCDX.java b/src/main/java/org/archive/io/arc/ARC2WCDX.java new file mode 100644 index 00000000..19010131 --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARC2WCDX.java @@ -0,0 +1,243 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io.arc; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.Date; +import java.util.Iterator; +import java.util.zip.GZIPOutputStream; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HeaderGroup; +import org.apache.commons.httpclient.util.DateParseException; +import org.apache.commons.httpclient.util.DateUtil; +import org.archive.io.ArchiveRecord; +import org.archive.util.ArchiveUtils; +import org.archive.util.SURT; + +/** + * Create a 'Wide' CDX from an ARC. 
Takes one argument, the path to the ARC.
 * Writes the matching {@code .wcdx.gz} file next to it: the output path is
 * the reader identifier with a trailing {@code .arc} or {@code .arc.gz}
 * replaced by {@code .wcdx.gz}. While being written (and after a failure)
 * the file carries an extra {@code .open} suffix.
 *
 * @author gojomo
 */
public class ARC2WCDX {
    final public static String WCDX_VERSION="0.1";

    /**
     * Command-line entry point.
     *
     * @param args a single element: path or URL of the ARC to index
     * @throws IOException if the ARC cannot be opened or closed
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("Usage: java " + ARC2WCDX.class.getName()
                + " ARC_FILE");
            System.exit(1);
        }
        String arcFilename = args[0];
        createWcdx(arcFilename);
    }

    /**
     * Opens the named ARC, writes its WCDX and closes the reader again,
     * also when indexing fails.
     *
     * @param arcFilename path or URL handed to {@code ARCReaderFactory.get}
     * @return two elements: the WCDX path (String) and the record count (Long)
     * @throws IOException if the reader cannot be opened or closed
     */
    public static Object[] createWcdx(String arcFilename) throws IOException {
        ARCReader reader = ARCReaderFactory.get(arcFilename);
        try {
            return createWcdx(reader);
        } finally {
            reader.close();
        }
    }

    /**
     * Writes one legend line plus one line per record of {@code reader}.
     * On success the {@code .open} file is renamed to its final name; on any
     * IOException or RuntimeException the {@code .open} file is left behind
     * as the indicator of error and the method still returns normally.
     *
     * @param reader source of records; digesting is switched on
     * @return two elements: the WCDX path (String) and the number of record
     *         lines written (Long)
     */
    public static Object[] createWcdx(ARCReader reader) {
        reader.setDigest(true);

        String wcdxPath = reader.getReaderIdentifier().replaceAll("\\.arc(\\.gz)?$",".wcdx.gz");
        File wcdxFile = new File(wcdxPath+".open");
        PrintStream writer = null;
        long count = 0;
        try {
            writer = new PrintStream(new GZIPOutputStream(new FileOutputStream(wcdxFile)));

            // write header: legend + timestamp
            StringBuilder legend = new StringBuilder();
            appendField(legend,"CDX");
            appendField(legend,"surt-uri");
            appendField(legend,"b"); // ARC timestamp
            appendField(legend,"http-date");
            appendField(legend,"s"); // status code
            appendField(legend,"m"); // media type
            appendField(legend,"sha1"); // content sha1
            appendField(legend,"g"); // ARC name
            appendField(legend,"V"); // start offset
            appendField(legend,"end-offset"); // TODO: implement
            appendField(legend,"n"); // ARC record length TODO: verify
            appendField(legend,"http-content-length");
            appendField(legend,"http-last-modified");
            appendField(legend,"http-expires");
            appendField(legend,"http-etag");
            appendField(legend,"http-location");
            appendField(legend,"e"); // IP
            appendField(legend,"a"); // original URL
            // WCDX version+creation time: crude version control
            appendField(legend,WCDX_VERSION+"@"+ArchiveUtils.get14DigitDate());
            writer.println(legend.toString());

            Iterator iter = reader.iterator();
            while(iter.hasNext()) {
                ARCRecord record = (ARCRecord) iter.next();
                // NOTE(review): the record is closed before its fields are
                // read; presumably this completes the digest -- confirm.
                record.close();
                ARCRecordMetaData h = (ARCRecordMetaData) record.getHeader();
                Header[] httpHeaders = record.getHttpHeaders();
                if(httpHeaders==null) {
                    httpHeaders = new Header[0];
                }
                HeaderGroup hg = new HeaderGroup();
                hg.setHeaders(httpHeaders);
                StringBuilder builder = new StringBuilder();

                // SURT-form URI
                appendField(builder,SURT.fromURI(h.getUrl()));
                // record timestamp ('b')
                appendField(builder,h.getDate());
                // http header date
                appendTimeField(builder,hg.getFirstHeader("Date"));
                // response code ('s')
                appendField(builder,h.getStatusCode());
                // media type ('m')
                appendField(builder,h.getMimetype());
                // content checksum (like 'c', but here Base32 SHA1)
                appendField(builder,record.getDigestStr());
                // arc name ('g')
                appendField(builder,reader.getFileName());
                // compressed start offset ('V')
                appendField(builder,h.getOffset());

                // compressed end offset (?)
//            appendField(builder,
//                    reader.getInputStream() instanceof RepositionableStream
//                    ? ((GzippedInputStream)reader.getInputStream()).vPosition()
//                    : "-");
                // TODO; leave unavail for now
                appendField(builder, "-");

                // uncompressed (declared in ARC headerline) record length
                appendField(builder,h.getLength());
                // http header content-length
                appendField(builder,hg.getFirstHeader("Content-Length"));

                // http header mod-date
                appendTimeField(builder,hg.getFirstHeader("Last-Modified"));
                // http header expires
                appendTimeField(builder,hg.getFirstHeader("Expires"));

                // http header etag
                appendField(builder,hg.getFirstHeader("ETag"));
                // http header redirect ('Location' header?)
                appendField(builder,hg.getFirstHeader("Location"));
                // ip ('e')
                appendField(builder,h.getIp());
                // original URI
                appendField(builder,h.getUrl());
                // TODO MAYBE - a title from inside content?

                writer.println(builder.toString());
                count++;
            }
            // Finish the gzip stream before publishing the file under its
            // final name. PrintStream never throws, so ask it explicitly
            // whether any write failed.
            writer.close();
            if (writer.checkError()) {
                throw new IOException("Error writing " + wcdxFile);
            }
            // a failed rename simply leaves the '.open' file in place
            wcdxFile.renameTo(new File(wcdxPath));
        } catch (IOException e) {
            // soldier on: but leave '.open' wcdx file as indicator of error
            ensureOpenMarkerExists(wcdxFile);
        } catch (RuntimeException e) {
            // soldier on: but leave '.open' wcdx file as indicator of error
            ensureOpenMarkerExists(wcdxFile);
        } finally {
            if(writer!=null) {
                writer.close();
            }
        }

        return new Object[] {wcdxPath, count};
    }

    /**
     * Makes sure the '.open' marker file exists after a failure.
     *
     * @throws RuntimeException wrapping the IOException if it cannot be created
     */
    private static void ensureOpenMarkerExists(File wcdxFile) {
        if(!wcdxFile.exists()) {
            try {
                wcdxFile.createNewFile();
            } catch (IOException e1) {
                throw new RuntimeException(e1);
            }
        }
    }

    /**
     * Appends one space-delimited field. A {@code Header} contributes its
     * trimmed value; null, an empty string or a header without a value is
     * written as "-".
     */
    protected static void appendField(StringBuilder builder, Object obj) {
        if(builder.length()>0) {
            // prepend with delimiter
            builder.append(' ');
        }
        if(obj instanceof Header) {
            String value = ((Header)obj).getValue();
            obj = (value==null) ? null : value.trim();
        }

        builder.append((obj==null||obj.toString().length()==0)?"-":obj);
    }

    /**
     * Appends one space-delimited time field. A {@code Header} value is
     * parsed as an HTTP date and written as a 14-digit date; "-" is written
     * for null or a header without a value, and "e" for an unparseable date.
     */
    protected static void appendTimeField(StringBuilder builder, Object obj) {
        if(builder.length()>0) {
            // prepend with delimiter
            builder.append(' ');
        }
        if(obj==null) {
            builder.append("-");
            return;
        }
        if(obj instanceof Header) {
            String s = ((Header)obj).getValue();
            if(s==null) {
                builder.append("-");
                return;
            }
            s = s.trim();
            try {
                Date date = DateUtil.parseDate(s);
                String d = ArchiveUtils.get14DigitDate(date);
                // NOTE(review): dates in the 2090s are rewritten to the
                // 1990s; presumably a two-digit-year workaround -- confirm.
                if(d.startsWith("209")) {
                    d = "199"+d.substring(3);
                }
                obj = d;
            } catch (DateParseException e) {
                builder.append('e');
                return;
            }

        }
        builder.append(obj);
    }
}

//'wide' CDX
//a original url
//b timestamp
//s resp code
//m type
//? content md5 (full 'k'? 'c'?
//g arc name
//V compressed start offset
//? compressed length
//n? uncompressed length
//? mod date
//? expires
//? server 'date' hdr
//? etag
//r redirect ('Location'?)
//e ip
//MAYBE:
//? TITLE from HTML or other format?
+ + diff --git a/src/main/java/org/archive/io/arc/ARCConstants.java b/src/main/java/org/archive/io/arc/ARCConstants.java new file mode 100644 index 00000000..c44cfef7 --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCConstants.java @@ -0,0 +1,29 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io.arc; + + +/** + * Constants used by ARC files and in ARC file processing. + * + * @author stack + * @deprecated + */ +public interface ARCConstants extends org.archive.format.arc.ARCConstants { +} diff --git a/src/main/java/org/archive/io/arc/ARCLocation.java b/src/main/java/org/archive/io/arc/ARCLocation.java new file mode 100644 index 00000000..c6c64437 --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCLocation.java @@ -0,0 +1,37 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io.arc; + +/** + * Datastructure to hold ARC record location. + * Used by wayback machine. + * @author stack + */ +public interface ARCLocation { + /** + * @return Returns the ARC filename. Can be full path to ARC, URL to an + * ARC or just the portion of an ARC name that is unique to a collection. + */ + public String getName(); + + /** + * @return Returns the offset into the ARC. + */ + public long getOffset(); +} diff --git a/src/main/java/org/archive/io/arc/ARCReader.java b/src/main/java/org/archive/io/arc/ARCReader.java new file mode 100644 index 00000000..7f85cc2a --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCReader.java @@ -0,0 +1,553 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.io.arc; + +import java.io.ByteArrayOutputStream; +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.logging.Logger; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.cli.PosixParser; +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveRecord; +import org.archive.io.ArchiveRecordHeader; +import org.archive.io.RecoverableIOException; +import org.archive.io.WriterPoolMember; +import org.archive.util.ArchiveUtils; + + +/** + * Get an iterator on an ARC file or get a record by absolute position. + * + * ARC files are described here: + * Arc + * File Format. + * + *

This class knows how to parse an ARC file. Pass it a file path + * or a URL to an ARC. It can parse ARC versions 1 and 2. + * + *

Iterator returns ARCRecord + * though {@link Iterator#next()} is returning + * java.lang.Object. Cast the return. + * + *

Profiling java.io vs. memory-mapped ByteBufferInputStream shows the + * latter slightly slower -- but not by much. TODO: Test more. Just + * change {@link #getInputStream(File, long)}. + * + * @author stack + * @version $Date$ $Revision$ + */ +public abstract class ARCReader extends ArchiveReader +implements ARCConstants, Closeable { + private final Logger logger = Logger.getLogger(ARCReader.class.getName()); + + /** + * Set to true if we are aligned on first record of Archive file. + * We used depend on offset. If offset was zero, then we were + * aligned on first record. This is no longer necessarily the case when + * Reader is created at an offset into an Archive file: The offset is zero + * but its relative to where we started reading. + */ + private boolean alignedOnFirstRecord = true; + + private boolean parseHttpHeaders = true; + + protected ARCReader() { + super(); + } + + /** + * Skip over any trailing new lines at end of the record so we're lined up + * ready to read the next. + * @param record + * @throws IOException + */ + protected void gotoEOR(ArchiveRecord record) throws IOException { + if (getIn().available() <= 0) { + return; + } + + // Remove any trailing LINE_SEPARATOR + int c = -1; + while (getIn().available() > 0) { + if (getIn().markSupported()) { + getIn().mark(1); + } + c = getIn().read(); + if (c != -1) { + if (c == LINE_SEPARATOR) { + continue; + } + if (getIn().markSupported()) { + // We've overread. We're probably in next record. There is + // no way of telling for sure. It may be dross at end of + // current record. Backup. + getIn().reset(); + break; + } + ArchiveRecordHeader h = (getCurrentRecord() != null)? + record.getHeader(): null; + throw new IOException("Read " + (char)c + + " when only " + LINE_SEPARATOR + " expected. " + + getReaderIdentifier() + ((h != null)? + h.getHeaderFields().toString(): "")); + } + } + } + + /** + * Create new arc record. + * + * Encapsulate housekeeping that has to do w/ creating a new record. + * + *

Call this method at the end of the constructor to read in the + * arcfile header. There will be problems reading subsequent arc records + * if you don't, since the arcfile header has the list of metadata fields for + * all records that follow. + * + *

When parsing through ARCs writing out CDX info, we spend about + * 38% of CPU in here -- about 30% of which is in getTokenizedHeaderLine + * -- of which 16% is reading. + * + * @param is InputStream to use. + * @param offset Absolute offset into arc file. + * @return An arc record. + * @throws IOException + */ + protected ARCRecord createArchiveRecord(InputStream is, long offset) + throws IOException { + try { + String version = super.getVersion(); + ARCRecord record = new ARCRecord(is, getReaderIdentifier(), offset, + isDigest(), isStrict(), isParseHttpHeaders(), + isAlignedOnFirstRecord(), version); + if (version != null && super.getVersion() == null) + super.setVersion(version); + currentRecord(record); + } catch (IOException e) { + if (e instanceof RecoverableIOException) { + // Don't mess with RecoverableIOExceptions. Let them out. + throw e; + } + IOException newE = new IOException(e.getMessage() + " (Offset " + + offset + ")."); + newE.setStackTrace(e.getStackTrace()); + throw newE; + } + return (ARCRecord)getCurrentRecord(); + } + + /** + * Returns version of this ARC file. Usually read from first record of ARC. + * If we're reading without having first read the first record -- e.g. + * random access into middle of an ARC -- then version will not have been + * set. For now, we return a default, version 1.1. Later, if more than + * just one version of ARC, we could look at such as the meta line to see + * what version of ARC this is. + * @return Version of this ARC file. + */ + public String getVersion() { + return (super.getVersion() == null)? "1.1": super.getVersion(); + } + + protected boolean isAlignedOnFirstRecord() { + return alignedOnFirstRecord; + } + + protected void setAlignedOnFirstRecord(boolean alignedOnFirstRecord) { + this.alignedOnFirstRecord = alignedOnFirstRecord; + } + + /** + * @return Returns the parseHttpHeaders. 
+ */ + public boolean isParseHttpHeaders() { + return this.parseHttpHeaders; + } + + /** + * @param parse The parseHttpHeaders to set. + */ + public void setParseHttpHeaders(boolean parse) { + this.parseHttpHeaders = parse; + } + + public String getFileExtension() { + return ARC_FILE_EXTENSION; + } + + public String getDotFileExtension() { + return DOT_ARC_FILE_EXTENSION; + } + + protected boolean output(final String format) + throws IOException, java.text.ParseException { + boolean result = super.output(format); + if(!result && (format.equals(NOHEAD) || format.equals(HEADER))) { + throw new IOException(format + + " format only supported for single Records"); + } + return result; + } + + public boolean outputRecord(final String format) throws IOException { + boolean result = super.outputRecord(format); + if (result) { + return result; + } + if (format.equals(NOHEAD)) { + // No point digesting if dumping content. + setDigest(false); + ARCRecord r = (ARCRecord) get(); + r.skipHttpHeader(); + r.dump(); + result = true; + } else if (format.equals(HEADER)) { + // No point digesting if dumping content. + setDigest(false); + ARCRecord r = (ARCRecord) get(); + r.dumpHttpHeader(); + result = true; + } + + return result; + } + + public void dump(final boolean compress) + throws IOException, java.text.ParseException { + // No point digesting if we're doing a dump. + setDigest(false); + boolean firstRecord = true; + ARCWriter writer = null; + for (Iterator ii = iterator(); ii.hasNext();) { + ARCRecord r = (ARCRecord)ii.next(); + // We're to dump the arc on stdout. + // Get the first record's data if any. + ARCRecordMetaData meta = r.getMetaData(); + if (firstRecord) { + firstRecord = false; + // Get an ARCWriter. + ByteArrayOutputStream baos = + new ByteArrayOutputStream(r.available()); + // This is slow but done only once at top of ARC. 
+ while (r.available() > 0) { + baos.write(r.read()); + } + List listOfMetadata = new ArrayList(); + listOfMetadata.add(baos.toString(WriterPoolMember.UTF8)); + // Assume getArc returns full path to file. ARCWriter + // or new File will complain if it is otherwise. + List outDirs = new ArrayList(); + WriterPoolSettingsData settings = + new WriterPoolSettingsData("","",-1L,compress,outDirs,listOfMetadata); + writer = new ARCWriter(new AtomicInteger(), System.out, + new File(meta.getArc()), settings); + continue; + } + + writer.write(meta.getUrl(), meta.getMimetype(), meta.getIp(), + ArchiveUtils.parse14DigitDate(meta.getDate()).getTime(), + (int)meta.getLength(), r); + } + // System.out.println(System.currentTimeMillis() - start); + } + + /** + * @return an ArchiveReader that will delete a local file on close. Used + * when we bring Archive files local and need to clean up afterward. + */ + public ARCReader getDeleteFileOnCloseReader(final File f) { + final ARCReader d = this; + return new ARCReader() { + private final ARCReader delegate = d; + private File archiveFile = f; + + public void close() throws IOException { + this.delegate.close(); + if (this.archiveFile != null) { + if (archiveFile.exists()) { + archiveFile.delete(); + } + this.archiveFile = null; + } + } + + public ArchiveRecord get(long o) throws IOException { + return this.delegate.get(o); + } + + public boolean isDigest() { + return this.delegate.isDigest(); + } + + public boolean isStrict() { + return this.delegate.isStrict(); + } + + public Iterator iterator() { + return this.delegate.iterator(); + } + + public void setDigest(boolean d) { + this.delegate.setDigest(d); + } + + public void setStrict(boolean s) { + this.delegate.setStrict(s); + } + + public List validate() throws IOException { + return this.delegate.validate(); + } + + @Override + public ArchiveRecord get() throws IOException { + return this.delegate.get(); + } + + @Override + public String getVersion() { + return 
this.delegate.getVersion(); + } + + @Override + public List validate(int noRecords) throws IOException { + return this.delegate.validate(noRecords); + } + + @Override + protected ARCRecord createArchiveRecord(InputStream is, + long offset) + throws IOException { + return this.delegate.createArchiveRecord(is, offset); + } + + @Override + protected void gotoEOR(ArchiveRecord record) throws IOException { + this.delegate.gotoEOR(record); + } + + @Override + public void dump(boolean compress) + throws IOException, java.text.ParseException { + this.delegate.dump(compress); + } + + @Override + public String getDotFileExtension() { + return this.delegate.getDotFileExtension(); + } + + @Override + public String getFileExtension() { + return this.delegate.getFileExtension(); + } + }; + } + + // Static methods follow. + + /** + * + * @param formatter Help formatter instance. + * @param options Usage options. + * @param exitCode Exit code. + */ + private static void usage(HelpFormatter formatter, Options options, + int exitCode) { + formatter.printHelp("java org.archive.io.arc.ARCReader" + + " [--digest=true|false] \\\n" + + " [--format=cdx|cdxfile|dump|gzipdump|header|nohead]" + + " [--offset=#] \\\n[--strict] [--parse] ARC_FILE|ARC_URL", + options); + System.exit(exitCode); + } + + /** + * Write out the arcfile. + * + * @param reader + * @param format Format to use outputting. + * @throws IOException + * @throws java.text.ParseException + */ + protected static void output(ARCReader reader, String format) + throws IOException, java.text.ParseException { + if (!reader.output(format)) { + throw new IOException("Unsupported format: " + format); + } + } + + /** + * Generate a CDX index file for an ARC file. 
+ * + * @param urlOrPath The ARC file to generate a CDX index for + * @throws IOException + * @throws java.text.ParseException + */ + public static void createCDXIndexFile(String urlOrPath) + throws IOException, java.text.ParseException { + ARCReader r = ARCReaderFactory.get(urlOrPath); + r.setStrict(false); + r.setParseHttpHeaders(true); + r.setDigest(true); + output(r, CDX_FILE); + } + + /** + * Command-line interface to ARCReader. + * + * Here is the command-line interface: + *

+     * usage: java org.archive.io.arc.ARCReader [--offset=#] ARCFILE
+     *  -h,--help      Prints this message and exits.
+     *  -o,--offset    Outputs record at this offset into arc file.
+ * + *

See in $HERITRIX_HOME/bin/arcreader for a script that'll + * take care of classpaths and the calling of ARCReader. + * + *

Outputs using a pseudo-CDX format as described here: + * CDX + * Legend and here + * Example. + * Legend used below is: 'CDX b e a m s c V (or v if uncompressed) n g'. + * Hash is hard-coded straight SHA-1 hash of content. + * + * @param args Command-line arguments. + * @throws ParseException Failed parse of the command line. + * @throws IOException + * @throws java.text.ParseException + */ + @SuppressWarnings("unchecked") + public static void main(String [] args) + throws ParseException, IOException, java.text.ParseException { + Options options = getOptions(); + options.addOption(new Option("p","parse", false, "Parse headers.")); + PosixParser parser = new PosixParser(); + CommandLine cmdline = parser.parse(options, args, false); + List cmdlineArgs = cmdline.getArgList(); + Option [] cmdlineOptions = cmdline.getOptions(); + HelpFormatter formatter = new HelpFormatter(); + + // If no args, print help; usage() exits the JVM with the given code. + if (cmdlineArgs.size() <= 0) { + usage(formatter, options, 0); + } + + // Now look at options passed. + long offset = -1; + boolean digest = false; + boolean strict = false; + boolean parse = false; + String format = CDX; + for (int i = 0; i < cmdlineOptions.length; i++) { + switch(cmdlineOptions[i].getId()) { + case 'h': + usage(formatter, options, 0); + break; + + case 'o': + offset = + Long.parseLong(cmdlineOptions[i].getValue()); + break; + + case 's': + strict = true; + break; + + case 'p': + parse = true; + break; + + case 'd': + digest = getTrueOrFalse(cmdlineOptions[i].getValue()); + break; + + case 'f': + format = cmdlineOptions[i].getValue().toLowerCase(); + boolean match = false; + // List of supported formats. 
+ final String [] supportedFormats = + {CDX, DUMP, GZIP_DUMP, HEADER, NOHEAD, CDX_FILE}; + for (int ii = 0; ii < supportedFormats.length; ii++) { + if (supportedFormats[ii].equals(format)) { + match = true; + break; + } + } + if (!match) { + usage(formatter, options, 1); + } + break; + + default: + throw new RuntimeException("Unexpected option: " + + + cmdlineOptions[i].getId()); + } + } + + if (offset >= 0) { + if (cmdlineArgs.size() != 1) { + System.out.println("Error: Pass one arcfile only."); + usage(formatter, options, 1); + } + ARCReader arc = ARCReaderFactory.get((String)cmdlineArgs.get(0), + offset); + arc.setStrict(strict); + // We must parse headers if we need to skip them. + if (format.equals(NOHEAD) || format.equals(HEADER)) { + parse = true; + } + arc.setParseHttpHeaders(parse); + outputRecord(arc, format); + } else { + for (String urlOrPath : cmdlineArgs) { + try { + ARCReader r = ARCReaderFactory.get(urlOrPath); + r.setStrict(strict); + r.setParseHttpHeaders(parse); + r.setDigest(digest); + output(r, format); + } catch (RuntimeException e) { + // Write out name of file we failed on to help with + // debugging, print the stack trace, then exit with + // status 1. NOTE(review): this comment used to say we + // note the bad ARC and move on to the next one, but the + // System.exit(1) below stops at the first failure. + System.err.println("Exception processing " + urlOrPath + + ": " + e.getMessage()); + e.printStackTrace(System.err); + System.exit(1); + } + } + } + } +} diff --git a/src/main/java/org/archive/io/arc/ARCReaderFactory.java b/src/main/java/org/archive/io/arc/ARCReaderFactory.java new file mode 100644 index 00000000..e7dc1625 --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCReaderFactory.java @@ -0,0 +1,454 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. 
+ * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.arc; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Iterator; +import java.util.logging.Level; + +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveReaderFactory; +import org.archive.io.ArchiveRecord; +import org.archive.io.ArchiveRecordHeader; +import org.archive.util.FileUtils; +import org.archive.util.zip.GZIPMembersInputStream; +import org.archive.util.zip.GzipHeader; +import org.archive.util.zip.NoGzipMagicException; + +import com.google.common.io.CountingInputStream; + + +/** + * Factory that returns an ARCReader. + * + * Can handle compressed and uncompressed ARCs. + * + * @author stack + */ +public class ARCReaderFactory extends ArchiveReaderFactory +implements ARCConstants { + /** + * This factory instance. + */ + private static final ARCReaderFactory factory = new ARCReaderFactory(); + + /** + * Shutdown any access to default constructor. + */ + protected ARCReaderFactory() { + super(); + } + + public static ARCReader get(String arcFileOrUrl) + throws MalformedURLException, IOException { + return (ARCReader)ARCReaderFactory.factory. 
+ getArchiveReader(arcFileOrUrl); + } + + public static ARCReader get(String arcFileOrUrl, final long offset) + throws MalformedURLException, IOException { + return (ARCReader)ARCReaderFactory.factory. + getArchiveReader(arcFileOrUrl, offset); + } + + public static ARCReader get(final File f) throws IOException { + return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f); + } + + public static ARCReader get(final File f, final long offset) + throws IOException { + return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f, offset); + } + + protected ArchiveReader getArchiveReader(final File f, final long offset) + throws IOException { + return getArchiveReader(f, true, offset); + } + + /** + * @param f An arcfile to read. + * @param skipSuffixTest Set to true to skip the '.arc.gz' suffix check + * when testing whether the file is a compressed ARC (for example, + * to open ARCs with the .open or otherwise suffix). + * @param offset Have returned ARCReader set to start reading at passed + * offset. + * @return An ARCReader. + * @throws IOException + */ + public static ARCReader get(final File f, + final boolean skipSuffixTest, final long offset) + throws IOException { + return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f, + skipSuffixTest, offset); + } + + protected ArchiveReader getArchiveReader(final File arcFile, + final boolean skipSuffixTest, final long offset) + throws IOException { + boolean compressed = testCompressedARCFile(arcFile, skipSuffixTest); + if (!compressed) { + if (!FileUtils.isReadableWithExtensionAndMagic(arcFile, + ARC_FILE_EXTENSION, ARC_MAGIC_NUMBER)) { + throw new IOException(arcFile.getAbsolutePath() + + " is not an Internet Archive ARC file."); + } + } + return compressed? + (ARCReader)ARCReaderFactory.factory. + new CompressedARCReader(arcFile, offset): + (ARCReader)ARCReaderFactory.factory. 
+ new UncompressedARCReader(arcFile, offset); + } + + public static ArchiveReader get(final String s, final InputStream is, + final boolean atFirstRecord) + throws IOException { + return ARCReaderFactory.factory.getArchiveReader(s, is, + atFirstRecord); + } + + protected ArchiveReader getArchiveReader(final String arc, + final InputStream is, final boolean atFirstRecord) + throws IOException { + + // We do this mark() reset() stuff, wrapping in a BufferedInputStream if + // necessary to make it work, because testCompressedARCStream() consumes + // some bytes from the input stream + InputStream possiblyWrapped; + if (is.markSupported()) { + possiblyWrapped = is; + } else { + possiblyWrapped = new BufferedInputStream(is); + } + + possiblyWrapped.mark(100); + boolean compressed = testCompressedARCStream(possiblyWrapped); + possiblyWrapped.reset(); + + if (compressed) { + return new CompressedARCReader(arc, possiblyWrapped, atFirstRecord); + } else { + return new UncompressedARCReader(arc, possiblyWrapped); + } + } + + /** + * Get an ARCReader aligned at offset. This version of get + * will not bring the ARC local but will try to stream across the net making + * an HTTP 1.1 Range request on remote http server (RFC 2616 Section 14.35). + * + * @param arcUrl HTTP URL for an ARC (All ARCs considered remote). + * @param offset Offset into ARC at which to start fetching. + * @return An ARCReader aligned at offset. + * @throws IOException + */ + public static ARCReader get(final URL arcUrl, final long offset) + throws IOException { + return (ARCReader)ARCReaderFactory.factory.getArchiveReader(arcUrl, + offset); + } + + /** + * Get an ARCReader. + * Pulls the ARC local into wherever the System Property + * java.io.tmpdir points. It then hands back an ARCReader that + * points at this local copy. A close on this ARCReader instance will + * remove the local copy. + * @param arcUrl An URL that points at an ARC. + * @return An ARCReader. 
+ * @throws IOException + */ + public static ARCReader get(final URL arcUrl) + throws IOException { + return (ARCReader)ARCReaderFactory.factory.getArchiveReader(arcUrl); + } + + /** + * @param arcFile File to test. + * @return True if arcFile is compressed ARC. + * @throws IOException + */ + public boolean isCompressed(File arcFile) throws IOException { + return testCompressedARCFile(arcFile); + } + + /** + * Check file is compressed and in ARC GZIP format. + * + * @param arcFile File to test if it's an Internet Archive ARC file + * GZIP compressed. + * + * @return True if this is an Internet Archive GZIP'd ARC file (It begins + * w/ the Internet Archive GZIP header and has the + * COMPRESSED_ARC_FILE_EXTENSION suffix). + * + * @exception IOException If file does not exist or is not readable. + */ + public static boolean testCompressedARCFile(File arcFile) + throws IOException { + return testCompressedARCFile(arcFile, false); + } + + /** + * Check file is compressed and in ARC GZIP format. + * + * @param arcFile File to test if it's an Internet Archive ARC file + * GZIP compressed. + * @param skipSuffixCheck Set to true if we're not to test on the + * '.arc.gz' suffix. + * + * @return True if this is an Internet Archive GZIP'd ARC file (It begins + * w/ the Internet Archive GZIP header). + * + * @exception IOException If file does not exist or is not readable. + */ + public static boolean testCompressedARCFile(File arcFile, + boolean skipSuffixCheck) + throws IOException { + boolean compressedARCFile = false; + FileUtils.assertReadable(arcFile); + if(!skipSuffixCheck && !arcFile.getName().toLowerCase() + .endsWith(COMPRESSED_ARC_FILE_EXTENSION)) { + return compressedARCFile; + } + + final InputStream is = new FileInputStream(arcFile); + try { + compressedARCFile = testCompressedARCStream(is); + } finally { + is.close(); + } + return compressedARCFile; + } + + /** + * @param arcName Name to test; may be null. 
+ * @return True if the lower-cased name ends with the compressed or the + * plain ARC dot-extension; false otherwise, including for null. + */ + public static boolean isARCSuffix(final String arcName) { + return (arcName == null)? 
+ false: + (arcName.toLowerCase().endsWith(DOT_COMPRESSED_ARC_FILE_EXTENSION))? + true: + (arcName.toLowerCase().endsWith(DOT_ARC_FILE_EXTENSION))? + true: false; + } + + /** + * Tests passed stream is gzip stream by reading in the HEAD. + * Does not reposition the stream. That is left up to the caller. + * @param is An InputStream. + * @return True if compressed stream. + * @throws IOException + */ + public static boolean testCompressedARCStream(final InputStream is) + throws IOException { + boolean compressedARCFile = false; + GzipHeader gh = null; + try { + gh = new GzipHeader(is); + } catch (NoGzipMagicException e) { + return false; + } + + byte[] fextra = gh.getFextra(); + // Now make sure following bytes are IA GZIP comment. + // First check length. ARC_GZIP_EXTRA_FIELD includes length + // so subtract two and start compare to ARC_GZIP_EXTRA_FIELD + // at +2. + // some Alexa ARC files gzip extra fields have changed slightly + // after the first two bytes, so we'll just look for the 'LX' + // extension for valid IA ARC files. + if (fextra != null) { + if (fextra.length >= ARC_GZIP_EXTRA_FIELD.length - 2) { + if (fextra[0] == ARC_GZIP_EXTRA_FIELD[2] && + fextra[1] == ARC_GZIP_EXTRA_FIELD[3]) { + compressedARCFile = true; + } + } + } else { + // Some old arcs don't have an extra header at all, but they're still compressed + compressedARCFile = true; + } + + return compressedARCFile; + } + + /** + * Uncompressed arc file reader. + * @author stack + */ + public class UncompressedARCReader extends ARCReader { + /** + * Constructor. + * @param f Uncompressed arcfile to read. + * @throws IOException + */ + public UncompressedARCReader(final File f) + throws IOException { + this(f, 0); + } + + /** + * Constructor. + * + * @param f Uncompressed arcfile to read. + * @param offset Offset at which to position ARCReader. 
+ * @throws IOException + */ + public UncompressedARCReader(final File f, final long offset) + throws IOException { + // Arc file has been tested for existence by time it has come + // to here. + // NOTE(review): the count returned by skip() is ignored; confirm + // whether getInputStream(f, offset) already positions the stream. + setIn(new CountingInputStream(getInputStream(f, offset))); + getIn().skip(offset); + initialize(f.getAbsolutePath()); + } + + /** + * Constructor. + * + * @param f Uncompressed arc to read. + * @param is InputStream. + */ + public UncompressedARCReader(final String f, final InputStream is) { + // No file is opened here; the caller supplies the stream and + // f is only passed on to initialize(). + setIn(new CountingInputStream(is)); + initialize(f); + } + } + + /** + * Compressed arc file reader. + * + * @author stack + */ + public class CompressedARCReader extends ARCReader { + + /** + * Constructor. + * + * @param f + * Compressed arcfile to read. + * @throws IOException + */ + public CompressedARCReader(final File f) throws IOException { + this(f, 0); + } + + /** + * Constructor. + * + * @param f Compressed arcfile to read. + * @param offset Position at where to start reading file. + * @throws IOException + */ + public CompressedARCReader(final File f, final long offset) + throws IOException { + // Arc file has been tested for existence by time it has come + // to here. + setIn(new GZIPMembersInputStream(getInputStream(f, offset))); + ((GZIPMembersInputStream)getIn()).compressedSeek(offset); + setCompressed((offset == 0)); // TODO: does this make sense??? + initialize(f.getAbsolutePath()); + } + + /** + * Constructor. + * + * @param f Compressed arcfile. + * @param is InputStream to use. + * @param atFirstRecord Value handed to setAlignedOnFirstRecord(). 
+ * @throws IOException + */ + public CompressedARCReader(final String f, final InputStream is, + final boolean atFirstRecord) + throws IOException { + // No file is opened here; the caller supplies the stream and + // f is only passed on to initialize(). + setIn(new GZIPMembersInputStream(is)); + setCompressed(true); + setAlignedOnFirstRecord(atFirstRecord); + initialize(f); + } + + /** + * Get record at passed offset. 
+ * + * @param offset + * Byte index into arcfile at which a record starts. + * @return An ARCRecord reference. + * @throws IOException + */ + public ARCRecord get(long offset) throws IOException { + cleanupCurrentRecord(); + ((GZIPMembersInputStream)getIn()).compressedSeek(offset); + return createArchiveRecord(getIn(), offset); + } + + public Iterator iterator() { + /** + * Override ArchiveRecordIterator so can base returned iterator on + * the GZIPMembersInputStream member iterator. + */ + return new ArchiveRecordIterator() { + private GZIPMembersInputStream gis = + (GZIPMembersInputStream)getIn(); + + private Iterator gzipIterator = this.gis.memberIterator(); + + protected boolean innerHasNext() { + return this.gzipIterator.hasNext(); + } + + protected ArchiveRecord innerNext() throws IOException { + InputStream is = this.gzipIterator.next(); + return createArchiveRecord(is, Math.max(gis.getCurrentMemberStart(), gis.getCurrentMemberEnd())); + } + }; + } + + /** + * Read to the end of the current record. Leading LINE_SEPARATOR + * bytes are skipped silently; if anything else follows, the remaining + * bytes are counted and reported. + * + * @param rec Record whose header is included in the report. + * @throws IOException On read failure, or when trailing bytes are + * found and the reader is strict. + */ + protected void gotoEOR(ArchiveRecord rec) throws IOException { + int c; + while ((c = getIn().read())==LINE_SEPARATOR); + if(c==-1) { + return; + } + long skipped = 1; + while (getIn().read()>-1) { + skipped++; + } + // Report on system error the number of unexpected characters + // at the end of this record. + // NOTE(review): the null check is on getCurrentRecord() but the + // header is read from rec; confirm rec can never be null here. + ArchiveRecordHeader meta = (getCurrentRecord() != null)? + rec.getHeader(): null; + String message = "Record STARTING at " + + ((GZIPMembersInputStream)getIn()).getCurrentMemberStart() + + " has " + skipped + " trailing byte(s): " + + ((meta != null)? meta.toString(): ""); + if (isStrict()) { + throw new IOException(message); + } + logStdErr(Level.WARNING, message); + } + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java new file mode 100644 index 00000000..21bea07c --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCRecord.java @@ -0,0 +1,835 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). 
+ * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.arc; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.regex.Matcher; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.StatusLine; +import org.apache.commons.httpclient.util.EncodingUtil; +import org.apache.commons.lang.StringUtils; +import org.archive.io.ArchiveRecord; +import org.archive.io.ArchiveRecordHeader; +import org.archive.io.RecoverableIOException; +import org.archive.util.InetAddressUtil; +import org.archive.util.LaxHttpParser; +import org.archive.util.TextUtils; + +/** + * An ARC file record. + * Does not compass the ARCRecord metadata line, just the record content. + * @author stack + */ +public class ARCRecord extends ArchiveRecord implements ARCConstants { + /** + * Http status line object. + * + * May be null if record is not http. + */ + private StatusLine httpStatus = null; + + /** + * Http header bytes. + * + * If non-null and bytes available, give out its contents before we + * go back to the underlying stream. 
+ */ + private InputStream httpHeaderStream = null; + + /** + * Http headers. + * + * Only populated after reading of headers. + */ + private Header [] httpHeaders = null; + + /** + * Array of field names. + * + * Used to initialize headerFieldNameKeys. + */ + private final String [] headerFieldNameKeysArray = { + URL_FIELD_KEY, + IP_HEADER_FIELD_KEY, + DATE_FIELD_KEY, + MIMETYPE_FIELD_KEY, + LENGTH_FIELD_KEY + }; + + /** + * A list of the header field names found in the ARC file header on + * the 3rd line. + * + * We used to read these in from the arc file first record 3rd line but + * now we hardcode them for sake of improved performance. + */ + private final List headerFieldNameKeys = + Arrays.asList(this.headerFieldNameKeysArray); + + /** + * Http header bytes read while trying to read http header + */ + public long httpHeaderBytesRead = -1; + + /** + * record length from metadata line + */ + public long recordDeclaredLength; + + /** + * compressed size; NOTE(review): a primitive long cannot be null, so + * presumably left at 0 if source was not compressed - confirm + */ + public long compressedBytes; + + /** + * actual payload data (not including trailing newline), + * should match record-declared-length + */ + public long uncompressedBytes; + + /** + * content-length header, iff HTTP and present; NOTE(review): a + * primitive long cannot be null - confirm the value used when absent + */ + public long httpPayloadDeclaredLength; + + /** + * actual http payload length, should match http-payload-declared-length + */ + public long httpPayloadActualLength; + + /** + * errors encountered reading record + */ + public List errors = new ArrayList(); + + /** + * ARC record header line, re-joined from its parsed tokens with + * single spaces + */ + private String headerString; + public String getHeaderString() { + return this.headerString; + } + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the record this instance + * is to represent. + * @param metaData Meta data. 
+ * @throws IOException + */ + public ARCRecord(InputStream in, ArchiveRecordHeader metaData) + throws IOException { + this(in, metaData, 0, true, false, true); + } + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the record this instance + * is to represent. + * @param metaData Meta data. + * @param bodyOffset Offset into the body. Usually 0. + * @param digest True if we're to calculate digest for this record. Not + * digesting saves about ~15% of cpu during an ARC parse. + * @param strict Be strict parsing (Parsing stops if ARC improperly + * formatted). + * @param parseHttpHeaders True if we are to parse HTTP headers. Costs + * about ~20% of CPU during an ARC parse. + * @throws IOException + */ + public ARCRecord(InputStream in, ArchiveRecordHeader metaData, + int bodyOffset, boolean digest, boolean strict, + final boolean parseHttpHeaders) + throws IOException { + super(in, metaData, bodyOffset, digest, strict); + if (parseHttpHeaders) { + this.httpHeaderStream = readHttpHeader(); + } + } + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the records metadata + * this instance is to represent. + * @param identifier Identifier for this the hosting Reader. + * @param offset Current offset into in (Used to keep + * position properly aligned). Usually 0. + * @param digest True if we're to calculate digest for this record. Not + * digesting saves about ~15% of cpu during an ARC parse. + * @param strict Be strict parsing (Parsing stops if ARC improperly + * formatted). + * @param parseHttpHeaders True if we are to parse HTTP headers. Costs + * about ~20% of CPU during an ARC parse. 
+ * @param isAlignedOnFirstRecord True if this is the first record to be + * read from an archive + * @param version Version information to be returned to the + * ARCReader constructing this record + * + * @throws IOException + */ + public ARCRecord(InputStream in, final String identifier, + final long offset, boolean digest, boolean strict, + final boolean parseHttpHeaders, + final boolean isAlignedOnFirstRecord, String version) + throws IOException { + super(in, null, 0, digest, strict); + setHeader(parseHeaders(in, identifier, offset, strict, isAlignedOnFirstRecord, version)); + if (parseHttpHeaders) { + this.httpHeaderStream = readHttpHeader(); + } + } + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the records metadata + * this instance is to represent. + * @param identifier Identifier for this the hosting Reader. + * @param offset Current offset into in (Used to keep + * position properly aligned). Usually 0. + * @param digest True if we're to calculate digest for this record. Not + * digesting saves about ~15% of cpu during an ARC parse. + * @param strict Be strict parsing (Parsing stops if ARC improperly + * formatted). + * @param parseHttpHeaders True if we are to parse HTTP headers. Costs + * about ~20% of CPU during an ARC parse. 
+     *
+     * @throws IOException
+     */
+    public ARCRecord(InputStream in, final String identifier,
+            final long offset, boolean digest, boolean strict,
+            final boolean parseHttpHeaders)
+    throws IOException {
+        this(in, identifier, offset, digest, strict, parseHttpHeaders,
+                false, null);
+    }
+
+    /**
+     * Parse the ARC metadata line(s) at the head of this record.
+     *
+     * One header line is always read. When <code>offset</code> is zero and
+     * <code>isAlignedOnFirstRecord</code> is true, two further lines are
+     * consumed (version line and field-name line) and their combined length
+     * becomes this record's body offset.
+     *
+     * @param in Stream positioned at the start of the metadata line.
+     * @param identifier Identifier handed on to the metadata object.
+     * @param offset Offset of this record in the ARC.
+     * @param strict Not consulted in this method; the tokenizer reads
+     * strictness through {@link #isStrict()}.
+     * @param isAlignedOnFirstRecord True if the reader was aligned on the
+     * first record when it was created.
+     * @param version ARC version to record; replaced by the value read from
+     * the second header line when that line is parsed.
+     * @return Metadata for this record.
+     * @throws IOException If a header line cannot be read or parsed.
+     */
+    private ArchiveRecordHeader parseHeaders(final InputStream in,
+            final String identifier, final long offset, final boolean strict,
+            final boolean isAlignedOnFirstRecord, String version)
+    throws IOException {
+
+        ArrayList<String> firstLineValues = new ArrayList<String>(20);
+        getTokenizedHeaderLine(in, firstLineValues);
+
+        int bodyOffset = 0;
+        if (offset == 0 && isAlignedOnFirstRecord) {
+            // If offset is zero and we were aligned at first record on
+            // creation (See #alignedOnFirstRecord for more on this), then no
+            // records have been read yet and we're reading our first one, the
+            // record of ARC file meta info.  Its special.  In ARC versions
+            // 1.x, first record has three lines of meta info. We've just read
+            // the first line. There are two more.  The second line has misc.
+            // info.  We're only interested in the first field, the version
+            // number.  The third line is the list of field names. Here's what
+            // ARC file version 1.x meta content looks like:
+            //
+            // filedesc://testIsBoundary-JunitIAH200401070157520.arc 0.0.0.0 \\
+            //      20040107015752 text/plain 77
+            // 1 0 InternetArchive
+            // URL IP-address Archive-date Content-type Archive-length
+            //
+            ArrayList<String> secondLineValues = new ArrayList<String>(20);
+            bodyOffset += getTokenizedHeaderLine(in, secondLineValues);
+            version = secondLineValues.get(0) + "." + secondLineValues.get(1);
+            // Just read over the 3rd line.  We used to parse it and use
+            // values found here but now we just hardcode them to avoid
+            // having to read this 3rd line even for random arc file accesses.
+            bodyOffset += getTokenizedHeaderLine(in, null);
+        }
+        setBodyOffset(bodyOffset);
+
+        return computeMetaData(this.headerFieldNameKeys, firstLineValues,
+                version, offset, identifier);
+    }
+
+    /**
+     * Get a record header line as list of tokens.
+     *
+     * We keep reading till we find a LINE_SEPARATOR or we reach the end
+     * of file w/o finding a LINE_SEPARATOR or the line length is crazy.
+     * Empty lines before the header line are skipped.
+     *
+     * The first line tokenized into a non-null list is also remembered as
+     * this record's header string. Later calls, such as those for the 2nd
+     * and 3rd lines of the file description record, leave it untouched.
+     * Previously every call overwrote it, which left it null after the
+     * 3rd line was read with a null list.
+     *
+     * @param stream InputStream to read from.
+     * @param list Empty list that gets filled w/ string tokens, or null to
+     * read over the line without collecting tokens.
+     * @return Count of characters read.
+     * @exception IOException If problem reading stream or no line separator
+     * found or EOF before EOL or we didn't get minimum header fields.
+     */
+    private int getTokenizedHeaderLine(final InputStream stream,
+            List<String> list) throws IOException {
+        // Preallocate usual line size.
+        StringBuilder buffer = new StringBuilder(2048 + 20);
+        int read = 0;
+        int previous = -1;
+        for (int c = -1; true;) {
+            previous = c;
+            c = stream.read();
+            if (c == -1) {
+                throw new RecoverableIOException("Hit EOF before header EOL.");
+            }
+            c &= 0xff;
+            read++;
+            if (read > MAX_HEADER_LINE_LENGTH) {
+                throw new IOException("Header line longer than max allowed " +
+                    " -- " + String.valueOf(MAX_HEADER_LINE_LENGTH) +
+                    " -- or passed buffer doesn't contain a line (Read: " +
+                    buffer.length() + ").  Here's" +
+                    " some of what was read: " +
+                    buffer.substring(0, Math.min(buffer.length(), 256)));
+            }
+
+            if (c == LINE_SEPARATOR) {
+                if (buffer.length() == 0) {
+                    // Empty line at start of buffer.  Skip it and try again.
+                    continue;
+                }
+
+                if (list != null) {
+                    list.add(buffer.toString());
+                }
+                // LOOP TERMINATION.
+                break;
+            } else if (c == HEADER_FIELD_SEPARATOR) {
+                if (!isStrict() && previous == HEADER_FIELD_SEPARATOR) {
+                    // Early ARCs sometimes had multiple spaces between fields.
+                    continue;
+                }
+                if (list != null) {
+                    list.add(buffer.toString());
+                }
+                // reset to empty
+                buffer.setLength(0);
+            } else {
+                buffer.append((char)c);
+            }
+        }
+
+        // List must have at least 3 elements in it and no more than 100.  If
+        // it has other than this, then bogus parse.
+        if (list != null && (list.size() < 3 || list.size() > 100)) {
+            throw new IOException("Unparseable header line: " + list);
+        }
+
+        // Save the header string once, from the first tokenized line only,
+        // so follow-on lines cannot replace it or reset it to null.
+        if (list != null && this.headerString == null) {
+            this.headerString = StringUtils.join(list, " ");
+        }
+
+        return read;
+    }
+
+    /**
+     * Compute metadata fields.
+     *
+     * Here we check the meta field has right number of items in it. When
+     * the counts differ and parsing is not strict, known malformations are
+     * repaired (spaces in the URL, a space inside the mimetype, an empty
+     * mimetype) and the repair is noted on stderr.
+     *
+     * @param keys Keys to use composing headerFields map.
+     * @param values Values to set into the headerFields map.
+     * @param v The version of this ARC file.
+     * @param offset Offset into arc file.
+     * @param identifier Identifier handed to the metadata object.
+     *
+     * @return Metadata structure for this record.
+     *
+     * @exception IOException  If no. of keys doesn't match no. of values.
+     */
+    private ARCRecordMetaData computeMetaData(List<String> keys,
+            List<String> values, String v, long offset,
+            final String identifier)
+    throws IOException {
+        if (keys.size() != values.size()) {
+            List<String> originalValues = values;
+            if (!isStrict()) {
+                values = fixSpaceInURL(values, keys.size());
+                // If values still doesn't match key size, try and do
+                // further repair.
+                if (keys.size() != values.size()) {
+                    // Early ARCs had a space in mimetype.
+                    if (values.size() == (keys.size() + 1) &&
+                            values.get(4).toLowerCase().startsWith("charset=")) {
+                        List<String> nuvalues =
+                            new ArrayList<String>(keys.size());
+                        nuvalues.add(0, values.get(0));
+                        nuvalues.add(1, values.get(1));
+                        nuvalues.add(2, values.get(2));
+                        nuvalues.add(3, values.get(3) + values.get(4));
+                        nuvalues.add(4, values.get(5));
+                        values = nuvalues;
+                    } else if((values.size() + 1) == keys.size() &&
+                            isLegitimateIPValue(values.get(1)) &&
+                            isDate(values.get(2)) && isNumber(values.get(3))) {
+                        // Mimetype is empty.
+                        List<String> nuvalues =
+                            new ArrayList<String>(keys.size());
+                        nuvalues.add(0, values.get(0));
+                        nuvalues.add(1, values.get(1));
+                        nuvalues.add(2, values.get(2));
+                        nuvalues.add(3, "-");
+                        nuvalues.add(4, values.get(3));
+                        values = nuvalues;
+                    }
+                }
+            }
+            if (keys.size() != values.size()) {
+                throw new IOException("Size of field name keys does" +
+                    " not match count of field values: " + values);
+            }
+            // Note that field was fixed on stderr.
+            System.err.println(Level.WARNING.toString() + "Fixed spaces in metadata line at " +
+                "offset " + offset +
+                " Original: " + originalValues + ", New: " + values);
+        }
+
+        Map<String, Object> headerFields =
+            new HashMap<String, Object>(keys.size() + 2);
+        for (int i = 0; i < keys.size(); i++) {
+            headerFields.put(keys.get(i), values.get(i));
+        }
+
+        // Add a check for tabs in URLs.  If any, replace with '%09'.
+        // See https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966,
+        // [ 1010966 ] crawl.log has URIs with spaces in them.
+        String url = (String)headerFields.get(URL_FIELD_KEY);
+        if (url != null && url.indexOf('\t') >= 0) {
+            headerFields.put(URL_FIELD_KEY,
+                TextUtils.replaceAll("\t", url, "%09"));
+        }
+
+        headerFields.put(VERSION_FIELD_KEY, v);
+        headerFields.put(ABSOLUTE_OFFSET_KEY, Long.valueOf(offset));
+
+        return new ARCRecordMetaData(identifier, headerFields);
+    }
+
+    /**
+     * Fix space in URLs.
+     * The ARCWriter used to write into the ARC URLs with spaces in them.
+     * See <a
+     * href="https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966">[ 1010966 ]
+     * crawl.log has URIs with spaces in them</a>.
+     * This method does fix up on such headers converting all spaces found
+     * to '%20'.
+     * @param values List of metadata values.
+     * @param requiredSize Expected size of resultant values list.
+     * @return New list if we successfully fixed up values or original if
+     * fixup failed.
+     */
+    private List<String> fixSpaceInURL(List<String> values, int requiredSize) {
+        // Do validity check. 3rd from last is a date of 14 numeric
+        // characters. The 4th from last is IP, all before the IP
+        // should be concatenated together with a '%20' joiner.
+        // In the below, '4' is 4th field from end which has the IP.
+        if (!(values.size() > requiredSize) || values.size() < 4) {
+            return values;
+        }
+        // Test 3rd field is valid date.
+        if (!isDate(values.get(values.size() - 3))) {
+            return values;
+        }
+
+        // Test 4th field is valid IP.
+        if (!isLegitimateIPValue(values.get(values.size() - 4))) {
+            return values;
+        }
+
+        List<String> newValues = new ArrayList<String>(requiredSize);
+        StringBuilder url = new StringBuilder();
+        for (int i = 0; i < (values.size() - 4); i++) {
+            if (i > 0) {
+                url.append("%20");
+            }
+            url.append(values.get(i));
+        }
+        newValues.add(url.toString());
+        for (int i = values.size() - 4; i < values.size(); i++) {
+            newValues.add(values.get(i));
+        }
+        return newValues;
+    }
+
+    /**
+     * @param date Candidate date string.
+     * @return True if the string is exactly 14 characters, all digits.
+     */
+    private boolean isDate(final String date) {
+        if (date.length() != 14) {
+            return false;
+        }
+        return isNumber(date);
+    }
+
+    /**
+     * @param n Candidate number string.
+     * @return True if every character is a digit (also true for the empty
+     * string).
+     */
+    private boolean isNumber(final String n) {
+        for (int i = 0; i < n.length(); i++) {
+            if (!Character.isDigit(n.charAt(i))) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * @param ip Candidate IP field.
+     * @return True if the field is "-" or matches the IPv4 quads pattern.
+     */
+    private boolean isLegitimateIPValue(final String ip) {
+        if ("-".equals(ip)) {
+            return true;
+        }
+        Matcher m = InetAddressUtil.IPV4_QUADS.matcher(ip);
+        return m != null && m.matches();
+    }
+
+    /**
+     * Skip over the http header if one is present.
+     *
+     * Subsequent reads will get the body.
+     *
+     *

Calling this method in the midst of reading the header + * will make for strange results. Otherwise, it is safe to call + * at any time, though calling it before reading any of the arc record + * content is the only time that it makes sense. + * + *

After calling this method, you can call + * {@link #getHttpHeaders()} to get the read http header. + * + * @throws IOException if draining the buffered http header fails. + */ + public void skipHttpHeader() throws IOException { + if (this.httpHeaderStream != null) { + // Empty the httpHeaderStream + for (int available = this.httpHeaderStream.available(); + this.httpHeaderStream != null && + (available = this.httpHeaderStream.available()) > 0;) { + // We should be in this loop only once, so this buffer + // allocation should happen only once. + byte [] buffer = new byte[available]; + // The read nulls out httpHeaderStream when done with it so + // need check for null in the loop control line. + read(buffer, 0, available); + } + } + } + + /** + * Write whatever remains unread of the buffered http header to + * System.out. Does nothing when there is no http header stream. + * + * @throws IOException if reading the buffered http header fails. + */ + public void dumpHttpHeader() throws IOException { + if (this.httpHeaderStream == null) { + return; + } + // Dump the httpHeaderStream to STDOUT + for (int available = this.httpHeaderStream.available(); + this.httpHeaderStream != null + && (available = this.httpHeaderStream.available()) > 0;) { + // We should be in this loop only once and should do this + // buffer allocation once. + byte[] buffer = new byte[available]; + // The read nulls out httpHeaderStream when done with it so + // need check for null in the loop control line. + int read = read(buffer, 0, available); + System.out.write(buffer, 0, read); + } + } + + /** + * Read http header if present. Technique borrowed from HttpClient HttpParse + * class. Problems that do not stop parsing (invalid or unparseable status + * line, truncated header) are recorded in this.errors. + * + * @return ByteArrayInputStream with the http header in it or null if no + * http header. + * @throws IOException if the status line has no end-of-line (thrown as + * RecoverableIOException), if the record is a DELETED record (thrown as + * DeletedARCRecordIOException), or if a header line has no end-of-line + * while more input is still available. + */ + private InputStream readHttpHeader() throws IOException { + + // this can be helpful when simply iterating over records, + // looking for problems. + Logger logger = Logger.getLogger(this.getClass().getName()); + ArchiveRecordHeader h = this.getHeader(); + + // If judged a record that doesn't have an http header, return + // immediately. 
+ String url = getHeader().getUrl(); + if(!url.startsWith("http") || + getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) { + return null; + } + + String statusLine; + byte[] statusBytes; + int eolCharCount = 0; + int errOffset = 0; + + // Read status line, skipping any errant http headers found before it. + // This allows a larger number of 'corrupt' arcs -- where headers were accidentally + // inserted before the status line -- to be readable. + while (true) { + statusBytes = LaxHttpParser.readRawLine(getIn()); + eolCharCount = getEolCharsCount(statusBytes); + if (eolCharCount <= 0) { + throw new RecoverableIOException( + "Failed to read http status where one was expected: " + + ((statusBytes == null) ? "" : new String(statusBytes))); + } + + statusLine = EncodingUtil.getString(statusBytes, 0, + statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); + + // If a null or DELETED break immediately + if ((statusLine == null) || statusLine.startsWith("DELETED")) { + break; + } + + // If it's actually the status line, break, otherwise continue skipping any + // previous header values + if (!statusLine.contains(":") && StatusLine.startsWithHTTP(statusLine)) { + break; + } + + // Add bytes read to error "offset" to add to position + errOffset += statusBytes.length; + } + + if (errOffset > 0) { + this.incrementPosition(errOffset); + } + + if ((statusLine == null) || + !StatusLine.startsWithHTTP(statusLine)) { + // NOTE(review): statusLine is dereferenced below even though the + // enclosing condition also matches statusLine == null -- confirm + // that null cannot actually reach this point. + if (statusLine.startsWith("DELETED")) { + // Some old ARCs have deleted records like following: + // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202 + // DELETED_TIME=20000425001133_DELETER=Kurt_REASON=alexalist + // (follows ~29K spaces) + // For now, throw a RecoverableIOException so if iterating over + // records, we keep going. TODO: Later make a legitimate + // ARCRecord from the deleted record rather than throw + // exception. 
+ throw new DeletedARCRecordIOException(statusLine); + } else { + this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_INVALID); + } + } + + try { + this.httpStatus = new StatusLine(statusLine); + } catch(IOException e) { + logger.warning(e.getMessage() + " at offset: " + h.getOffset()); + this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_EXCEPTION); + } + + // Save off all bytes read. Keep them as bytes rather than + // convert to strings so we don't have to worry about encodings + // though this should never be a problem doing http headers since + // it's all supposed to be ascii. + ByteArrayOutputStream baos = + new ByteArrayOutputStream(statusBytes.length + 4 * 1024); + baos.write(statusBytes); + + // Now read rest of the header lines looking for the separation + // between header and body. + for (byte [] lineBytes = null; true;) { + lineBytes = LaxHttpParser.readRawLine(getIn()); + eolCharCount = getEolCharsCount(lineBytes); + if (eolCharCount <= 0) { + if (getIn().available() == 0) { + // NOTE(review): this adds the status line length, whereas the + // normal branch below adds lineBytes.length -- confirm intended. + httpHeaderBytesRead += statusBytes.length; + logger.warning("HTTP header truncated at offset: " + h.getOffset()); + this.errors.add(ArcRecordErrors.HTTP_HEADER_TRUNCATED); + this.setEor(true); + break; + } else { + throw new IOException("Failed reading http headers: " + + ((lineBytes != null)? new String(lineBytes): null)); + } + } else { + httpHeaderBytesRead += lineBytes.length; + } + // Save the bytes read. + baos.write(lineBytes); + if ((lineBytes.length - eolCharCount) <= 0) { + // A line holding only end-of-line characters ends the header. + break; + } + } + + byte [] headerBytes = baos.toByteArray(); + // Save off where body starts. + this.getMetaData().setContentBegin(headerBytes.length); + ByteArrayInputStream bais = + new ByteArrayInputStream(headerBytes); + if (!bais.markSupported()) { + throw new IOException("ByteArrayInputStream does not support mark"); + } + bais.mark(headerBytes.length); + // Read the status line. Don't let it into the parseHeaders function. 
+ // It doesn't know what to do with it. + bais.read(statusBytes, 0, statusBytes.length); + this.httpHeaders = LaxHttpParser.parseHeaders(bais, + ARCConstants.DEFAULT_ENCODING); + this.getMetaData().setStatusCode(Integer.toString(getStatusCode())); + // Rewind to the mark so the returned stream starts at the status line. + bais.reset(); + return bais; + } + + /** + * Thrown by readHttpHeader when the record's first line starts with + * "DELETED". + */ + private static class DeletedARCRecordIOException + extends RecoverableIOException { + private static final long serialVersionUID = 1L; + + public DeletedARCRecordIOException(final String reason) { + super(reason); + } + } + + /** + * Return status code for this record. + * + * This method will return -1 until the http header has been read. + * @return Status code. + */ + public int getStatusCode() { + return (this.httpStatus == null)? -1: this.httpStatus.getStatusCode(); + } + + /** + * Counts a trailing '\n' and, if present, the '\r' immediately before it. + * + * @param bytes Array of bytes to examine for an EOL. May be null. + * @return Count of end-of-line characters (0, 1 or 2); zero if none. + */ + private int getEolCharsCount(byte [] bytes) { + int count = 0; + if (bytes != null && bytes.length >=1 && + bytes[bytes.length - 1] == '\n') { + count++; + if (bytes.length >=2 && bytes[bytes.length -2] == '\r') { + count++; + } + } + return count; + } + + /** + * @return Meta data for this record (the record header cast to + * ARCRecordMetaData). + */ + public ARCRecordMetaData getMetaData() { + return (ARCRecordMetaData)getHeader(); + } + + /** + * @return http headers (Only available after header has been read). + */ + public Header [] getHttpHeaders() { + return this.httpHeaders; + } + + /** + * @return ArcRecordErrors encountered when reading + */ + public List getErrors() { + return this.errors; + } + + /** + * @return true if ARC record errors found + */ + public boolean hasErrors() { + return !this.errors.isEmpty(); + } + + /** + * @return Next character in this ARCRecord's content else -1 if at end of + * this record. 
+ * @throws IOException if reading from the header buffer or the underlying stream fails. + */ + public int read() throws IOException { + int c = -1; + if (this.httpHeaderStream != null && + (this.httpHeaderStream.available() > 0)) { + // If http header, return bytes from it before we go to underlying + // stream. + c = this.httpHeaderStream.read(); + // If done with the header stream, null it out. + if (this.httpHeaderStream.available() <= 0) { + this.httpHeaderStream = null; + } + incrementPosition(); + } else { + c = super.read(); + } + return c; + } + + /** + * Bulk read. Bytes still buffered from the http header are served first; + * once they are used up, reads go to the underlying stream. + * + * @param b Buffer to read into. + * @param offset Index in b at which to start storing bytes. + * @param length Maximum number of bytes to read. + * @return Number of bytes read; 0 when length is zero while header bytes + * are still buffered; otherwise whatever the superclass read returns. + * @throws IOException if the read fails. + */ + public int read(byte [] b, int offset, int length) throws IOException { + if (this.httpHeaderStream == null || + this.httpHeaderStream.available() <= 0) { + return super.read(b, offset, length); + } + if (length == 0) { + // A zero-length request is not end-of-stream: report 0 bytes and + // leave the position alone (this case used to return -1 and pass + // -1 to incrementPosition). + return 0; + } + // Serve from the http header first, never past its end. + int read = this.httpHeaderStream.read(b, offset, + Math.min(length, this.httpHeaderStream.available())); + // If done with the header stream, null it out. + if (this.httpHeaderStream.available() <= 0) { + this.httpHeaderStream = null; + } + // Only advance the position by bytes actually delivered. + if (read > 0) { + incrementPosition(read); + } + return read; + } + + /** + * @return Offset at which the body begins (Only known after + * header has been read) or -1 if none or if we haven't read + * headers yet. Usually length of HTTP headers (does not include ARC + * metadata line length). + */ + public int getBodyOffset() { + return this.getMetaData().getContentBegin(); + } + + @Override + protected String getIp4Cdx(ArchiveRecordHeader h) { + String result = null; + if (h instanceof ARCRecordMetaData) { + result = ((ARCRecordMetaData)h).getIp(); + } + return (result != null)? result: super.getIp4Cdx(h); + } + + @Override + protected String getStatusCode4Cdx(ArchiveRecordHeader h) { + String result = null; + if (h instanceof ARCRecordMetaData) { + result = ((ARCRecordMetaData) h).getStatusCode(); + } + return (result != null) ? 
result: super.getStatusCode4Cdx(h); + } + + @Override + protected String getDigest4Cdx(ArchiveRecordHeader h) { + String result = null; + if (h instanceof ARCRecordMetaData) { + result = ((ARCRecordMetaData) h).getDigest(); + } + return (result != null) ? result: super.getDigest4Cdx(h); + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/arc/ARCRecordMetaData.java b/src/main/java/org/archive/io/arc/ARCRecordMetaData.java new file mode 100644 index 00000000..3f617041 --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCRecordMetaData.java @@ -0,0 +1,267 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.arc; + +import java.io.File; +import java.io.IOException; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; + +import org.archive.io.ArchiveRecordHeader; + + +/** + * An immutable class to hold an ARC record meta data. + * + * @author stack + */ +public class ARCRecordMetaData implements ArchiveRecordHeader, ARCConstants { + /** + * Map of record header fields. + * + * We store all in a hashmap. This way we can hold version 1 or + * version 2 record meta data. + * + *

Keys are lowercase. + */ + protected Map headerFields = null; + + /** + * Digest for the record. + * + * Only available after the record has been read in totality. + */ + private String digest = null; + + /** + * Status for this request. + * + * There may be no status. + */ + private String statusCode = null; + + /** + * The arc this metadata came out of. + * Descriptive String, either path or URL. + */ + private String arc = null; + + /** + * Value handed back by getContentBegin(); starts at 0 until + * setContentBegin(int) is called. + */ + private int contentBegin = 0; + + /** + * No-argument constructor; protected, so not part of the public API. + */ + protected ARCRecordMetaData() { + super(); + } + + /** + * Constructor. + * + * @param arc The arc file this metadata came out of. + * @param headerFields Hash of meta fields. The map is stored as passed, + * not copied. + * + * @throws IOException If any of the REQUIRED_VERSION_1_HEADER_FIELDS is + * missing from headerFields. + */ + public ARCRecordMetaData(final String arc, Map headerFields) + throws IOException { + // Make sure the minimum required fields are present, + for (Iterator i = REQUIRED_VERSION_1_HEADER_FIELDS.iterator(); + i.hasNext(); ) { + testRequiredField(headerFields, (String)i.next()); + } + this.headerFields = headerFields; + this.arc = arc; + } + + /** + * Test required field is present in hash. Only the presence of the key is + * checked, not its value. + * + * @param fields Map of fields. + * @param requiredField Field to test for. + * + * @exception IOException If required field is not present. + */ + protected void testRequiredField(Map fields, String requiredField) + throws IOException { + if (!fields.containsKey(requiredField)) { + throw new IOException("Required field " + requiredField + + " not in meta data."); + } + } + + /** + * Get the time when the record was harvested. + *

+ * Returns the date in Heritrix 14 digit time format (UTC). See the + * {@link org.archive.util.ArchiveUtils} class for converting to Java + * dates. + * + * @return Header date in Heritrix 14 digit format. + * @see org.archive.util.ArchiveUtils#parse14DigitDate(String) + */ + public String getDate() { + return (String) this.headerFields.get(DATE_FIELD_KEY); + } + + /** + * @return Length of the record, parsed from the length header field. + * @throws NumberFormatException if the stored length is not a parseable + * long. + */ + public long getLength() { + return Long.parseLong((String)this.headerFields. + get(LENGTH_FIELD_KEY)); + } + + /** + * @return Content-Length of the contents of the record. Currently simply + * the record length. TODO: confirm the two are the same for arcs. + */ + public long getContentLength() { + return getLength(); + } + + /** + * @return Header url. + */ + public String getUrl() { + return (String)this.headerFields.get(URL_FIELD_KEY); + } + + /** + * @return IP. + */ + public String getIp() + { + return (String)this.headerFields.get(IP_HEADER_FIELD_KEY); + } + + /** + * @return mimetype The mimetype that is in the ARC metaline -- NOT the http + * content-type content. + */ + public String getMimetype() { + return (String)this.headerFields.get(MIMETYPE_FIELD_KEY); + } + + /** + * @return Arcfile version. + */ + public String getVersion() { + return (String)this.headerFields.get(VERSION_FIELD_KEY); + } + + /** + * @return Offset into arcfile at which this record begins. The value is + * stored in the header map as a Long under ABSOLUTE_OFFSET_KEY. + */ + public long getOffset() { + return ((Long)this.headerFields.get(ABSOLUTE_OFFSET_KEY)).longValue(); + } + + /** + * @param key Key to use looking up field value. + * @return value for passed key or null if no such entry. + */ + public Object getHeaderValue(String key) { + return this.headerFields.get(key); + } + + /** + * @return Header field name keys (the key set of the header map). + */ + public Set getHeaderFieldKeys() + { + return this.headerFields.keySet(); + } + + /** + * @return Map of header fields. This is the internal map itself, not a + * copy. + */ + public Map getHeaderFields() { + return this.headerFields; + } + + /** + * @return Returns identifier for ARC. 
+ */ + public String getArc() { + return this.arc; + } + + /** + * Convenience method that does a new File(this.arc). Be aware this.arc is + * not always a full path to an ARC file -- it may be a URL -- so test the + * returned file for existence. + * + * @return File built from the arc identifier. + */ + public File getArcFile() { + return new File(this.arc); + } + + /** + * @return Returns the digest. + */ + public String getDigest() { + return this.digest; + } + + /** + * @param d The digest to set. + */ + public void setDigest(String d) { + this.digest = d; + } + + /** + * @return Returns the statusCode. May be null. + */ + public String getStatusCode() { + return this.statusCode; + } + + /** + * @param statusCode The statusCode to set. + */ + public void setStatusCode(String statusCode) { + this.statusCode = statusCode; + } + + /** + * @return The arc identifier, then ": ", then the header map; a null arc + * or null map contributes an empty string. + */ + public String toString() { + return ((this.arc != null)? this.arc: "") + + ": " + + ((this.headerFields != null)? this.headerFields.toString(): ""); + } + + /** + * @return Same value as getArc(). + */ + public String getReaderIdentifier() { + return this.getArc(); + } + + /** + * @return The record date and url joined by "/". + */ + public String getRecordIdentifier() { + return getDate() + "/" + getUrl(); + } + + /** + * @return The content-begin offset last set via setContentBegin(int); 0 if + * never set. + */ + public int getContentBegin() { + return this.contentBegin; + } + + /** + * @param offset The content-begin offset to store. + */ + protected void setContentBegin(final int offset) { + this.contentBegin = offset; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/arc/ARCUtils.java b/src/main/java/org/archive/io/arc/ARCUtils.java new file mode 100644 index 00000000..985457e2 --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCUtils.java @@ -0,0 +1,240 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io.arc; + +import it.unimi.dsi.fastutil.io.RepositionableStream; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; +import java.net.URISyntaxException; + +import org.archive.url.UsableURI; +import org.archive.util.zip.GzipHeader; +import org.archive.util.zip.NoGzipMagicException; + +/** + * Static helpers for recognising ARC files and streams. + */ +public class ARCUtils implements ARCConstants { + /** + * Returns the last name element of the passed path. When the argument has + * a URI scheme, only the URI's path component is considered. + * + * @param pathOrUri Path or URI to extract arc filename from. + * @return Extracted arc file name. + * @throws URISyntaxException If pathOrUri has a scheme but cannot be + * parsed as a URI. + */ + public static String parseArcFilename(final String pathOrUri) + throws URISyntaxException { + String path = pathOrUri; + if (UsableURI.hasScheme(pathOrUri)) { + URI url = new URI(pathOrUri); + path = url.getPath(); + } + return (new File(path)).getName(); + } + + /** + * Shorthand for testCompressedARCFile(File). + * + * @param arcFile File to test. + * @return True if arcFile is compressed ARC. + * @throws IOException If file does not exist or is not readable. + */ + public static boolean isCompressed(File arcFile) throws IOException { + return testCompressedARCFile(arcFile); + } + + /** + * Check file is compressed and in ARC GZIP format. + * + * @param arcFile File to test if its Internet Archive ARC file + * GZIP compressed. + * + * @return True if this is an Internet Archive GZIP'd ARC file (It begins + * w/ the Internet Archive GZIP header and has the + * COMPRESSED_ARC_FILE_EXTENSION suffix). + * + * @exception IOException If file does not exist or is not readable. 
+ */ + public static boolean testCompressedARCFile(File arcFile) + throws IOException { + return testCompressedARCFile(arcFile, false); + } + + /** + * Check file is compressed and in ARC GZIP format. + * + * @param arcFile File to test if its Internet Archive ARC file + * GZIP compressed. + * @param skipSuffixCheck Set to true if we're not to test on the + * COMPRESSED_ARC_FILE_EXTENSION ('.arc.gz') suffix. + * + * @return True if this is an Internet Archive GZIP'd ARC file (It begins + * w/ the Internet Archive GZIP header). + * + * @exception IOException If file does not exist or is not readable. + */ + public static boolean testCompressedARCFile(File arcFile, + boolean skipSuffixCheck) + throws IOException { + boolean compressedARCFile = false; + isReadable(arcFile); + if(!skipSuffixCheck && !arcFile.getName().toLowerCase() + .endsWith(COMPRESSED_ARC_FILE_EXTENSION)) { + return compressedARCFile; + } + + final InputStream is = new FileInputStream(arcFile); + try { + compressedARCFile = testCompressedARCStream(is); + } finally { + is.close(); + } + return compressedARCFile; + } + + /** + * Tests passed stream is an Internet Archive gzip'd ARC stream by reading + * in the gzip HEAD and comparing its extra field against + * ARC_GZIP_EXTRA_FIELD. + * Does not reposition the stream. That is left up to the caller. + * @param is An InputStream. + * @return True if the gzip extra field matches; false if the stream has + * no gzip magic or the extra field is absent or different. + * @throws IOException + */ + public static boolean testCompressedARCStream(final InputStream is) + throws IOException { + boolean compressedARCFile = false; + GzipHeader gh = null; + try { + gh = new GzipHeader(is); + } catch (NoGzipMagicException e ) { + return compressedARCFile; + } + + byte[] fextra = gh.getFextra(); + // Now make sure following bytes are IA GZIP comment. + // First check length. ARC_GZIP_EXTRA_FIELD includes length + // so subtract two and start compare to ARC_GZIP_EXTRA_FIELD + // at +2. 
+ if (fextra != null && + ARC_GZIP_EXTRA_FIELD.length - 2 == fextra.length) { + compressedARCFile = true; + for (int i = 0; i < fextra.length; i++) { + if (fextra[i] != ARC_GZIP_EXTRA_FIELD[i + 2]) { + compressedARCFile = false; + break; + } + } + } + return compressedARCFile; + } + + /** + * Tests passed stream is gzip stream by reading in the HEAD. + * Does reposition of stream when done: the position read before the test + * is restored afterwards, even if the test throws. + * @param rs An InputStream that is Repositionable. + * @return True if compressed stream. + * @throws IOException + */ + public static boolean testCompressedRepositionalStream( + final RepositionableStream rs) + throws IOException { + boolean compressedARCFile = false; + long p = rs.position(); + try { + compressedARCFile = testCompressedStream((InputStream)rs); + } finally { + rs.position(p); + } + return compressedARCFile; + } + + /** + * Tests passed stream is gzip stream by reading in the HEAD. + * Does not reposition the stream itself; use + * testCompressedRepositionalStream when the position must be restored. + * @param is An InputStream. + * @return True if a gzip header could be read; false if the stream has no + * gzip magic. + * @throws IOException + */ + public static boolean testCompressedStream(final InputStream is) + throws IOException { + boolean compressedARCFile = false; + try { + new GzipHeader(is); + compressedARCFile = true; + } catch (NoGzipMagicException e) { + return compressedARCFile; + } + return compressedARCFile; + } + + /** + * Check file is uncompressed ARC file. + * + * @param arcFile + * File to test if its Internet Archive ARC file uncompressed. + * + * @return True if this is an Internet Archive ARC file. + * + * @exception IOException + * If file does not exist or is not readable. 
+ */ + public static boolean testUncompressedARCFile(File arcFile) + throws IOException { + boolean uncompressedARCFile = false; + isReadable(arcFile); + if(arcFile.getName().toLowerCase().endsWith(ARC_FILE_EXTENSION)) { + FileInputStream fis = new FileInputStream(arcFile); + try { + // Read just enough bytes to compare against the ARC magic. + byte [] b = new byte[ARC_MAGIC_NUMBER.length()]; + int read = fis.read(b, 0, ARC_MAGIC_NUMBER.length()); + if (read == ARC_MAGIC_NUMBER.length()) { + StringBuffer beginStr + = new StringBuffer(ARC_MAGIC_NUMBER.length()); + for (int i = 0; i < ARC_MAGIC_NUMBER.length(); i++) { + beginStr.append((char)b[i]); + } + + if (beginStr.toString(). + equalsIgnoreCase(ARC_MAGIC_NUMBER)) { + uncompressedARCFile = true; + } + } + } finally { + // Single close point, reached on both normal and exceptional + // exit (the stream used to be closed a second time inside the try). + fis.close(); + } + } + + return uncompressedARCFile; + } + + + /** + * Verify the passed file exists and can be read. + * + * @param arcFile File to test. + * @exception IOException If file does not exist or is not readable + * (thrown as FileNotFoundException in both cases). + */ + private static void isReadable(File arcFile) throws IOException { + if (!arcFile.exists()) { + throw new FileNotFoundException(arcFile.getAbsolutePath() + + " does not exist."); + } + + if (!arcFile.canRead()) { + throw new FileNotFoundException(arcFile.getAbsolutePath() + + " is not readable."); + } + } +} diff --git a/src/main/java/org/archive/io/arc/ARCWriter.java b/src/main/java/org/archive/io/arc/ARCWriter.java new file mode 100644 index 00000000..b5825d50 --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCWriter.java @@ -0,0 +1,459 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.arc; + +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.Closeable; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.PrintStream; +import java.io.UnsupportedEncodingException; +import java.util.Iterator; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.archive.io.ReplayInputStream; +import org.archive.io.WriterPoolMember; +import org.archive.io.WriterPoolSettings; +import org.archive.util.ArchiveUtils; +import org.archive.util.DevUtils; +import org.archive.util.MimetypeUtils; + + +/** + * Write ARC files. + * + * Assumption is that the caller is managing access to this ARCWriter ensuring + * only one thread of control accessing this ARC file instance at any one time. + * + *

ARC files are described here: + * Arc + * File Format. This class does version 1 of the ARC file format. It also + * writes version 1.1 which is version 1 with data stuffed into the body of the + * first arc record in the file, the arc file meta record itself. + * + *

An ARC file is three lines of meta data followed by an optional 'body' and + * then a couple of '\n' and then: record, '\n', record, '\n', record, etc. + * If we are writing compressed ARC files, then each of the ARC file records is + * individually gzipped and concatenated together to make up a single ARC file. + * In GZIP terms, each ARC record is a GZIP member of a total gzip'd + * file. + * + *

The GZIPping of the ARC file meta data is exceptional. It is GZIPped + * w/ an extra GZIP header, a special Internet Archive (IA) extra header field + * (e.g. FEXTRA is set in the GZIP header FLG field and an extra field is + * appended to the GZIP header). The extra field has little in it but its + * presence denotes this GZIP as an Internet Archive gzipped ARC. See RFC1952 + * to learn about the GZIP header structure. + * + *

This class then does its GZIPping in the following fashion. Each GZIP + * member is written w/ a new instance of GZIPOutputStream -- actually + * ARCWriterGZIPOututStream so we can get access to the underlying stream. + * The underlying stream stays open across GZIPoutputStream instantiations. + * For the 'special' GZIPing of the ARC file meta data, we cheat by catching the + * GZIPOutputStream output into a byte array, manipulating it adding the + * IA GZIP header, before writing to the stream. + * + *

I tried writing a resettable GZIPOutputStream and could make it work w/ + * the SUN JDK but the IBM JDK threw NPE inside in the deflate.reset -- its zlib + * native call doesn't seem to like the notion of resetting -- so I gave up on + * it. + * + *

Because of issues such as the above and troubles with GZIPInputStream, we should + * write our own GZIP*Streams, ones that are resettable and conscious of gzip + * members. + * + *

This class will write until we hit >= maxSize. The check is done at + * record boundary. Records do not span ARC files. We will then close current + * file and open another and then continue writing. + * + *

TESTING: Here is how to test that produced ARC files are good + * using the + * alexa + * ARC c-tools: + *

+ * % av_procarc hx20040109230030-0.arc.gz | av_ziparc > \
+ *     /tmp/hx20040109230030-0.dat.gz
+ * % av_ripdat /tmp/hx20040109230030-0.dat.gz > /tmp/hx20040109230030-0.cdx
+ * 
+ * Examine the produced cdx file to make sure it makes sense. Search + * for 'no-type 0'. If found, then we're opening a gzip record w/o data to + * write. This is bad. + * + *

You can also do gzip -t FILENAME and it will tell you if the + * ARC makes sense to GZIP. + * + *

While being written, ARCs have a '.open' suffix appended. + * + * @author stack + */ +public class ARCWriter extends WriterPoolMember implements ARCConstants, Closeable { + private static final Logger logger = + Logger.getLogger(ARCWriter.class.getName()); + + /** + * Metadata line pattern: five whitespace-free fields separated by single + * spaces, optionally followed by LINE_SEPARATOR. + */ + private static final Pattern METADATA_LINE_PATTERN = + Pattern.compile("^\\S+ \\S+ \\S+ \\S+ \\S+(" + LINE_SEPARATOR + "?)$"); + + + /** + * Constructor. + * Takes a stream. Use with caution. There is no upperbound check on size. + * Will just keep writing. The first (ARC file metadata) record is written + * straight away, stamped with ArchiveUtils.get14DigitDate(). + * + * @param serialNo used to generate unique file name sequences + * @param out Where to write. + * @param arc File the out is connected to. + * @param settings creation parameters, passed through to the superclass + * constructor. + * @throws IOException if writing the first record fails. + */ + public ARCWriter(final AtomicInteger serialNo, final PrintStream out, + final File arc, final WriterPoolSettings settings) + throws IOException { + super(serialNo, out, arc, settings); + writeFirstRecord(ArchiveUtils.get14DigitDate()); + } + + /** + * Constructor. Writes nothing itself; the first record is written by + * createFile(). + * + * @param serialNo used to generate unique file name sequences + * @param settings all creation parameters + */ + public ARCWriter(final AtomicInteger serialNo, final WriterPoolSettings settings) { + super(serialNo, settings, ARC_FILE_EXTENSION); + + } + + /** + * Creates the file via the superclass, then immediately writes the ARC + * file metadata record stamped with currentTimestamp. + * + * @return Whatever name the superclass createFile() returned. + * @throws IOException if file creation or the first write fails. + */ + protected String createFile() + throws IOException { + String name = super.createFile(); + writeFirstRecord(currentTimestamp); + return name; + } + + /** + * Writes the ARC file metadata record produced by + * generateARCFileMetaData. + * + * @param ts Date to put into the metadata record. + * @throws IOException if generating or writing the record fails. + */ + private void writeFirstRecord(final String ts) + throws IOException { + write(generateARCFileMetaData(ts)); + } + + /** + * Write out the ARCMetaData. + * + *

Generate ARC file meta data. Currently we only do version 1 of the + * ARC file formats or version 1.1 when metadata has been supplied (We + * write it into the body of the first record in the arc file). + * + *

Version 1 metadata looks roughly like this: + * + *

filedesc://testWriteRecord-JunitIAH20040110013326-2.arc 0.0.0.0 \\
+     *  20040110013326 text/plain 77
+     * 1 0 InternetArchive
+     * URL IP-address Archive-date Content-type Archive-length
+     * 
+ * + *

If compress is set, then we generate a header that has been gzipped + * in the Internet Archive manner. Such a gzipping enables the FEXTRA + * flag in the FLG field of the gzip header. It then appends an extra + * header field: '8', '0', 'L', 'X', '0', '0', '0', '0'. The first two + * bytes are the length of the field and the last 6 bytes the Internet + * Archive header. To learn about GZIP format, see RFC1952. To learn + * about the Internet Archive extra header field, read the source for + * av_ziparc which can be found at + * alexa/vista/alexa-tools-1.2/src/av_ziparc.cc. + * + *

We do things in this roundabout manner because the java + * GZIPOutputStream does not give access to GZIP header fields. + * + * @param date Date to put into the ARC metadata; if 17-digit will be + * truncated to traditional 14-digits + * + * @return Byte array filled w/ the arc header. + * @throws IOException + */ + private byte [] generateARCFileMetaData(String date) + throws IOException { + if(date!=null && date.length()>14) { + date = date.substring(0,14); + } + int metadataBodyLength = getMetadataLength(); + // If metadata body, then the minor part of the version is '1' rather + // than '0'. + String metadataHeaderLinesTwoAndThree = + getMetadataHeaderLinesTwoAndThree("1 " + + ((metadataBodyLength > 0)? "1": "0")); + int recordLength = metadataBodyLength + + metadataHeaderLinesTwoAndThree.getBytes(DEFAULT_ENCODING).length; + String metadataHeaderStr = ARC_MAGIC_NUMBER + getBaseFilename() + + " 0.0.0.0 " + date + " text/plain " + recordLength + + metadataHeaderLinesTwoAndThree; + ByteArrayOutputStream metabaos = + new ByteArrayOutputStream(recordLength); + // Write the metadata header. + metabaos.write(metadataHeaderStr.getBytes(DEFAULT_ENCODING)); + // Write the metadata body, if anything to write. + if (metadataBodyLength > 0) { + writeMetaData(metabaos); + } + + // Write out a LINE_SEPARATORs to end this record. + metabaos.write(LINE_SEPARATOR); + + // Now get bytes of all just written and compress if flag set. + byte [] bytes = metabaos.toByteArray(); + + if(isCompressed()) { + // GZIP the header but catch the gzipping into a byte array so we + // can add the special IA GZIP header to the product. After + // manipulations, write to the output stream (The JAVA GZIP + // implementation does not give access to GZIP header. It + // produces a 'default' header only). We can get away w/ these + // maniupulations because the GZIP 'default' header doesn't + // do the 'optional' CRC'ing of the header. 
+ byte [] gzippedMetaData = ArchiveUtils.gzip(bytes); + if (gzippedMetaData[3] != 0) { + throw new IOException("The GZIP FLG header is unexpectedly " + + " non-zero. Need to add smarter code that can deal " + + " when already extant extra GZIP header fields."); + } + // Set the GZIP FLG header to '4' which says that the GZIP header + // has extra fields. Then insert the alex {'L', 'X', '0', '0', '0, + // '0'} 'extra' field. The IA GZIP header will also set byte + // 9 (zero-based), the OS byte, to 3 (Unix). We'll do the same. + gzippedMetaData[3] = 4; + gzippedMetaData[9] = 3; + byte [] assemblyBuffer = new byte[gzippedMetaData.length + + ARC_GZIP_EXTRA_FIELD.length]; + // '10' in the below is a pointer past the following bytes of the + // GZIP header: ID1 ID2 CM FLG + MTIME(4-bytes) XFL OS. See + // RFC1952 for explaination of the abbreviations just used. + System.arraycopy(gzippedMetaData, 0, assemblyBuffer, 0, 10); + System.arraycopy(ARC_GZIP_EXTRA_FIELD, 0, assemblyBuffer, 10, + ARC_GZIP_EXTRA_FIELD.length); + System.arraycopy(gzippedMetaData, 10, assemblyBuffer, + 10 + ARC_GZIP_EXTRA_FIELD.length, gzippedMetaData.length - 10); + bytes = assemblyBuffer; + } + return bytes; + } + + public String getMetadataHeaderLinesTwoAndThree(String version) { + StringBuffer buffer = new StringBuffer(); + buffer.append(LINE_SEPARATOR); + buffer.append(version); + buffer.append(" InternetArchive"); + buffer.append(LINE_SEPARATOR); + buffer.append("URL IP-address Archive-date Content-type Archive-length"); + buffer.append(LINE_SEPARATOR); + return buffer.toString(); + } + + /** + * Write all metadata to passed baos. + * + * @param baos Byte array to write to. 
+ * @throws UnsupportedEncodingException + * @throws IOException + */ + private void writeMetaData(ByteArrayOutputStream baos) + throws UnsupportedEncodingException, IOException { + if (settings.getMetadata() == null) { + return; + } + + for (Iterator i = settings.getMetadata().iterator(); + i.hasNext();) { + Object obj = i.next(); + if (obj instanceof String) { + baos.write(((String)obj).getBytes(DEFAULT_ENCODING)); + } else if (obj instanceof File) { + InputStream is = null; + try { + is = new BufferedInputStream( + new FileInputStream((File)obj)); + byte [] buffer = new byte[4096]; + for (int read = -1; (read = is.read(buffer)) != -1;) { + baos.write(buffer, 0, read); + } + } finally { + if (is != null) { + is.close(); + } + } + } else if (obj != null) { + logger.severe("Unsupported metadata type: " + obj); + } + } + return; + } + + /** + * @return Total length of metadata. + * @throws UnsupportedEncodingException + */ + private int getMetadataLength() + throws UnsupportedEncodingException { + int result = -1; + if (settings.getMetadata() == null) { + result = 0; + } else { + for (Iterator i = settings.getMetadata().iterator(); + i.hasNext();) { + Object obj = i.next(); + if (obj instanceof String) { + result += ((String)obj).getBytes(DEFAULT_ENCODING).length; + } else if (obj instanceof File) { + result += ((File)obj).length(); + } else { + logger.severe("Unsupported metadata type: " + obj); + } + } + } + return result; + } + + /** + * @deprecated use input-stream version directly instead + */ + public void write(String uri, String contentType, String hostIP, + long fetchBeginTimeStamp, long recordLength, + ByteArrayOutputStream baos) + throws IOException { + write(uri, contentType, hostIP, fetchBeginTimeStamp, recordLength, + new ByteArrayInputStream(baos.toByteArray()), false); + } + + public void write(String uri, String contentType, String hostIP, + long fetchBeginTimeStamp, long recordLength, InputStream in) + throws IOException { + 
write(uri,contentType,hostIP,fetchBeginTimeStamp,recordLength,in,true); + } + + /** + * Write a record with the given metadata/content. + * + * @param uri + * URI for metadata-line + * @param contentType + * MIME content-type for metadata-line + * @param hostIP + * IP for metadata-line + * @param fetchBeginTimeStamp + * timestamp for metadata-line + * @param recordLength + * length for metadata-line; also may be enforced + * @param in + * source InputStream for record content + * @param enforceLength + * whether to enforce the declared length; should be true + * unless intentionally writing bad records for testing + * @throws IOException + */ + public void write(String uri, String contentType, String hostIP, + long fetchBeginTimeStamp, long recordLength, InputStream in, + boolean enforceLength) throws IOException { + preWriteRecordTasks(); + try { + write(getMetaLine(uri, contentType, hostIP, fetchBeginTimeStamp, + recordLength).getBytes(UTF8)); + copyFrom(in, recordLength, enforceLength); + if (in instanceof ReplayInputStream) { + // check for consumption of entire recorded material + long remaining = ((ReplayInputStream) in).remaining(); + // Should be zero at this stage. If not, something is + // wrong. + if (remaining != 0) { + String message = "Gap between expected and actual: " + + remaining + LINE_SEPARATOR + DevUtils.extraInfo() + + " writing arc " + + this.getFile().getAbsolutePath(); + DevUtils.warnHandle(new Throwable(message), message); + throw new IOException(message); + } + } + write(LINE_SEPARATOR); + } finally { + postWriteRecordTasks(); + } + } + + /** + * @param uri + * @param contentType + * @param hostIP + * @param fetchBeginTimeStamp + * @param recordLength + * @return Metadata line for an ARCRecord made of passed components. 
+ * @exception IOException + */ + protected String getMetaLine(String uri, String contentType, String hostIP, + long fetchBeginTimeStamp, long recordLength) + throws IOException { + if (fetchBeginTimeStamp <= 0) { + throw new IOException("Bogus fetchBeginTimestamp: " + + Long.toString(fetchBeginTimeStamp)); + } + + return validateMetaLine(createMetaline(uri, hostIP, + ArchiveUtils.get14DigitDate(fetchBeginTimeStamp), + MimetypeUtils.truncate(contentType), + Long.toString(recordLength))); + } + + public String createMetaline(String uri, String hostIP, + String timeStamp, String mimetype, String recordLength) { + return uri + HEADER_FIELD_SEPARATOR + hostIP + + HEADER_FIELD_SEPARATOR + timeStamp + + HEADER_FIELD_SEPARATOR + mimetype + + HEADER_FIELD_SEPARATOR + recordLength + LINE_SEPARATOR; + } + + /** + * Test that the metadata line is valid before writing. + * @param metaLineStr + * @throws IOException + * @return The passed in metaline. + */ + protected String validateMetaLine(String metaLineStr) + throws IOException { + if (metaLineStr.length() > MAX_METADATA_LINE_LENGTH) { + throw new IOException("Metadata line too long (" + + metaLineStr.length() + ">" + MAX_METADATA_LINE_LENGTH + + "): " + metaLineStr); + } + Matcher m = METADATA_LINE_PATTERN.matcher(metaLineStr); + if (!m.matches()) { + throw new IOException("Metadata line doesn't match expected" + + " pattern: " + metaLineStr); + } + return metaLineStr; + } +} diff --git a/src/main/java/org/archive/io/arc/ARCWriterPool.java b/src/main/java/org/archive/io/arc/ARCWriterPool.java new file mode 100644 index 00000000..b55b3ed4 --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCWriterPool.java @@ -0,0 +1,69 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. 
+ * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io.arc; + +import java.util.concurrent.atomic.AtomicInteger; + +import org.archive.io.WriterPool; +import org.archive.io.WriterPoolMember; +import org.archive.io.WriterPoolSettings; + + +/** + * A pool of ARCWriters. + * + * @author stack + */ +public class ARCWriterPool extends WriterPool { + /** + * Constructor + * + * @param settings Settings for this pool. + * @param poolMaximumActive + * @param poolMaximumWait + */ + public ARCWriterPool(final WriterPoolSettings settings, + final int poolMaximumActive, final int poolMaximumWait) { + this(new AtomicInteger(), settings, poolMaximumActive, poolMaximumWait); + } + + /** + * Constructor + * + * @param serial Used to generate unique filename sequences + * @param settings Settings for this pool. 
+ * @param poolMaximumActive + * @param poolMaximumWait + */ + public ARCWriterPool(final AtomicInteger serial, + final WriterPoolSettings settings, + final int poolMaximumActive, final int poolMaximumWait) { + super(serial, settings, poolMaximumActive, poolMaximumWait); + } + + /* (non-Javadoc) + * @see org.archive.io.WriterPool#makeWriter() + */ + protected WriterPoolMember makeWriter() { + return new ARCWriter(serialNo, settings); + } + + + +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/arc/WriterPoolSettingsData.java b/src/main/java/org/archive/io/arc/WriterPoolSettingsData.java new file mode 100644 index 00000000..7396f2d8 --- /dev/null +++ b/src/main/java/org/archive/io/arc/WriterPoolSettingsData.java @@ -0,0 +1,80 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.io.arc; + +import java.io.File; +import java.util.List; + +import org.archive.io.WriterPoolSettings; + +public class WriterPoolSettingsData implements WriterPoolSettings { + protected long maxFileSizeBytes; + protected String prefix; + protected String template; + protected List outputDirs; + protected boolean compress; + protected List metadata; + protected boolean frequentFlushes = true; + protected int writeBufferSize = 16*1024; + + public WriterPoolSettingsData(String prefix, String template, + long maxFileSizeBytes, boolean compress, List outputDirs, + List metadata) { + super(); + this.maxFileSizeBytes = maxFileSizeBytes; + this.prefix = prefix; + this.template = template; + this.outputDirs = outputDirs; + this.compress = compress; + this.metadata = metadata; + } + + @Override + public boolean getCompress() { + return compress; + } + @Override + public long getMaxFileSizeBytes() { + return maxFileSizeBytes; + } + @Override + public List getMetadata() { + return metadata; + } + @Override + public List calcOutputDirs() { + return outputDirs; + } + @Override + public String getPrefix() { + return prefix; + } + @Override + public String getTemplate() { + return template; + } + @Override + public boolean getFrequentFlushes() { + return frequentFlushes; + } + @Override + public int getWriteBufferSize() { + return writeBufferSize; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/package.html b/src/main/java/org/archive/io/package.html new file mode 100644 index 00000000..d1798b80 --- /dev/null +++ b/src/main/java/org/archive/io/package.html @@ -0,0 +1,9 @@ + + + +org.archive.io.arc package + + +ARC file reading and writing. 
+ + diff --git a/src/main/java/org/archive/io/warc/WARCConstants.java b/src/main/java/org/archive/io/warc/WARCConstants.java new file mode 100644 index 00000000..83cc8a6d --- /dev/null +++ b/src/main/java/org/archive/io/warc/WARCConstants.java @@ -0,0 +1,24 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.warc; + +@Deprecated +public interface WARCConstants extends org.archive.format.warc.WARCConstants { +} diff --git a/src/main/java/org/archive/io/warc/WARCReader.java b/src/main/java/org/archive/io/warc/WARCReader.java new file mode 100644 index 00000000..a34854ef --- /dev/null +++ b/src/main/java/org/archive/io/warc/WARCReader.java @@ -0,0 +1,287 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.warc; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.Iterator; +import java.util.List; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.cli.PosixParser; +import org.apache.commons.lang.NotImplementedException; +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveRecord; + +/** + * WARCReader. + * Go via {@link WARCReaderFactory} to get instance. + * @author stack + * @version $Date: 2006-11-27 18:03:03 -0800 (Mon, 27 Nov 2006) $ $Version$ + */ +public class WARCReader extends ArchiveReader implements WARCConstants { + protected WARCReader() { + super(); + } + + @Override + protected void initialize(String i) { + super.initialize(i); + setVersion(WARC_VERSION); + } + + /** + * Skip over any trailing new lines at end of the record so we're lined up + * ready to read the next. + * @param record + * @throws IOException + */ + protected void gotoEOR(ArchiveRecord record) throws IOException { + if (record.available() != 0) { + throw new IOException("Record should be exhausted before coming " + + "in here"); + } + + // Records end in 2*CRLF. Suck it up. 
+ readExpectedChar(getIn(), CRLF.charAt(0)); + readExpectedChar(getIn(), CRLF.charAt(1)); + readExpectedChar(getIn(), CRLF.charAt(0)); + readExpectedChar(getIn(), CRLF.charAt(1)); + } + + protected void readExpectedChar(final InputStream is, final int expected) + throws IOException { + int c = is.read(); + if (c != expected) { + throw new IOException("Unexpected character " + + Integer.toHexString(c) + "(Expecting " + + Integer.toHexString(expected) + ")"); + } + } + + /** + * Create new WARC record. + * Encapsulate housekeeping that has to do w/ creating new Record. + * @param is InputStream to use. + * @param offset Absolute offset into WARC file. + * @return A WARCRecord. + * @throws IOException + */ + protected WARCRecord createArchiveRecord(InputStream is, long offset) + throws IOException { + return (WARCRecord)currentRecord(new WARCRecord(is, + getReaderIdentifier(), offset, isDigest(), isStrict())); + } + + @Override + public void dump(boolean compress) + throws IOException, java.text.ParseException { + for (final Iterator i = iterator(); i.hasNext();) { + ArchiveRecord r = i.next(); + System.out.println(r.getHeader().toString()); + r.dump(); + System.out.println(); + } + } + + + @Override + public ArchiveReader getDeleteFileOnCloseReader(final File f) { + throw new NotImplementedException("TODO"); + } + + @Override + public String getDotFileExtension() { + return DOT_WARC_FILE_EXTENSION; + } + + @Override + public String getFileExtension() { + return WARC_FILE_EXTENSION; + } + + // Static methods follow. Mostly for command-line processing. + + /** + * + * @param formatter Help formatter instance. + * @param options Usage options. + * @param exitCode Exit code. 
+ */ + private static void usage(HelpFormatter formatter, Options options, + int exitCode) { + formatter.printHelp("java org.archive.io.warc.WARCReader" + + " [--digest=true|false] \\\n" + + " [--format=cdx|cdxfile|dump|gzipdump]" + + " [--offset=#] \\\n[--strict] [--parse] WARC_FILE|WARC_URL", + options); + System.exit(exitCode); + } + + /** + * Write out the WARC file's records using the named output format. + * + * @param reader Reader positioned on the WARC to output. + * @param format Format to use outputting. + * @throws IOException If the reader does not support <code>format</code>. + * @throws java.text.ParseException + */ + protected static void output(WARCReader reader, String format) + throws IOException, java.text.ParseException { + if (!reader.output(format)) { + throw new IOException("Unsupported format: " + format); + } + } + + /** + * Generate a CDX index file for a WARC file (non-strict, digesting on). + * + * @param urlOrPath The WARC file to generate a CDX index for + * @throws IOException + * @throws java.text.ParseException + */ + public static void createCDXIndexFile(String urlOrPath) + throws IOException, java.text.ParseException { + WARCReader r = WARCReaderFactory.get(urlOrPath); + r.setStrict(false); + r.setDigest(true); + output(r, CDX_FILE); + } + + /** + * Command-line interface to WARCReader. + * + * Here is the command-line interface: + *

+     * usage: java org.archive.io.warc.WARCReader [--offset=#] WARCFILE
+     *  -h,--help      Prints this message and exits.
+     *  -o,--offset    Outputs record at this offset into WARC file.
+ * + *

Outputs using a pseudo-CDX format as described here: + * CDX + * Legent and here + * Example. + * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'. + * Hash is hard-coded straight SHA-1 hash of content. + * + * @param args Command-line arguments. + * @throws ParseException Failed parse of the command line. + * @throws IOException + * @throws java.text.ParseException + */ + public static void main(String [] args) + throws ParseException, IOException, java.text.ParseException { + Options options = getOptions(); + PosixParser parser = new PosixParser(); + CommandLine cmdline = parser.parse(options, args, false); + @SuppressWarnings("unchecked") + List cmdlineArgs = cmdline.getArgList(); + Option [] cmdlineOptions = cmdline.getOptions(); + HelpFormatter formatter = new HelpFormatter(); + + // If no args, print help. + if (cmdlineArgs.size() <= 0) { + usage(formatter, options, 0); + } + + // Now look at options passed. + long offset = -1; + boolean digest = false; + boolean strict = false; + String format = CDX; + for (int i = 0; i < cmdlineOptions.length; i++) { + switch(cmdlineOptions[i].getId()) { + case 'h': + usage(formatter, options, 0); + break; + + case 'o': + offset = + Long.parseLong(cmdlineOptions[i].getValue()); + break; + + case 's': + strict = true; + break; + + case 'd': + digest = getTrueOrFalse(cmdlineOptions[i].getValue()); + break; + + case 'f': + format = cmdlineOptions[i].getValue().toLowerCase(); + boolean match = false; + // List of supported formats. 
+ final String [] supportedFormats = + {CDX, DUMP, GZIP_DUMP, CDX_FILE}; + for (int ii = 0; ii < supportedFormats.length; ii++) { + if (supportedFormats[ii].equals(format)) { + match = true; + break; + } + } + if (!match) { + usage(formatter, options, 1); + } + break; + + default: + throw new RuntimeException("Unexpected option: " + + + cmdlineOptions[i].getId()); + } + } + + if (offset >= 0) { + if (cmdlineArgs.size() != 1) { + System.out.println("Error: Pass one arcfile only."); + usage(formatter, options, 1); + } + WARCReader r = WARCReaderFactory.get( + new File((String)cmdlineArgs.get(0)), offset); + r.setStrict(strict); + outputRecord(r, format); + } else { + for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) { + String urlOrPath = (String)i.next(); + try { + WARCReader r = WARCReaderFactory.get(urlOrPath); + r.setStrict(strict); + r.setDigest(digest); + output(r, format); + } catch (RuntimeException e) { + // Write out name of file we failed on to help with + // debugging. Then print stack trace and try to keep + // going. We do this for case where we're being fed + // a bunch of ARCs; just note the bad one and move + // on to the next. + System.err.println("Exception processing " + urlOrPath + + ": " + e.getMessage()); + e.printStackTrace(System.err); + System.exit(1); + } + } + } + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/warc/WARCReaderFactory.java b/src/main/java/org/archive/io/warc/WARCReaderFactory.java new file mode 100644 index 00000000..9c6c7e77 --- /dev/null +++ b/src/main/java/org/archive/io/warc/WARCReaderFactory.java @@ -0,0 +1,307 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.warc; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Iterator; + +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveReaderFactory; +import org.archive.io.ArchiveRecord; +import org.archive.io.warc.WARCConstants; +import org.archive.util.ArchiveUtils; +import org.archive.util.FileUtils; +import org.archive.util.zip.GZIPMembersInputStream; + +import com.google.common.io.CountingInputStream; + +/** + * Factory for WARC Readers. + * Figures whether to give out a compressed file Reader or an uncompressed + * Reader. + * @author stack + * @version $Date: 2006-08-23 17:59:04 -0700 (Wed, 23 Aug 2006) $ $Version$ + */ +public class WARCReaderFactory extends ArchiveReaderFactory +implements WARCConstants { + private static final WARCReaderFactory factory = new WARCReaderFactory(); + + /** + * Shutdown any access to default constructor. + * This factory is Singleton. + */ + private WARCReaderFactory() { + super(); + } + + public static WARCReader get(String arcFileOrUrl) + throws MalformedURLException, IOException { + return (WARCReader)WARCReaderFactory.factory. + getArchiveReader(arcFileOrUrl); + } + + public static WARCReader get(final File f) throws IOException { + return (WARCReader)WARCReaderFactory.factory.getArchiveReader(f); + } + + /** + * @param f An arcfile to read. + * @param offset Have returned Reader set to start reading at this offset. 
+ * @return A WARCReader. + * @throws IOException + */ + public static WARCReader get(final File f, final long offset) + throws IOException { + return (WARCReader)WARCReaderFactory.factory. + getArchiveReader(f, offset); + } + + protected ArchiveReader getArchiveReader(final File f, final long offset) + throws IOException { + boolean compressed = testCompressedWARCFile(f); + if (!compressed) { + if (!FileUtils.isReadableWithExtensionAndMagic(f, + DOT_WARC_FILE_EXTENSION, WARC_MAGIC)) { + throw new IOException(f.getAbsolutePath() + + " is not a WARC file."); + } + } + return (WARCReader)(compressed? + WARCReaderFactory.factory.new CompressedWARCReader(f, offset): + WARCReaderFactory.factory.new UncompressedWARCReader(f, offset)); + } + + public static ArchiveReader get(final String s, final InputStream is, + final boolean atFirstRecord) + throws IOException { + return WARCReaderFactory.factory.getArchiveReader(s, is, + atFirstRecord); + } + + protected ArchiveReader getArchiveReader(final String f, + final InputStream is, final boolean atFirstRecord) + throws IOException { + // For now, assume stream is compressed. Later add test of input + // stream or handle exception thrown when figure not compressed stream. + return new CompressedWARCReader(f, is, atFirstRecord); + } + + public static WARCReader get(final URL arcUrl, final long offset) + throws IOException { + return (WARCReader)WARCReaderFactory.factory.getArchiveReader(arcUrl, + offset); + } + + /** + * Get an ARCReader. + * Pulls the ARC local into whereever the System Property + * java.io.tmpdir points. It then hands back an ARCReader that + * points at this local copy. A close on this ARCReader instance will + * remove the local copy. + * @param arcUrl An URL that points at an ARC. + * @return An ARCReader. 
+ * @throws IOException + */ + public static WARCReader get(final URL arcUrl) + throws IOException { + return (WARCReader)WARCReaderFactory.factory.getArchiveReader(arcUrl); + } + + /** + * Check file is compressed WARC. + * + * @param f File to test. + * + * @return True if this is compressed WARC (TODO: Just tests if file is + * GZIP'd file (It begins w/ GZIP MAGIC)). + * + * @exception IOException If file does not exist or is not unreadable. + */ + public static boolean testCompressedWARCFile(final File f) + throws IOException { + FileUtils.assertReadable(f); + boolean compressed = false; + final InputStream is = new FileInputStream(f); + try { + compressed = ArchiveUtils.isGzipped(is); + } finally { + is.close(); + } + return compressed; + } + + /** + * Uncompressed WARC file reader. + * @author stack + */ + public class UncompressedWARCReader extends WARCReader { + /** + * Constructor. + * @param f Uncompressed arcfile to read. + * @throws IOException + */ + public UncompressedWARCReader(final File f) + throws IOException { + this(f, 0); + } + + /** + * Constructor. + * + * @param f Uncompressed file to read. + * @param offset Offset at which to position Reader. + * @throws IOException + */ + public UncompressedWARCReader(final File f, final long offset) + throws IOException { + // File has been tested for existence by time it has come to here. + setIn(new CountingInputStream(getInputStream(f, offset))); + getIn().skip(offset); + initialize(f.getAbsolutePath()); + } + + /** + * Constructor. + * + * @param f Uncompressed file to read. + * @param is InputStream. + */ + public UncompressedWARCReader(final String f, final InputStream is) { + // Arc file has been tested for existence by time it has come + // to here. + setIn(new CountingInputStream(is)); + initialize(f); + } + } + + /** + * Compressed WARC file reader. + * + * @author stack + */ + public class CompressedWARCReader extends WARCReader { + /** + * Constructor. + * + * @param f Compressed file to read. 
+ * @throws IOException + */ + public CompressedWARCReader(final File f) throws IOException { + this(f, 0); + } + + /** + * Constructor. + * + * @param f Compressed arcfile to read. + * @param offset Position at where to start reading file. + * @throws IOException + */ + public CompressedWARCReader(final File f, final long offset) + throws IOException { + // File has been tested for existence by time it has come to here. + setIn(new GZIPMembersInputStream(getInputStream(f, offset))); + ((GZIPMembersInputStream)getIn()).compressedSeek(offset); + setCompressed((offset == 0)); // TODO: does this make sense?!?! + initialize(f.getAbsolutePath()); + } + + /** + * Constructor. + * + * @param f Compressed arcfile. + * @param is InputStream to use. + * @param atFirstRecord + * @throws IOException + */ + public CompressedWARCReader(final String f, final InputStream is, + final boolean atFirstRecord) + throws IOException { + // Arc file has been tested for existence by time it has come + // to here. + setIn(new GZIPMembersInputStream(is)); + setCompressed(true); + initialize(f); + // TODO: Ignore atFirstRecord. Probably doesn't apply in WARC world. + } + + /** + * Get record at passed offset. + * + * @param offset Byte index into file at which a record starts. + * @return A WARCRecord reference. + * @throws IOException + */ + public WARCRecord get(long offset) throws IOException { + cleanupCurrentRecord(); + ((GZIPMembersInputStream)getIn()).compressedSeek(offset); + return (WARCRecord) createArchiveRecord(getIn(), offset); + } + + public Iterator iterator() { + /** + * Override ArchiveRecordIterator so can base returned iterator on + * GzippedInputStream iterator. 
+ */ + return new ArchiveRecordIterator() { + private GZIPMembersInputStream gis = + (GZIPMembersInputStream)getIn(); + + private Iterator gzipIterator = this.gis.memberIterator(); + + protected boolean innerHasNext() { + return this.gzipIterator.hasNext(); + } + + protected ArchiveRecord innerNext() throws IOException { + // Get the position before gzipIterator.next moves + // it on past the gzip header. + InputStream is = (InputStream) this.gzipIterator.next(); + return createArchiveRecord(is, Math.max(gis.getCurrentMemberStart(), gis.getCurrentMemberEnd())); + } + }; + } + + protected void gotoEOR(ArchiveRecord rec) throws IOException { + long skipped = 0; + while (getIn().read()>-1) { + skipped++; + } + if(skipped>4) { + System.err.println("unexpected extra data after record "+rec); + } + return; + } + } + + public static boolean isWARCSuffix(final String f) { + return (f == null)? + false: + (f.toLowerCase().endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION))? + true: + (f.toLowerCase().endsWith(DOT_WARC_FILE_EXTENSION))? + true: false; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/warc/WARCRecord.java b/src/main/java/org/archive/io/warc/WARCRecord.java new file mode 100644 index 00000000..635d1c3b --- /dev/null +++ b/src/main/java/org/archive/io/warc/WARCRecord.java @@ -0,0 +1,233 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.warc; + +import it.unimi.dsi.fastutil.io.RepositionableStream; + +import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpParser; +import org.archive.io.ArchiveRecord; +import org.archive.io.ArchiveRecordHeader; +import org.archive.util.LaxHttpParser; + + +/** + * A WARC file Record. + * + * @author stack + */ +public class WARCRecord extends ArchiveRecord implements WARCConstants { + private Pattern WHITESPACE = Pattern.compile("\\s"); + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the record this instance + * is to represent. + * @throws IOException + */ + public WARCRecord(InputStream in, final String identifier, + final long offset) + throws IOException { + this(in, identifier, offset, true, false); + } + + /** + * Constructor. + * @param in Stream cue'd up just past Header Line and Named Fields. + * @param headers Header Line and ANVL Named fields. + * @throws IOException + */ + public WARCRecord(InputStream in, ArchiveRecordHeader headers) + throws IOException { + super(in, headers, 0, true, false); + } + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the record this instance + * is to represent or, if headers is not null, just past the + * Header Line and Named Fields. + * @param identifier Identifier for this the hosting Reader. + * @param offset Current offset into in (Used to keep + * position properly aligned). Usually 0. + * @param digest True if we're to calculate digest for this record. Not + * digesting saves about ~15% of cpu during parse. + * @param strict Be strict parsing (Parsing stops if file inproperly + * formatted). 
+ * @throws IOException + */ + public WARCRecord(final InputStream in, final String identifier, + final long offset, boolean digest, boolean strict) + throws IOException { + super(in, null, 0, digest, strict); + setHeader(parseHeaders(in, identifier, offset, strict)); + } + + /** + * Parse WARC Header Line and Named Fields. + * @param in Stream to read. + * @param identifier Identifier for the hosting Reader. + * @param offset Absolute offset into Reader. + * @param strict Whether to be loose parsing or not. + * @return An ArchiveRecordHeader. + * @throws IOException + */ + protected ArchiveRecordHeader parseHeaders(final InputStream in, + final String identifier, final long offset, final boolean strict) + throws IOException { + final Map m = new HashMap(); + m.put(ABSOLUTE_OFFSET_KEY, new Long(offset)); + m.put(READER_IDENTIFIER_FIELD_KEY, identifier); + + long startPosition = -1; + if (in instanceof RepositionableStream) { + startPosition = ((RepositionableStream)in).position(); + } + String firstLine = + new String(LaxHttpParser.readLine(in, WARC_HEADER_ENCODING)); + if (firstLine == null || firstLine.length() <=0) { + throw new IOException("Failed to read WARC_MAGIC"); + } + if (!firstLine.startsWith(WARC_MAGIC)) { + throw new IOException("Failed to find WARC MAGIC: " + firstLine); + } + // Here we start reading off the inputstream but we're reading the + // stream direct rather than going via WARCRecord#read. The latter will + // keep count of bytes read, digest and fail properly if EOR too soon... + // We don't want digesting while reading Headers. 
+ // + Header [] h = LaxHttpParser.parseHeaders(in, WARC_HEADER_ENCODING); + for (int i = 0; i < h.length; i++) { + m.put(h[i].getName(), h[i].getValue()); + } + int headerLength = -1; + if (in instanceof RepositionableStream) { + headerLength = + (int)(((RepositionableStream)in).position() - startPosition); + } + final int contentOffset = headerLength; + incrementPosition(contentOffset); + + return new ArchiveRecordHeader() { + private Map headers = m; + private int contentBegin = contentOffset; + + public String getDate() { + return (String)this.headers.get(HEADER_KEY_DATE); + } + + public String getDigest() { + return null; + // TODO: perhaps return block-digest? + // superclass def implies this is calculated ("only after + // read in totality"), not pulled from header, so + // below prior implementation was misleading +// return (String)this.headers.get(HEADER_KEY_CHECKSUM); + } + + public String getReaderIdentifier() { + return (String)this.headers.get(READER_IDENTIFIER_FIELD_KEY); + } + + public Set getHeaderFieldKeys() { + return this.headers.keySet(); + } + + public Map getHeaderFields() { + return this.headers; + } + + public Object getHeaderValue(String key) { + return this.headers.get(key); + } + + // Returns just the Content-Length of the warc record + public long getContentLength() { + Object o = this.headers.get(CONTENT_LENGTH); + if (o == null) { + return -1; + } + long contentLength = (o instanceof Long)? + ((Long)o).longValue(): Long.parseLong((String)o); + return contentLength; + } + + // Returns the full record length + public long getLength() + { + return getContentLength() + contentOffset; + } + + public String getMimetype() { + return (String)this.headers.get(CONTENT_TYPE); + } + + public long getOffset() { + Object o = this.headers.get(ABSOLUTE_OFFSET_KEY); + if (o == null) { + return -1; + } + return (o instanceof Long)? 
+ ((Long)o).longValue(): Long.parseLong((String)o); + } + + public String getRecordIdentifier() { + return (String)this.headers.get(RECORD_IDENTIFIER_FIELD_KEY); + } + + public String getUrl() { + return (String)this.headers.get(HEADER_KEY_URI); + } + + public String getVersion() { + return (String)this.headers.get(VERSION_FIELD_KEY); + } + + public int getContentBegin() { + return this.contentBegin; + } + + @Override + public String toString() { + return this.headers.toString(); + } + }; + } + + @Override + protected String getMimetype4Cdx(ArchiveRecordHeader h) { + final String m = super.getMimetype4Cdx(h); + // Mimetypes can have spaces in WARCs. Emitting for CDX, just + // squash them for now. Later, quote them since squashing spaces won't + // work for params that have quoted-string values. + Matcher matcher = WHITESPACE.matcher(m); + return matcher.replaceAll(""); + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/warc/WARCRecordInfo.java b/src/main/java/org/archive/io/warc/WARCRecordInfo.java new file mode 100644 index 00000000..a6198c44 --- /dev/null +++ b/src/main/java/org/archive/io/warc/WARCRecordInfo.java @@ -0,0 +1,139 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.io.warc; + +import java.io.InputStream; +import java.net.URI; + +import org.archive.format.warc.WARCConstants.WARCRecordType; +import org.archive.util.anvl.ANVLRecord; + +public class WARCRecordInfo { + + protected WARCRecordType type; + protected String url; + protected String create14DigitDate; + protected String mimetype; + protected URI recordId; + protected ANVLRecord extraHeaders; + protected InputStream contentStream; + protected long contentLength; + protected boolean enforceLength; + protected String warcFilename; + protected Long warcFileOffset; + + public void setType(WARCRecordType type) { + this.type = type; + } + + public void setUrl(String url) { + this.url = url; + } + + public String getCreate14DigitDate() { + return create14DigitDate; + } + + public void setCreate14DigitDate(String create14DigitDate) { + this.create14DigitDate = create14DigitDate; + } + + public String getMimetype() { + return mimetype; + } + + public void setMimetype(String mimetype) { + this.mimetype = mimetype; + } + + public URI getRecordId() { + return recordId; + } + + public void setRecordId(URI recordId) { + this.recordId = recordId; + } + + public ANVLRecord getExtraHeaders() { + return extraHeaders; + } + + public void setExtraHeaders(ANVLRecord extraHeaders) { + this.extraHeaders = extraHeaders; + } + + public InputStream getContentStream() { + return contentStream; + } + + public void setContentStream(InputStream contentStream) { + this.contentStream = contentStream; + } + + public long getContentLength() { + return contentLength; + } + + public void setContentLength(long contentLength) { + this.contentLength = contentLength; + } + + public boolean isEnforceLength() { + return enforceLength; + } + + public boolean getEnforceLength() { + return enforceLength; + } + + public void setEnforceLength(boolean enforceLength) { + this.enforceLength = enforceLength; + } + + public WARCRecordType getType() { + return type; + } + + public String getUrl() 
{ + return url; + } + + public void addExtraHeader(String label, String value) { + if (extraHeaders == null) { + extraHeaders = new ANVLRecord(); + } + extraHeaders.addLabelValue(label, value); + } + + public void setWARCFilename(String warcFilenameWithoutOccupiedSuffix) { + this.warcFilename = warcFilenameWithoutOccupiedSuffix; + } + + public String getWARCFilename() { + return warcFilename; + } + + public void setWARCFileOffset(Long startPosition) { + this.warcFileOffset = startPosition; + } + + public Long getWARCFileOffset() { + return warcFileOffset; + } +} diff --git a/src/main/java/org/archive/io/warc/WARCWriter.java b/src/main/java/org/archive/io/warc/WARCWriter.java new file mode 100644 index 00000000..b9558263 --- /dev/null +++ b/src/main/java/org/archive/io/warc/WARCWriter.java @@ -0,0 +1,436 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.io.warc; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.net.URI; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.Map; +import java.util.Map.Entry; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.lang.StringUtils; +import org.archive.io.ArchiveFileConstants; +import org.archive.io.UTF8Bytes; +import org.archive.io.WriterPoolMember; +import org.archive.util.ArchiveUtils; +import org.archive.util.anvl.Element; + + +/** + * WARC implementation. + * + *

The caller is assumed to manage access to this + * WARCWriter, ensuring that only one thread accesses this WARCWriter instance + * at any one time. + * + *

While being written, WARCs have a '.open' suffix appended. + * + * @contributor stack + * @version $Revision: 4604 $ $Date: 2006-09-05 22:38:18 -0700 (Tue, 05 Sep 2006) $ + */ +public class WARCWriter extends WriterPoolMember +implements WARCConstants { + public static final String TOTALS = "totals"; + public static final String SIZE_ON_DISK = "sizeOnDisk"; + public static final String TOTAL_BYTES = "totalBytes"; + public static final String CONTENT_BYTES = "contentBytes"; + public static final String NUM_RECORDS = "numRecords"; + + private static final Logger logger = + Logger.getLogger(WARCWriter.class.getName()); + + /** + * NEWLINE as bytes. + */ + public static byte [] CRLF_BYTES; + static { + try { + CRLF_BYTES = CRLF.getBytes(DEFAULT_ENCODING); + } catch(Exception e) { + e.printStackTrace(); + } + }; + + /** + * Temporarily accumulates stats managed externally by + * {@link WARCWriterProcessor}. WARCWriterProcessor will call + * {@link #resetTmpStats()}, write some records, then add + * {@link #getTmpStats()} into its long-term running totals. + */ + private Map> tmpStats; + + /** Temporarily accumulates info on written warc records for use externally. */ + private LinkedList tmpRecordLog = new LinkedList(); + + /** + * Constructor. + * Takes a stream. Use with caution. There is no upperbound check on size. + * Will just keep writing. Only pass Streams that are bounded. + * @param serialNo used to generate unique file name sequences + * @param out Where to write. + * @param f File the out is connected to. + * @param cmprs Compress the content written. + * @param a14DigitDate If null, we'll write current time. + * @throws IOException + */ + public WARCWriter(final AtomicInteger serialNo, + final OutputStream out, final File f, + final WARCWriterPoolSettings settings) + throws IOException { + super(serialNo, out, f, settings); + } + + /** + * Constructor. + * + * @param dirs Where to drop files. + * @param prefix File prefix to use. 
+ * @param cmprs Compress the records written. + * @param maxSize Maximum size for ARC files written. + * @param suffix File tail to use. If null, unused. + * @param warcinfoData File metadata for warcinfo record. + */ + public WARCWriter(final AtomicInteger serialNo, + final WARCWriterPoolSettings settings) { + super(serialNo, settings, WARC_FILE_EXTENSION); + } + + @Override + protected String createFile(File file) throws IOException { + String filename = super.createFile(file); + writeWarcinfoRecord(filename); + return filename; + } + + protected void baseCharacterCheck(final char c, final String parameter) + throws IllegalArgumentException { + // TODO: Too strict? UNICODE control characters? + if (Character.isISOControl(c) || !Character.isValidCodePoint(c)) { + throw new IllegalArgumentException("Contains illegal character 0x" + + Integer.toHexString(c) + ": " + parameter); + } + } + + protected String checkHeaderValue(final String value) + throws IllegalArgumentException { + for (int i = 0; i < value.length(); i++) { + final char c = value.charAt(i); + baseCharacterCheck(c, value); + if (Character.isWhitespace(c)) { + throw new IllegalArgumentException("Contains disallowed white space 0x" + + Integer.toHexString(c) + ": " + value); + } + } + return value; + } + + protected String checkHeaderLineMimetypeParameter(final String parameter) + throws IllegalArgumentException { + StringBuilder sb = new StringBuilder(parameter.length()); + boolean wasWhitespace = false; + for (int i = 0; i < parameter.length(); i++) { + char c = parameter.charAt(i); + if (Character.isWhitespace(c)) { + // Map all to ' ' and collapse multiples into one. + // TODO: Make sure white space occurs in legal location -- + // before parameter or inside quoted-string. 
+ if (wasWhitespace) { + continue; + } + wasWhitespace = true; + c = ' '; + } else { + wasWhitespace = false; + baseCharacterCheck(c, parameter); + } + sb.append(c); + } + + return sb.toString(); + } + +// protected String createRecordHeader(final String type, +// final String url, final String create14DigitDate, +// final String mimetype, final URI recordId, +// final ANVLRecord xtraHeaders, final long contentLength) + protected String createRecordHeader(WARCRecordInfo metaRecord) + throws IllegalArgumentException { + final StringBuilder sb = + new StringBuilder(2048/*A SWAG: TODO: Do analysis.*/); + sb.append(WARC_ID).append(CRLF); + sb.append(HEADER_KEY_TYPE).append(COLON_SPACE).append(metaRecord.getType()). + append(CRLF); + // Do not write a subject-uri if not one present. + if (!StringUtils.isEmpty(metaRecord.getUrl())) { + sb.append(HEADER_KEY_URI).append(COLON_SPACE). + append(checkHeaderValue(metaRecord.getUrl())).append(CRLF); + } + sb.append(HEADER_KEY_DATE).append(COLON_SPACE). + append(metaRecord.getCreate14DigitDate()).append(CRLF); + if (metaRecord.getExtraHeaders() != null) { + for (final Iterator i = metaRecord.getExtraHeaders().iterator(); i.hasNext();) { + sb.append(i.next()).append(CRLF); + } + } + + sb.append(HEADER_KEY_ID).append(COLON_SPACE).append('<'). + append(metaRecord.getRecordId().toString()).append('>').append(CRLF); + if (metaRecord.getContentLength() > 0) { + sb.append(CONTENT_TYPE).append(COLON_SPACE).append( + checkHeaderLineMimetypeParameter(metaRecord.getMimetype())).append(CRLF); + } + sb.append(CONTENT_LENGTH).append(COLON_SPACE). 
+ append(Long.toString(metaRecord.getContentLength())).append(CRLF); + + return sb.toString(); + } + + public void writeRecord(WARCRecordInfo recordInfo) + throws IOException { + + if (recordInfo.getContentLength() == 0 && + (recordInfo.getExtraHeaders() == null || recordInfo.getExtraHeaders().size() <= 0)) { + throw new IllegalArgumentException("Cannot write record " + + "of content-length zero and base headers only."); + } + + String header; + try { + header = createRecordHeader(recordInfo); + + } catch (IllegalArgumentException e) { + logger.log(Level.SEVERE,"could not write record type: " + recordInfo.getType() + + "for URL: " + recordInfo.getUrl(), e); + return; + } + + long contentBytes = 0; + long totalBytes = 0; + long startPosition; + + try { + startPosition = getPosition(); + preWriteRecordTasks(); + + // TODO: Revisit encoding of header. + byte[] bytes = header.getBytes(WARC_HEADER_ENCODING); + write(bytes); + totalBytes += bytes.length; + + if (recordInfo.getContentStream() != null && recordInfo.getContentLength() > 0) { + // Write out the header/body separator. + write(CRLF_BYTES); // TODO: should this be written even for zero-length? + totalBytes += CRLF_BYTES.length; + contentBytes += copyFrom(recordInfo.getContentStream(), + recordInfo.getContentLength(), + recordInfo.getEnforceLength()); + totalBytes += contentBytes; + } + + // Write out the two blank lines at end of all records. 
+ write(CRLF_BYTES); + write(CRLF_BYTES); + totalBytes += 2 * CRLF_BYTES.length; + + tally(recordInfo.getType(), contentBytes, totalBytes, getPosition() - startPosition); + + recordInfo.setWARCFilename(getFilenameWithoutOccupiedSuffix()); + recordInfo.setWARCFileOffset(startPosition); + tmpRecordLog.add(recordInfo); + } finally { + postWriteRecordTasks(); + } + } + + public String getFilenameWithoutOccupiedSuffix() { + String name = getFile().getName(); + if (name.endsWith(ArchiveFileConstants.OCCUPIED_SUFFIX)) { + name = name.substring(0, name.length() - ArchiveFileConstants.OCCUPIED_SUFFIX.length()); + } + return name; + } + + // if compression is enabled, sizeOnDisk means compressed bytes; if not, it + // should be the same as totalBytes (right?) + protected void tally(WARCRecordType warcRecordType, long contentBytes, long totalBytes, long sizeOnDisk) { + if (tmpStats == null) { + tmpStats = new HashMap>(); + } + + // add to stats for this record type + Map substats = tmpStats.get(warcRecordType.toString()); + if (substats == null) { + substats = new HashMap(); + tmpStats.put(warcRecordType.toString(), substats); + } + subtally(substats, contentBytes, totalBytes, sizeOnDisk); + + // add to totals + substats = tmpStats.get(TOTALS); + if (substats == null) { + substats = new HashMap(); + tmpStats.put(TOTALS, substats); + } + subtally(substats, contentBytes, totalBytes, sizeOnDisk); + } + + protected void subtally(Map substats, long contentBytes, + long totalBytes, long sizeOnDisk) { + + if (substats.get(NUM_RECORDS) == null) { + substats.put(NUM_RECORDS, 1l); + } else { + substats.put(NUM_RECORDS, substats.get(NUM_RECORDS) + 1); + } + + if (substats.get(CONTENT_BYTES) == null) { + substats.put(CONTENT_BYTES, contentBytes); + } else { + substats.put(CONTENT_BYTES, substats.get(CONTENT_BYTES) + contentBytes); + } + + if (substats.get(TOTAL_BYTES) == null) { + substats.put(TOTAL_BYTES, totalBytes); + } else { + substats.put(TOTAL_BYTES, substats.get(TOTAL_BYTES) + 
totalBytes); + } + + if (substats.get(SIZE_ON_DISK) == null) { + substats.put(SIZE_ON_DISK, sizeOnDisk); + } else { + substats.put(SIZE_ON_DISK, substats.get(SIZE_ON_DISK) + sizeOnDisk); + } + } + + protected URI generateRecordId(final Map qualifiers) + throws IOException { + return ((WARCWriterPoolSettings)settings).getRecordIDGenerator().getQualifiedRecordID(qualifiers); + } + + protected URI generateRecordId(final String key, final String value) + throws IOException { + return ((WARCWriterPoolSettings)settings).getRecordIDGenerator().getQualifiedRecordID(key, value); + } + + public URI writeWarcinfoRecord(String filename) + throws IOException { + return writeWarcinfoRecord(filename, null); + } + + public URI writeWarcinfoRecord(String filename, final String description) + throws IOException { + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.warcinfo); + recordInfo.setCreate14DigitDate(ArchiveUtils.getLog14Date()); + recordInfo.setMimetype("application/warc-fields"); + + // Strip .open suffix if present. + if (filename.endsWith(WriterPoolMember.OCCUPIED_SUFFIX)) { + filename = filename.substring(0, + filename.length() - WriterPoolMember.OCCUPIED_SUFFIX.length()); + } + recordInfo.addExtraHeader(HEADER_KEY_FILENAME, filename); + if (description != null && description.length() > 0) { + recordInfo.addExtraHeader(CONTENT_DESCRIPTION, description); + } + + // Add warcinfo body. + byte [] warcinfoBody = null; + if (settings.getMetadata() == null) { + // TODO: What to write into a warcinfo? What to associate? 
+ warcinfoBody = "TODO: Unimplemented".getBytes(); + } else { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + for (final Iterator i = settings.getMetadata().iterator(); + i.hasNext();) { + baos.write(i.next().toString().getBytes(UTF8Bytes.UTF8)); + } + warcinfoBody = baos.toByteArray(); + } + recordInfo.setContentStream(new ByteArrayInputStream(warcinfoBody)); + recordInfo.setContentLength((long) warcinfoBody.length); + recordInfo.setEnforceLength(true); + + recordInfo.setRecordId(generateRecordId(TYPE, WARCRecordType.warcinfo.toString())); + + writeRecord(recordInfo); + + // TODO: If at start of file, and we're writing compressed, + // write out our distinctive GZIP extensions. + return recordInfo.getRecordId(); + } + + /** + * @see WARCWriter#tmpStats for usage model + */ + public void resetTmpStats() { + if (tmpStats != null) { + for (Map substats : tmpStats.values()) { + for (Entry entry : substats.entrySet()) { + entry.setValue(0l); + } + } + } + } + + public Map> getTmpStats() { + return tmpStats; + } + + public static long getStat(Map> map, String key, + String subkey) { + if (map != null && map.get(key) != null + && map.get(key).get(subkey) != null) { + return map.get(key).get(subkey); + } else { + return 0l; + } + } + + public static long getStat( + ConcurrentMap> map, + String key, String subkey) { + if (map != null && map.get(key) != null + && map.get(key).get(subkey) != null) { + return map.get(key).get(subkey).get(); + } else { + return 0l; + } + } + + public void resetTmpRecordLog() { + tmpRecordLog.clear(); + } + + public Iterable getTmpRecordLog() { + return tmpRecordLog; + } +} diff --git a/src/main/java/org/archive/io/warc/WARCWriterPool.java b/src/main/java/org/archive/io/warc/WARCWriterPool.java new file mode 100644 index 00000000..fdc97162 --- /dev/null +++ b/src/main/java/org/archive/io/warc/WARCWriterPool.java @@ -0,0 +1,64 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). 
+ * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io.warc; + +import java.util.concurrent.atomic.AtomicInteger; + +import org.archive.io.WriterPool; +import org.archive.io.WriterPoolMember; + + +/** + * A pool of WARCWriters. + * @contributor stack + * @contributor gojomo + * @version $Revision: 4566 $ $Date: 2006-08-31 09:51:41 -0700 (Thu, 31 Aug 2006) $ + */ +public class WARCWriterPool extends WriterPool { + /** + * Constructor + * @param settings Settings for this pool. + * @param poolMaximumActive + * @param poolMaximumWait + */ + public WARCWriterPool(final WARCWriterPoolSettings settings, + final int poolMaximumActive, final int poolMaximumWait) { + this(new AtomicInteger(), settings, poolMaximumActive, poolMaximumWait); + } + + /** + * Constructor + * @param serial Used to generate unique filename sequences + * @param settings Settings for this pool. 
+ * @param poolMaximumActive + * @param poolMaximumWait + */ + public WARCWriterPool(final AtomicInteger serial, + final WARCWriterPoolSettings settings, + final int poolMaximumActive, final int poolMaximumWait) { + super(serial, settings, poolMaximumActive, poolMaximumWait); + } + + /* (non-Javadoc) + * @see org.archive.io.WriterPool#makeWriter() + */ + protected WriterPoolMember makeWriter() { + return new WARCWriter(serialNo, (WARCWriterPoolSettings)settings); + } +} diff --git a/src/main/java/org/archive/io/warc/WARCWriterPoolSettings.java b/src/main/java/org/archive/io/warc/WARCWriterPoolSettings.java new file mode 100644 index 00000000..b028a8b7 --- /dev/null +++ b/src/main/java/org/archive/io/warc/WARCWriterPoolSettings.java @@ -0,0 +1,32 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io.warc; + +import org.archive.io.WriterPoolSettings; +import org.archive.uid.RecordIDGenerator; + +/** + * Settings object for a {@link WARCWriterPool}. + * Used creating {@link WARCWriter}s. 
+ * + * @version $Date: 2010-08-19 17:21:43 -0700 (Thu, 19 Aug 2010) $, $Revision: 6927 $ + */ +public interface WARCWriterPoolSettings extends WriterPoolSettings { + public RecordIDGenerator getRecordIDGenerator(); +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/warc/WARCWriterPoolSettingsData.java b/src/main/java/org/archive/io/warc/WARCWriterPoolSettingsData.java new file mode 100644 index 00000000..d56c9971 --- /dev/null +++ b/src/main/java/org/archive/io/warc/WARCWriterPoolSettingsData.java @@ -0,0 +1,40 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.io.warc; + +import java.io.File; +import java.util.List; + +import org.archive.io.arc.WriterPoolSettingsData; +import org.archive.uid.RecordIDGenerator; + +public class WARCWriterPoolSettingsData extends WriterPoolSettingsData implements WARCWriterPoolSettings { + RecordIDGenerator generator; + + public WARCWriterPoolSettingsData(String prefix, String template, + long maxFileSizeBytes, boolean compress, List outputDirs, + List metadata, RecordIDGenerator generator) { + super(prefix,template,maxFileSizeBytes,compress,outputDirs,metadata); + this.generator = generator; + } + @Override + public RecordIDGenerator getRecordIDGenerator() { + return generator; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/warc/package.html b/src/main/java/org/archive/io/warc/package.html new file mode 100644 index 00000000..f52aa95b --- /dev/null +++ b/src/main/java/org/archive/io/warc/package.html @@ -0,0 +1,38 @@ + + + +org.archive.io.warc package + + +Experimental WARC Writer and Readers. Code and specification subject to change +with no guarantees of backward compatibility: i.e. newer readers +may not be able to parse WARCs written with older writers. This package +contains prototyping code for revision 0.12 of the WARC specification. +See latest revision +for current state (Version 0.10 code and its documentation has been moved into the +v10 subpackage). + + +

Implementation Notes

+

Tools

+

Initial implementations of Arc2Warc and Warc2Arc +tools can be found in the package above this one, at +{@link org.archive.io.Arc2Warc} and {@link org.archive.io.Warc2Arc} +respectively. Pass --help to learn how to use each tool. +

+ +

TODO

+
    +
  • Is MIME-Version header needed? MIME Parsers seem fine without (python email +lib and java mail).
  • +
  • Should we write out a Content-Transfer-Encoding +header? (Currently we do not.) The spec needs a section that is explicit about our +interpretation of MIME and deviations (e.g. content-transfer-encoding should +be assumed binary in case of WARCs, multipart is not disallowed but not +encouraged, etc.).
  • +
  • Minor: Do WARC-Version: 0.12 like MIME-Version: 1.0 rather than +WARC/0.12 for lead in to an ARCRecord?
  • +
+ + + diff --git a/src/main/java/org/archive/net/DownloadURLConnection.java b/src/main/java/org/archive/net/DownloadURLConnection.java new file mode 100644 index 00000000..fbcee421 --- /dev/null +++ b/src/main/java/org/archive/net/DownloadURLConnection.java @@ -0,0 +1,131 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.net; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.net.URLConnection; +import java.util.Arrays; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.archive.util.ProcessUtils; +import org.archive.util.ProcessUtils.ProcessResult; + +/** + * An URL Connection that pre-downloads URL reference before passing back a + * Stream reference. When closed, it removes the local download file. 
+ * @author stack
+ * @version $Date$, $Revision$
+ */
+public abstract class DownloadURLConnection extends URLConnection {
+    private static final String CLASSNAME = DownloadURLConnection.class.getName();
+    private static final Logger LOGGER = Logger.getLogger(CLASSNAME);
+    private static final File TMPDIR =
+        new File(System.getProperty("java.io.tmpdir", "/tmp"));
+    private File downloadFile = null;
+
+    protected DownloadURLConnection(URL u) {
+        super(u);
+    }
+
+    protected String getScript() {
+        return System.getProperty(this.getClass().getName() + ".path",
+            "UNDEFINED");
+    }
+
+    protected String [] getCommand(final URL thisUrl,
+            final File downloadFile) {
+        return new String[] {getScript(), thisUrl.getPath(),
+            downloadFile.getAbsolutePath()};
+    }
+
+    /**
+     * Run the command from {@link #getCommand(URL, File)} to copy into a temp file.
+     * File is available via {@link #getFile()}; a no-op when already connected.
+     * @throws IOException if creating the temp file or running the command fails
+     */
+    public void connect() throws IOException {
+        if (this.connected) {
+            return;
+        }
+
+        this.downloadFile = File.createTempFile(CLASSNAME, null, TMPDIR);
+        try {
+            String [] cmd = getCommand(this.url, this.downloadFile);
+            if (LOGGER.isLoggable(Level.FINE)) {
+                StringBuilder buffer = new StringBuilder();
+                for (int i = 0; i < cmd.length; i++) {
+                    if (i > 0) {
+                        buffer.append(" ");
+                    }
+                    buffer.append(cmd[i]);
+                }
+                LOGGER.fine("Command: " + buffer.toString());
+            }
+            ProcessResult pr = ProcessUtils.exec(cmd);
+            if (pr.getResult() != 0) {
+                LOGGER.info(Arrays.toString(cmd) + " returned non-zero " + pr.getResult());
+            }
+            // Best effort: a non-zero result is only logged; still mark as connected.
+            this.connected = true;
+        } catch (IOException ioe) {
+            // Clean up my tmp file.
+            this.downloadFile.delete();
+            this.downloadFile = null;
+            // Rethrow.
+            throw ioe;
+        }
+    }
+
+    public File getFile() {
+        return this.downloadFile;
+    }
+
+    protected void setFile(final File f) {
+        this.downloadFile = f;
+    }
+
+    public InputStream getInputStream() throws IOException {
+        if (!this.connected) {
+            connect();
+        }
+
+        // Return BufferedInputStream so 'delegation' is done for me, so
+        // I don't have to implement all IS methods and pass to my
+        // 'delegate' instance. Closing the stream removes the temp file.
+        return new BufferedInputStream(new FileInputStream(this.downloadFile)) {
+            public void close() throws IOException {
+                try {
+                    super.close();
+                } finally { // remove the temp download even when close() throws
+                    final File f = DownloadURLConnection.this.getFile();
+                    if (f != null && f.exists()) {
+                        f.delete();
+                        DownloadURLConnection.this.setFile(null);
+                    }
+                }
+            }
+        };
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/net/FTPException.java b/src/main/java/org/archive/net/FTPException.java
new file mode 100644
index 00000000..2d104390
--- /dev/null
+++ b/src/main/java/org/archive/net/FTPException.java
@@ -0,0 +1,56 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.net;
+
+import java.io.IOException;
+
+/**
+ * Indicates that a FTP operation failed due to a protocol violation.
+ * For instance, if authentication fails. + * + * @author pjack + */ +public class FTPException extends IOException { + private static final long serialVersionUID = 1L; + + /** + * The reply code from the FTP server. + */ + private int code; + + /** + * Constructs a new FTPException. + * + * @param code the error code from the FTP server + */ + public FTPException(int code) { + super("FTP error code: " + code); + this.code = code; + } + + + /** + * Returns the error code from the FTP server. + * + * @return the error code from the FTP server + */ + public int getReplyCode() { + return code; + } +} diff --git a/src/main/java/org/archive/url/PublicSuffixes.java b/src/main/java/org/archive/net/PublicSuffixes.java similarity index 99% rename from src/main/java/org/archive/url/PublicSuffixes.java rename to src/main/java/org/archive/net/PublicSuffixes.java index 7c3df6b8..eab8081a 100644 --- a/src/main/java/org/archive/url/PublicSuffixes.java +++ b/src/main/java/org/archive/net/PublicSuffixes.java @@ -17,7 +17,7 @@ * limitations under the License. */ -package org.archive.url; +package org.archive.net; import java.io.BufferedReader; import java.io.BufferedWriter; diff --git a/src/main/java/org/archive/net/md5/Handler.java b/src/main/java/org/archive/net/md5/Handler.java new file mode 100644 index 00000000..8afcdebb --- /dev/null +++ b/src/main/java/org/archive/net/md5/Handler.java @@ -0,0 +1,87 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.net.md5; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.net.URLConnection; +import java.net.URLStreamHandler; + +/** + * A protocol handler for an 'md5' URI scheme. + * Md5 URLs look like this: md5:deadbeefdeadbeefdeadbeefdeadbeef + * When this handler is invoked against an md5 URL, it passes the raw md5 to + * the configured script as an argument. The configured script then does the + * work to bring the item pointed to by the md5 local so we can open a Stream + * on the local copy. Local file is deleted when we finish. Do + * {@link org.archive.net.DownloadURLConnection#getFile()} to get name of + * temporary file. + * + *

You need to define the system property
+ * -Djava.protocol.handler.pkgs=org.archive.net to add this handler
+ * to the java.net.URL set. Also define system properties
+ * -Dorg.archive.net.md5.Md5URLConnection.path=PATH_TO_SCRIPT to
+ * pass path of script to run as well as
+ * -Dorg.archive.net.md5.Md5URLConnection.options=OPTIONS for
+ * any options you'd like to include. The pointed-to PATH_TO_SCRIPT
+ * will be invoked as follows: PATH_TO_SCRIPT OPTIONS MD5
+ * LOCAL_TMP_FILE. The LOCAL_TMP_FILE file is made in
+ * java.io.tmpdir using java tmp name code.
+ * @author stack
+ */
+public class Handler extends URLStreamHandler {
+    protected URLConnection openConnection(URL u) {
+        return new Md5URLConnection(u);
+    }
+
+    /**
+     * Main dumps the content behind the passed md5 URL to STDOUT.
+     * @param args exactly one element: the md5 URL to fetch
+     * @throws IOException if the URL is malformed or the download cannot be read
+     */
+    public static void main(String[] args)
+    throws IOException {
+        if (args.length != 1) {
+            System.out.println("Usage: java java " +
+                "-Djava.protocol.handler.pkgs=org.archive.net " +
+                "org.archive.net.md5.Handler " +
+                "md5:deadbeefdeadbeefdeadbeefdeadbeef");
+            System.exit(1);
+        }
+        System.setProperty("org.archive.net.md5.Md5URLConnection.path",
+            "/tmp/manifest");
+        System.setProperty("java.protocol.handler.pkgs", "org.archive.net");
+        URL u = new URL(args[0]);
+        URLConnection connect = u.openConnection();
+        // Write download to stdout; every chunk read is written exactly once.
+        final int bufferlength = 4096;
+        byte [] buffer = new byte [bufferlength];
+        InputStream is = connect.getInputStream();
+        try {
+            for (int count = is.read(buffer, 0, bufferlength); count != -1;
+                    count = is.read(buffer, 0, bufferlength)) {
+                System.out.write(buffer, 0, count);
+            }
+            System.out.flush();
+        } finally {
+            is.close();
+        }
+    }
+}
diff --git a/src/main/java/org/archive/net/md5/Md5URLConnection.java b/src/main/java/org/archive/net/md5/Md5URLConnection.java
new file mode 100644
index 00000000..e4fe98e3
--- /dev/null
+++ b/src/main/java/org/archive/net/md5/Md5URLConnection.java
@@ -0,0 +1,34 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.net.md5;
+
+import java.net.URL;
+
+import org.archive.net.DownloadURLConnection;
+
+/**
+ * Md5 URL connection.
+ * @author stack
+ * @version $Date$, $Revision$
+ */
+public class Md5URLConnection extends DownloadURLConnection {
+    protected Md5URLConnection(URL u) {
+        super(u);
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/net/rsync/Handler.java b/src/main/java/org/archive/net/rsync/Handler.java
new file mode 100644
index 00000000..9eb35f5d
--- /dev/null
+++ b/src/main/java/org/archive/net/rsync/Handler.java
@@ -0,0 +1,71 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.net.rsync;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLStreamHandler;
+
+/**
+ * A protocol handler that uses native rsync client to do copy.
+ * You need to define the system property
+ * -Djava.protocol.handler.pkgs=org.archive.net to add this handler
+ * to the java.net.URL set. Assumes rsync is in path. Define
+ * system property
+ * -Dorg.archive.net.rsync.RsyncURLConnection.path=PATH_TO_RSYNC to
+ * pass path to rsync. Downloads to java.io.tmpdir.
+ * @author stack
+ */
+public class Handler extends URLStreamHandler {
+    protected URLConnection openConnection(URL u) {
+        return new RsyncURLConnection(u);
+    }
+
+    /**
+     * Main dumps rsync file to STDOUT.
+     * @param args exactly one element: the rsync URL to fetch
+     * @throws IOException if the URL is malformed or the download cannot be read
+     */
+    public static void main(String[] args)
+    throws IOException {
+        if (args.length != 1) {
+            System.out.println("Usage: java java " +
+                "-Djava.protocol.handler.pkgs=org.archive.net " +
+                "org.archive.net.rsync.Handler RSYNC_URL");
+            System.exit(1);
+        }
+        URL u = new URL(args[0]);
+        URLConnection connect = u.openConnection();
+        // Write download to stdout; every chunk read is written exactly once.
+        final int bufferlength = 4096;
+        byte [] buffer = new byte [bufferlength];
+        InputStream is = connect.getInputStream();
+        try {
+            for (int count = is.read(buffer, 0, bufferlength); count != -1;
+                    count = is.read(buffer, 0, bufferlength)) {
+                System.out.write(buffer, 0, count);
+            }
+            System.out.flush();
+        } finally {
+            is.close();
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/net/rsync/RsyncURLConnection.java b/src/main/java/org/archive/net/rsync/RsyncURLConnection.java
new file mode 100644
index 00000000..c6097e96
--- /dev/null
+++ b/src/main/java/org/archive/net/rsync/RsyncURLConnection.java
@@ -0,0 +1,51 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.net.rsync;
+
+import java.io.File;
+import java.net.URL;
+
+import org.archive.net.DownloadURLConnection;
+
+/**
+ * Rsync URL connection.
+ * @author stack
+ * @version $Date$, $Revision$
+ */
+public class RsyncURLConnection extends DownloadURLConnection {
+    private final String RSYNC_TIMEOUT =
+        System.getProperty(RsyncURLConnection.class.getName() + ".timeout",
+            "300");
+
+    protected RsyncURLConnection(URL u) {
+        super(u);
+    }
+
+    protected String getScript() {
+        return System.getProperty(this.getClass().getName() + ".path",
+            "rsync");
+    }
+
+    @Override
+    protected String[] getCommand(final URL thisUrl,
+            final File downloadFile) {
+        // NOTE(review): getPath() omits the URL's host part -- confirm that is intended.
+        return new String[] {getScript(), "--timeout=" + RSYNC_TIMEOUT,
+            thisUrl.getPath(), downloadFile.getAbsolutePath()};
+    }
+}
diff --git a/src/main/java/org/archive/uid/RecordIDGenerator.java b/src/main/java/org/archive/uid/RecordIDGenerator.java
new file mode 100644
index 00000000..97f1a022
--- /dev/null
+++ b/src/main/java/org/archive/uid/RecordIDGenerator.java
@@ -0,0 +1,72 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.uid;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.Map;
+
+/**
+ * A record-id generator.
+ *
+ * @contributor stack
+ * @contributor gojomo
+ * @version $Revision$ $Date$
+ */
+public interface RecordIDGenerator {
+    /**
+     * @return A URI that can serve as a record-id.
+     * (This method declares no checked exception.)
+     */
+    public URI getRecordID();
+
+    /**
+     * @param qualifiers Qualifiers to add.
+     * @return A URI qualified with passed qualifiers that can
+     * serve as a record-id, or, a new, unique record-id without qualifiers
+     * (if qualifiers not easily implemented using passed URI scheme).
+     */
+    public URI getQualifiedRecordID(final Map qualifiers);
+
+    /**
+     * @param key Name of qualifier
+     * @param value Value of qualifier
+     * @return A URI qualified with passed qualifiers that can
+     * serve as a record-id, or, a new, unique record-id without qualifiers
+     * (if qualifiers not easily implemented using passed URI scheme).
+     */
+    public URI getQualifiedRecordID(final String key, final String value);
+
+    /**
+     * Append (or if already present, update) qualifiers to passed
+     * recordId. Use with caution. Guard against turning up a
+     * result that already exists. Use when writing a group of records inside
+     * a single transaction.
+     *
+     * How qualifiers are appended/updated varies with URI scheme. It's allowed
+     * that an invocation of this method does nought but call
+     * {@link #getRecordID()}, returning a new URI unrelated to the passed
+     * recordId and passed qualifier.
+     * @param recordId URI to append qualifier to.
+     * @param qualifiers Map of qualifier values keyed by qualifier name.
+     * @return New URI based off passed uri and passed qualifier.
+     */
+    public URI qualifyRecordID(final URI recordId,
+            final Map qualifiers);
+}
diff --git a/src/main/java/org/archive/uid/UUIDGenerator.java b/src/main/java/org/archive/uid/UUIDGenerator.java
new file mode 100644
index 00000000..26d29e60
--- /dev/null
+++ b/src/main/java/org/archive/uid/UUIDGenerator.java
@@ -0,0 +1,72 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.uid; + +import java.net.URI; +import java.net.URISyntaxException; +import java.util.Map; +import java.util.UUID; + +/** + * Generates UUIDs, using + * {@link java.util.UUID java.util.UUID}, formatted as URNs from the UUID + * namespace [See RFC4122]. + * Here is an examples of the type of ID it makes: + * urn:uuid:0161811f-5da6-4c6e-9808-a2fab97114cf. Always makes a + * new identifier even when passed qualifiers. 
+ * + * @author stack + * @version $Revision$ $Date$ + * @see RFC4122 + */ +public class UUIDGenerator implements RecordIDGenerator { + private static final String SCHEME = "urn:uuid"; + private static final String SCHEME_COLON = SCHEME + ":"; + + public UUIDGenerator() { + super(); + } + + public URI qualifyRecordID(URI recordId, + final Map qualifiers) { + return getRecordID(); + } + + private String getUUID() { + return UUID.randomUUID().toString(); + } + + public URI getRecordID() { + try { + return new URI(SCHEME_COLON + getUUID()); + } catch (URISyntaxException e) { + // should be impossible + throw new RuntimeException(e); + } + } + + public URI getQualifiedRecordID( + final String key, final String value){ + return getRecordID(); + } + + public URI getQualifiedRecordID(Map qualifiers){ + return getRecordID(); + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/uid/package.html b/src/main/java/org/archive/uid/package.html new file mode 100644 index 00000000..dc49f07b --- /dev/null +++ b/src/main/java/org/archive/uid/package.html @@ -0,0 +1,28 @@ + + + +org.archive.uid package + + +A unique ID generator. +Default is {@link org.archive.uid.UUIDGenerator}. +To use another ID Generator, set the System Property +org.archive.uid.GeneratorFactory.generator to point +at an alternate implementation of {@link org.archive.uid.Generator}. + +

TODO

+
    +
  • MIME boundaries have an upper bound of 70 characters total including + 'blank line' (CRLFCRLF) and two leading hyphens. Add to the + {@link org.archive.uid.RecordIDGenerator} + interface an upper bound on generated ID length.
  • +
  • Add example of an actionable uid generator: +e.g. http://archive.org/UID-SCHEME/ID +where scheme might be UUID and an ID might be +f9472055-fbb6-4810-90e8-68fd39e145a6;type=metadata or, +using ARK: +http://archive.org/ark:/13030/f9472055-fbb6-4810-90e8-68fd39e145a6;type=metadata. +
  • +
+ + diff --git a/src/main/java/org/archive/url/ExtractRule.java b/src/main/java/org/archive/url/ExtractRule.java new file mode 100644 index 00000000..bcfb3b2f --- /dev/null +++ b/src/main/java/org/archive/url/ExtractRule.java @@ -0,0 +1,45 @@ +package org.archive.url; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class ExtractRule +{ + protected String startsWith; + protected String regex; + + protected Pattern regexPattern; + + public String getStartsWith() { + return startsWith; + } + public void setStartsWith(String startsWith) { + this.startsWith = startsWith; + } + public String getRegex() { + return regex; + } + public void setRegex(String regex) { + regexPattern = Pattern.compile(regex); + this.regex = regex; + } + + public Matcher extract(String url) + { + if ((startsWith != null) && !startsWith.isEmpty() && !url.startsWith(startsWith)) { + return null; + } + + if (regexPattern == null) { + return null; + } + + Matcher match = regexPattern.matcher(url); + + if (!match.find()) { + return null; + } + + return match; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/url/RewriteRule.java b/src/main/java/org/archive/url/RewriteRule.java new file mode 100644 index 00000000..47292686 --- /dev/null +++ b/src/main/java/org/archive/url/RewriteRule.java @@ -0,0 +1,55 @@ +package org.archive.url; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class RewriteRule +{ + protected String startsWith; + protected String regex; + protected String replace; + + protected Pattern regexPattern; + + public String getStartsWith() { + return startsWith; + } + public void setStartsWith(String startsWith) { + this.startsWith = startsWith; + } + public String getRegex() { + return regex; + } + public void setRegex(String regex) { + regexPattern = Pattern.compile(regex); + this.regex = regex; + } + public String getReplace() { + return replace; + } + public void setReplace(String replace) { + 
this.replace = replace; + } + + public boolean rewrite(StringBuilder sb) + { + String urlkey = sb.toString(); + + if ((startsWith != null) && !urlkey.startsWith(startsWith)) { + return false; + } + + if (regexPattern == null || replace == null) { + return false; + } + + Matcher match = regexPattern.matcher(urlkey); + + if (match.matches()) { + sb.replace(0, sb.length(), match.replaceAll(replace)); + return true; + } + + return false; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/url/UrlSurtRangeComputer.java b/src/main/java/org/archive/url/UrlSurtRangeComputer.java index 74057117..2b960e16 100644 --- a/src/main/java/org/archive/url/UrlSurtRangeComputer.java +++ b/src/main/java/org/archive/url/UrlSurtRangeComputer.java @@ -112,7 +112,7 @@ public String[] determineRange(String url, MatchType match, String from, String return new String[]{startKey, endKey, host}; } - protected String incLastChar(String input) + public static String incLastChar(String input) { StringBuilder sb = new StringBuilder(input); sb.setCharAt(sb.length() - 1, (char)(sb.charAt(sb.length() - 1) + 1)); diff --git a/src/main/java/org/archive/url/WaybackURLKeyMaker.java b/src/main/java/org/archive/url/WaybackURLKeyMaker.java index 23c67d06..99fb92e9 100644 --- a/src/main/java/org/archive/url/WaybackURLKeyMaker.java +++ b/src/main/java/org/archive/url/WaybackURLKeyMaker.java @@ -2,8 +2,6 @@ import java.net.URISyntaxException; import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; public class WaybackURLKeyMaker implements URLKeyMaker { // URLCanonicalizer canonicalizer = new NonMassagingIAURLCanonicalizer(); @@ -21,34 +19,6 @@ public void setCanonicalizer(URLCanonicalizer canonicalizer) { protected List customRules; - public static class RewriteRule - { - String startsWith; - String regex; - String replace; - Pattern regexPattern; - - public String getStartsWith() { - return startsWith; - } - public void setStartsWith(String startsWith) { 
- this.startsWith = startsWith; - } - public String getRegex() { - return regex; - } - public void setRegex(String regex) { - regexPattern = Pattern.compile(regex); - this.regex = regex; - } - public String getReplace() { - return replace; - } - public void setReplace(String replace) { - this.replace = replace; - } - } - public WaybackURLKeyMaker() { @@ -117,22 +87,12 @@ public void setCustomRules(List customRules) { protected String applyCustomRules(String urlkey) { + StringBuilder sb = new StringBuilder(urlkey); + for (RewriteRule rule : customRules) { - if ((rule.startsWith != null) && !urlkey.startsWith(rule.startsWith)) { - continue; - } - - if (rule.regexPattern == null || rule.replace == null) { - continue; - } - - Matcher match = rule.regexPattern.matcher(urlkey); - - if (match.matches()) { - urlkey = match.replaceAll(rule.replace); - } + rule.rewrite(sb); } - return urlkey; + return sb.toString(); } } diff --git a/src/main/java/org/archive/util/DevUtils.java b/src/main/java/org/archive/util/DevUtils.java new file mode 100644 index 00000000..d630a0b1 --- /dev/null +++ b/src/main/java/org/archive/util/DevUtils.java @@ -0,0 +1,116 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.util; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.logging.Logger; + + +/** + * Write a message and stack trace to the 'org.archive.util.DevUtils' logger. + * + * @author gojomo + * @version $Revision$ $Date$ + */ +public class DevUtils { + public static Logger logger = + Logger.getLogger(DevUtils.class.getName()); + + /** + * Log a warning message to the logger 'org.archive.util.DevUtils' made of + * the passed 'note' and a stack trace based off passed exception. + * + * @param ex Exception we print a stacktrace on. + * @param note Message to print ahead of the stacktrace. + */ + public static void warnHandle(Throwable ex, String note) { + logger.warning(TextUtils.exceptionToString(note, ex)); + } + + /** + * @return Extra information gotten from current Thread. May not + * always be available in which case we return empty string. + */ + public static String extraInfo() { + StringWriter sw = new StringWriter(); + PrintWriter pw = new PrintWriter(sw); + final Thread current = Thread.currentThread(); + if (current instanceof Reporter) { + Reporter tt = (Reporter)current; + try { + tt.reportTo(pw); + } catch (IOException e) { + // Not really possible w/ a StringWriter + e.printStackTrace(); + } + } + if (current instanceof ProgressStatisticsReporter) { + ProgressStatisticsReporter tt = (ProgressStatisticsReporter)current; + try { + tt.progressStatisticsLegend(pw); + tt.progressStatisticsLine(pw); + } catch (IOException e) { + // Not really possible w/ a StringWriter + e.printStackTrace(); + } + } + pw.flush(); + return sw.toString(); + } + + /** + * Nothing to see here, move along. + * @deprecated This method was never used. 
+ */ + @Deprecated + public static void betterPrintStack(RuntimeException re) { + re.printStackTrace(System.err); + } + + /** + * Send this JVM process a SIGQUIT; giving a thread dump and possibly + * a heap histogram (if using -XX:+PrintClassHistogram). + * + * Used to automatically dump info, for example when a serious error + * is encountered. Would use 'jmap'/'jstack', but have seen JVM + * lockups -- perhaps due to lost thread wake signals -- when using + * those against Sun 1.5.0+03 64bit JVM. + */ + public static void sigquitSelf() { + try { + Process p = Runtime.getRuntime().exec( + new String[] {"perl", "-e", "print getppid(). \"\n\";"}); + BufferedReader br = + new BufferedReader(new InputStreamReader(p.getInputStream())); + String ppid = br.readLine(); + Runtime.getRuntime().exec( + new String[] {"sh", "-c", "kill -3 "+ppid}).waitFor(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } +} diff --git a/src/main/java/org/archive/util/FileUtils.java b/src/main/java/org/archive/util/FileUtils.java new file mode 100644 index 00000000..3de276a9 --- /dev/null +++ b/src/main/java/org/archive/util/FileUtils.java @@ -0,0 +1,712 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.util; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileFilter; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.channels.ClosedByInterruptException; +import java.nio.channels.FileChannel; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Properties; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.regex.Pattern; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.io.filefilter.IOFileFilter; +import org.apache.commons.lang.math.LongRange; + + +/** Utility methods for manipulating files and directories. + * + * @contributor John Erik Halse + * @contributor gojomo + */ +public class FileUtils { + private static final Logger LOGGER = + Logger.getLogger(FileUtils.class.getName()); + + /** + * Constructor made private because all methods of this class are static. + */ + private FileUtils() { + super(); + } + + /** + * Copy the src file to the destination. Deletes any preexisting + * file at destination. + * + * @param src + * @param dest + * @return True if the extent was greater than actual bytes copied. + * @throws FileNotFoundException + * @throws IOException + */ + public static boolean copyFile(final File src, final File dest) + throws FileNotFoundException, IOException { + return copyFile(src, dest, -1, true); + } + + /** + * Copy up to extent bytes of the source file to the destination. + * Deletes any preexisting file at destination. + * + * @param src + * @param dest + * @param extent Maximum number of bytes to copy + * @return True if the extent was greater than actual bytes copied. 
+     * @throws FileNotFoundException
+     * @throws IOException
+     */
+    public static boolean copyFile(final File src, final File dest,
+        long extent)
+    throws FileNotFoundException, IOException {
+        return copyFile(src, dest, extent, true);
+    }
+
+    /**
+     * Copy up to extent bytes of the source file to the destination
+     *
+     * @param src File to copy from.
+     * @param dest File to copy to.
+     * @param extent Maximum number of bytes to copy; a negative value means
+     * the current size of the source file.
+     * @param overwrite If target file already exits, and this parameter is
+     * true, overwrite target file (We do this by first deleting the target
+     * file before we begin the copy).
+     * @return True if the whole requested extent was transferred. False if
+     * fewer bytes than the extent could be transferred, if dest already
+     * exists and overwrite is false, or if the stream-copy workaround was
+     * used after a channel failure.
+     * @throws FileNotFoundException
+     * @throws IOException If the copy fails; the message names both files
+     * and the extent, and the original exception is kept as the cause.
+     */
+    public static boolean copyFile(final File src, final File dest,
+        long extent, final boolean overwrite)
+    throws FileNotFoundException, IOException {
+        boolean result = false;
+        if (LOGGER.isLoggable(Level.FINE)) {
+            LOGGER.fine("Copying file " + src + " to " + dest + " extent " +
+                extent + " exists " + dest.exists());
+        }
+        if (dest.exists()) {
+            if (overwrite) {
+                dest.delete();
+                LOGGER.finer(dest.getAbsolutePath() + " removed before copy.");
+            } else {
+                // Already in place and we're not to overwrite. Return.
+                return result;
+            }
+        }
+        FileInputStream fis = null;
+        FileOutputStream fos = null;
+        FileChannel fcin = null;
+        FileChannel fcout = null;
+        try {
+            // Get channels
+            fis = new FileInputStream(src);
+            fos = new FileOutputStream(dest);
+            fcin = fis.getChannel();
+            fcout = fos.getChannel();
+            if (extent < 0) {
+                extent = fcin.size();
+            }
+
+            // Do the file copy. transferTo may move fewer bytes than asked
+            // for, so keep going until the extent is reached or the source
+            // has nothing more to give.
+            long copied = 0;
+            while (copied < extent) {
+                long trans = fcin.transferTo(copied, extent - copied, fcout);
+                if (trans <= 0) {
+                    break;
+                }
+                copied += trans;
+            }
+            // Previously 'result = true' ran unconditionally, discarding the
+            // short-copy check just above it.
+            result = copied >= extent;
+        } catch (IOException e) {
+            // Add more info to the exception. Preserve old stacktrace.
+            // We get 'Invalid argument' on some file copies. See
+            // http://intellij.net/forums/thread.jsp?forum=13&thread=63027&message=853123
+            // for related issue.
+            String message = "Copying " + src.getAbsolutePath() + " to " +
+                dest.getAbsolutePath() + " with extent " + extent +
+                " got IOE: " + e.getMessage();
+            if ((e instanceof ClosedByInterruptException) ||
+                    ((e.getMessage()!=null)
+                            &&e.getMessage().equals("Invalid argument"))) {
+                LOGGER.severe("Failed copy, trying workaround: " + message);
+                // NOTE(review): result stays false on this path, as before;
+                // confirm whether callers expect true after a workaround copy.
+                workaroundCopyFile(src, dest);
+            } else {
+                IOException newE = new IOException(message);
+                newE.initCause(e);
+                throw newE;
+            }
+        } finally {
+            // finish up; a failing close on the read side must not stop the
+            // write side from being closed.
+            IOUtils.closeQuietly(fcin);
+            IOUtils.closeQuietly(fis);
+            try {
+                if (fcout != null) {
+                    fcout.close();
+                }
+            } finally {
+                if (fos != null) {
+                    fos.close();
+                }
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Plain stream copy of the whole of src into dest, used when the
+     * channel-based copy fails. Exceptions raised while closing either
+     * stream are printed and not rethrown.
+     *
+     * @param src File to copy from.
+     * @param dest File to copy to.
+     * @throws IOException If opening, reading or writing fails.
+     */
+    protected static void workaroundCopyFile(final File src,
+            final File dest)
+    throws IOException {
+        FileInputStream from = null;
+        FileOutputStream to = null;
+        try {
+            from = new FileInputStream(src);
+            to = new FileOutputStream(dest);
+            byte[] buffer = new byte[4096];
+            int bytesRead;
+            while ((bytesRead = from.read(buffer)) != -1) {
+                to.write(buffer, 0, bytesRead);
+            }
+        } finally {
+            if (from != null) {
+                try {
+                    from.close();
+                } catch (IOException e) {
+                    e.printStackTrace();
+                }
+            }
+            if (to != null) {
+                try {
+                    to.close();
+                } catch (IOException e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+    }
+
+    /**
+     * Get a list of all files in directory that have passed prefix.
+     *
+     * @param dir Dir to look in.
+     * @param prefix Basename of files to look for. Compare is case insensitive.
+     *
+     * @return List of files in dir that start w/ passed basename.
+     */
+    public static File [] getFilesWithPrefix(File dir, final String prefix) {
+        FileFilter prefixFilter = new FileFilter() {
+            public boolean accept(File pathname)
+            {
+                return pathname.getName().toLowerCase().
+                    startsWith(prefix.toLowerCase());
+            }
+        };
+        return dir.listFiles(prefixFilter);
+    }
+
+    /** Get a @link java.io.FileFilter that filters files based on a regular
+     * expression.
+ * + * @param regex the regular expression the files must match. + * @return the newly created filter. + */ + public static IOFileFilter getRegexFileFilter(String regex) { + // Inner class defining the RegexFileFilter + class RegexFileFilter implements IOFileFilter { + Pattern pattern; + + protected RegexFileFilter(String re) { + pattern = Pattern.compile(re); + } + + public boolean accept(File pathname) { + return pattern.matcher(pathname.getName()).matches(); + } + + public boolean accept(File dir, String name) { + return accept(new File(dir,name)); + } + } + + return new RegexFileFilter(regex); + } + + /** + * Test file exists and is readable. + * @param f File to test. + * @exception FileNotFoundException If file does not exist or is not unreadable. + */ + public static File assertReadable(final File f) throws FileNotFoundException { + if (!f.exists()) { + throw new FileNotFoundException(f.getAbsolutePath() + + " does not exist."); + } + + if (!f.canRead()) { + throw new FileNotFoundException(f.getAbsolutePath() + + " is not readable."); + } + + return f; + } + + /** + * @param f File to test. + * @return True if file is readable, has uncompressed extension, + * and magic string at file start. + * @exception IOException If file not readable or other problem. + */ + public static boolean isReadableWithExtensionAndMagic(final File f, + final String uncompressedExtension, final String magic) + throws IOException { + boolean result = false; + FileUtils.assertReadable(f); + if(f.getName().toLowerCase().endsWith(uncompressedExtension)) { + FileInputStream fis = new FileInputStream(f); + try { + byte [] b = new byte[magic.length()]; + int read = fis.read(b, 0, magic.length()); + fis.close(); + if (read == magic.length()) { + StringBuffer beginStr + = new StringBuffer(magic.length()); + for (int i = 0; i < magic.length(); i++) { + beginStr.append((char)b[i]); + } + + if (beginStr.toString(). 
+ equalsIgnoreCase(magic)) { + result = true; + } + } + } finally { + fis.close(); + } + } + + return result; + } + + /** + * Turn path into a File, relative to context (which may be ignored + * if path is absolute). + * + * @param context File context if path is relative + * @param path String path to make into a File + * @return File created + */ + public static File maybeRelative(File context, String path) { + File f = new File(path); + if(f.isAbsolute()) { + return f; + } + return new File(context, path); + } + + /** + * Load Properties instance from a File + * + * @param file + * @return Properties + * @throws IOException + */ + public static Properties loadProperties(File file) throws IOException { + FileInputStream finp = new FileInputStream(file); + try { + Properties p = new Properties(); + p.load(finp); + return p; + } finally { + ArchiveUtils.closeQuietly(finp); + } + } + + /** + * Store Properties instance to a File + * @param p + * @param file destination File + * @throws IOException + */ + public static void storeProperties(Properties p, File file) throws IOException { + FileOutputStream fos = new FileOutputStream(file); + try { + p.store(fos,""); + } finally { + ArchiveUtils.closeQuietly(fos); + } + } + + // TODO: comment + public static boolean moveAsideIfExists(File file) throws IOException { + if(!file.exists()) { + return true; + } + String newName = + file.getCanonicalPath() + "." + + ArchiveUtils.get14DigitDate(file.lastModified()); + boolean retVal = file.renameTo(new File(newName)); + if(!retVal) { + LOGGER.warning("unable to move aside: "+file+" to "+newName); + } + return retVal; + + } + + /** + * Retrieve a number of lines from the file around the given + * position, as when paging forward or backward through a file. 
+ * + * @param file File to retrieve lines + * @param position offset to anchor lines + * @param signedDesiredLineCount lines requested; if negative, + * want this number of lines ending with a line containing + * the position; if positive, want this number of lines, + * all starting at or after position. + * @param lines List to insert found lines + * @param lineEstimate int estimate of line size, 0 means use default + * of 128 + * @return LongRange indicating the file offsets corresponding to + * the beginning of the first line returned, and the point + * after the end of the last line returned + * @throws IOException + */ + @SuppressWarnings("unchecked") + public static LongRange pagedLines(File file, long position, + int signedDesiredLineCount, List lines, int lineEstimate) + throws IOException { + // consider negative positions as from end of file; -1 = last byte + if (position < 0) { + position = file.length() + position; + } + + // calculate a reasonably sized chunk likely to have all desired lines + if(lineEstimate == 0) { + lineEstimate = 128; + } + int desiredLineCount = Math.abs(signedDesiredLineCount); + long startPosition; + long fileEnd = file.length(); + int bufferSize = (desiredLineCount + 5) * lineEstimate; + if(signedDesiredLineCount>0) { + // reading forward; include previous char in case line-end + startPosition = position - 1; + } else { + // reading backward + startPosition = position - bufferSize + (2 * lineEstimate); + } + if(startPosition<0) { + startPosition = 0; + } + if(startPosition+bufferSize > fileEnd) { + bufferSize = (int)(fileEnd - startPosition); + } + + // read that reasonable chunk + FileInputStream fis = new FileInputStream(file); + fis.getChannel().position(startPosition); + byte[] buf = new byte[bufferSize]; + ArchiveUtils.readFully(fis, buf); + IOUtils.closeQuietly(fis); + + // find all line starts fully in buffer + // (positions after a line-end, per line-end definition in + // BufferedReader.readLine) + LinkedList 
lineStarts = new LinkedList(); + if(startPosition==0) { + lineStarts.add(0); + } + boolean atLineEnd = false; + boolean eatLF = false; + int i; + for(i = 0; i < bufferSize; i++) { + if ((char) buf[i] == '\n' && eatLF) { + eatLF = false; + continue; + } + if(atLineEnd) { + atLineEnd = false; + lineStarts.add(i); + if(signedDesiredLineCount<0 && startPosition+i > position) { + // reached next line past position, read no more + break; + } + } + if ((char) buf[i] == '\r') { + atLineEnd = true; + eatLF = true; + continue; + } + if ((char) buf[i] == '\n') { + atLineEnd = true; + } + } + if(startPosition+i == fileEnd) { + // add phantom lineStart after end + lineStarts.add(bufferSize); + } + int foundFullLines = lineStarts.size()-1; + + // if found no lines + if(foundFullLines<1) { + if(signedDesiredLineCount>0) { + if(startPosition+bufferSize == fileEnd) { + // nothing more to read: return nothing + return new LongRange(fileEnd,fileEnd); + } else { + // retry with larger lineEstimate + return pagedLines(file, position, signedDesiredLineCount, lines, Math.max(bufferSize,lineEstimate)); + } + + } else { + // try again with much larger line estimate + // TODO: fail gracefully before growing to multi-MB buffers + return pagedLines(file, position, signedDesiredLineCount, lines, bufferSize); + } + } + + // trim unneeded lines + while(signedDesiredLineCount>0 && startPosition+lineStarts.getFirst()desiredLineCount+1) { + if (signedDesiredLineCount < 0 && (startPosition+lineStarts.get(1) <= position) ) { + // discard from front until reach line containing target position + lineStarts.removeFirst(); + } else { + lineStarts.removeLast(); + } + } + int firstLine = lineStarts.getFirst(); + int partialLine = lineStarts.getLast(); + LongRange range = new LongRange(startPosition + firstLine, startPosition + partialLine); + List foundLines = + IOUtils.readLines(new ByteArrayInputStream(buf,firstLine,partialLine-firstLine)); + + if(foundFullLines< 0 && startPosition > 0) { + // if needed 
and reading backward, read more lines from earlier + range = expandRange( + range, + pagedLines(file, + range.getMinimumLong()-1, + signedDesiredLineCount+foundFullLines, + lines, + bufferSize/foundFullLines)); + + } + + lines.addAll(foundLines); + + if(signedDesiredLineCount < 0 && range.getMaximumLong() < position) { + // did not get line containining start position + range = expandRange( + range, + pagedLines(file, + partialLine, + 1, + lines, + bufferSize/foundFullLines)); + } + + if(signedDesiredLineCount > 0 && foundFullLines < desiredLineCount && range.getMaximumLong() < fileEnd) { + // need more forward lines + range = expandRange( + range, + pagedLines(file, + range.getMaximumLong(), + desiredLineCount - foundFullLines, + lines, + bufferSize/foundFullLines)); + } + + return range; + } + + public static LongRange expandRange(LongRange range1, LongRange range2) { + return new LongRange(Math.min(range1.getMinimumLong(), range2.getMinimumLong()), + Math.max(range1.getMaximumLong(), range2.getMaximumLong())); + + } + + public static LongRange pagedLines(File file, long position, int signedDesiredLongCount, List lines) throws IOException { + return pagedLines(file, position, signedDesiredLongCount, lines, 0); + } + + /** + * Delete the file now -- but in the event of failure, keep trying + * in the future. + * + * VERY IMPORTANT: Do not use with any file whose name/path may be + * reused, because the lagged delete could then wind up deleting the + * newer file. Essentially, only to be used with uniquely-named temp + * files. + * + * Necessary because some platforms (looking at you, + * JVM-on-Windows) will have deletes fail because of things like + * file-mapped buffers remaining, and there's no explicit way to + * unmap a buffer. (See 6-year-old Sun-stumping Java bug + * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4724038 ) + * We just have to wait and retry. + * + * (Why not just File.deleteOnExit? 
+     * There could be an arbitrary,
+     * unbounded number of files in such a situation, that are only
+     * deletable a few seconds or minutes after our first attempt.
+     * Waiting for JVM exit could mean disk exhaustion. It's also
+     * unclear if the native FS class implementations of deleteOnExit
+     * use RAM per pending file.)
+     *
+     * @param fileToDelete File to delete now, or on a later call if the
+     * delete fails this time.
+     */
+    public static synchronized void deleteSoonerOrLater(File fileToDelete) {
+        pendingDeletes.add(fileToDelete);
+        // if things are getting out of hand, force gc/finalization
+        if(pendingDeletes.size()>50) {
+            LOGGER.warning(">50 pending Files to delete; forcing gc/finalization");
+            System.gc();
+            System.runFinalization();
+        }
+        // try all pendingDeletes
+        Iterator<File> iter = pendingDeletes.listIterator();
+        while(iter.hasNext()) {
+            File pending = iter.next();
+            if(pending.delete()) {
+                iter.remove();
+            }
+        }
+        // if things are still out of hand, complain loudly
+        if(pendingDeletes.size()>50) {
+            LOGGER.severe(">50 pending Files to delete even after gc/finalization");
+        }
+    }
+    /** Files whose delete failed so far; retried on every deleteSoonerOrLater call. */
+    protected static LinkedList<File> pendingDeletes = new LinkedList<File>();
+
+    /**
+     * Read the entire stream to EOF into the passed file.
+     * Closes the stream when done or if an exception occurs.
+     * @param is Stream to read.
+     * @param toFile File to write to.
+     * @return Number of bytes copied.
+     * @throws IOException
+     */
+    public static long readFullyToFile(InputStream is, File toFile)
+            throws IOException {
+        OutputStream os = org.apache.commons.io.FileUtils.openOutputStream(toFile);
+        try {
+            return IOUtils.copyLarge(is, os);
+        } finally {
+            IOUtils.closeQuietly(os);
+            IOUtils.closeQuietly(is);
+        }
+    }
+
+    /**
+     * Ensure writeable directory.
+     *
+     * If doesn't exist, we attempt creation.
+     *
+     * @param dir Directory to test for existence and is writeable.
+     *
+     * @return The passed dir.
+     *
+     * @exception IOException If passed directory does not exist and is not
+     * createable, or directory is not writeable or is not a directory.
+     */
+    public static File ensureWriteableDirectory(String dir)
+    throws IOException {
+        return FileUtils.ensureWriteableDirectory(new File(dir));
+    }
+
+    /**
+     * Ensure writeable directories.
+     *
+     * If doesn't exist, we attempt creation.
+     *
+     * @param dirs List of Files to test.
+     *
+     * @return The passed dirs.
+     *
+     * @exception IOException If passed directory does not exist and is not
+     * createable, or directory is not writeable or is not a directory.
+     */
+    public static List<File> ensureWriteableDirectory(List<File> dirs)
+    throws IOException {
+        for (Iterator<File> i = dirs.iterator(); i.hasNext();) {
+            FileUtils.ensureWriteableDirectory(i.next());
+        }
+        return dirs;
+    }
+
+    /**
+     * Ensure writeable directory.
+     *
+     * If doesn't exist, we attempt creation.
+     *
+     * @param dir Directory to test for exitence and is writeable.
+     *
+     * @return The passed dir.
+     *
+     * @exception IOException If passed directory does not exist and is not
+     * createable, or directory is not writeable or is not a directory.
+     */
+    public static File ensureWriteableDirectory(File dir)
+    throws IOException {
+        if (!dir.exists()) {
+            boolean success = dir.mkdirs();
+            if (!success) {
+                throw new IOException("Failed to create directory: " + dir);
+            }
+        } else {
+            if (!dir.canWrite()) {
+                throw new IOException("Dir " + dir.getAbsolutePath() +
+                    " not writeable.");
+            } else if (!dir.isDirectory()) {
+                throw new IOException("Dir " + dir.getAbsolutePath() +
+                    " is not a directory.");
+            }
+        }
+
+        return dir;
+    }
+
+    /**
+     * @param file File to canonicalize.
+     * @return The canonical form of the passed file, or the passed file
+     * itself if canonicalization throws an IOException.
+     */
+    public static File tryToCanonicalize(File file) {
+        try {
+            return file.getCanonicalFile();
+        } catch (IOException e) {
+            return file;
+        }
+    }
+
+    /**
+     * Append the full content of one file to the end of another.
+     *
+     * @param fileToAppendTo File that receives the bytes (opened in append mode).
+     * @param fileToAppendFrom File whose bytes are read.
+     * @throws IOException If opening, reading or writing fails. Both streams
+     * are closed before the exception propagates.
+     */
+    public static void appendTo(File fileToAppendTo, File fileToAppendFrom) throws IOException {
+        // optimal io block size according to http://lingrok.org/xref/coreutils/src/ioblksize.h
+        byte[] buf = new byte[65536];
+        FileOutputStream out = new FileOutputStream(fileToAppendTo, true);
+        try {
+            FileInputStream in = new FileInputStream(fileToAppendFrom);
+            try {
+                for (int n = in.read(buf); n > 0; n = in.read(buf)) {
+                    out.write(buf, 0, n);
+                }
+            } finally {
+                // A failure closing the source must not hide a copy failure.
+                IOUtils.closeQuietly(in);
+            }
+            out.flush();
+        } finally {
+            out.close();
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/InetAddressUtil.java b/src/main/java/org/archive/util/InetAddressUtil.java
new file mode 100644
index 00000000..585ba772
--- /dev/null
+++ b/src/main/java/org/archive/util/InetAddressUtil.java
@@ -0,0 +1,116 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.util;
+
+import java.net.InetAddress;
+import java.net.NetworkInterface;
+import java.net.SocketException;
+import java.net.UnknownHostException;
+import java.util.ArrayList;
+import java.util.Enumeration;
+import java.util.List;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * InetAddress utility.
+ * @author stack
+ * @version $Date$, $Revision$
+ */
+public class InetAddressUtil {
+    private static final Logger logger =
+        Logger.getLogger(InetAddressUtil.class.getName());
+
+    /**
+     * ipv4 address: four dot-separated groups of one to three digits. The
+     * pattern alone does not limit a group to 255.
+     */
+    public static Pattern IPV4_QUADS = Pattern.compile(
+        "([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})");
+
+    private InetAddressUtil () {
+        super();
+    }
+
+    /**
+     * Returns InetAddress for passed host IF its in
+     * IPV4 quads format (e.g. 128.128.128.128).
+     *
+     * <p>TODO: Move to an AddressParsingUtil class.
+     * @param host Host name to examine.
+     * @return InetAddress IF the passed name was an IP address, else null.
+     * Null is also returned for a null host and for quads with a group
+     * greater than 255.
+     */
+    public static InetAddress getIPHostAddress(String host) {
+        if (host == null) {
+            return null;
+        }
+        Matcher matcher = IPV4_QUADS.matcher(host);
+        if (!matcher.matches()) {
+            return null;
+        }
+        byte[] quad = new byte[4];
+        for (int i = 0; i < quad.length; i++) {
+            // The pattern guarantees one to three digits, so this cannot throw.
+            int octet = Integer.parseInt(matcher.group(i + 1));
+            if (octet > 255) {
+                // A (byte) cast would silently wrap, e.g. 256 -> 0.
+                return null;
+            }
+            quad[i] = (byte) octet;
+        }
+        try {
+            // Doing an Inet.getByAddress() avoids a lookup.
+            return InetAddress.getByAddress(host, quad);
+        } catch (UnknownHostException e) {
+            logger.warning(e.getMessage());
+            return null;
+        }
+    }
+
+    /**
+     * @return All known local names and addresses for this host. The list
+     * always contains at least "localhost" and "localhost.localdomain".
+     */
+    public static List<String> getAllLocalHostNames() {
+        List<String> localNames = new ArrayList<String>();
+        Enumeration<NetworkInterface> e = null;
+        try {
+            e = NetworkInterface.getNetworkInterfaces();
+        } catch(SocketException exception) {
+            throw new RuntimeException(exception);
+        }
+        // getNetworkInterfaces() is documented to be able to return null
+        // when no interface is found.
+        while (e != null && e.hasMoreElements()) {
+            for (Enumeration<InetAddress> ee = e.nextElement().getInetAddresses();
+                    ee.hasMoreElements();) {
+                InetAddress ia = ee.nextElement();
+                if (ia != null) {
+                    if (ia.getHostName() != null) {
+                        localNames.add(ia.getCanonicalHostName());
+                    }
+                    if (ia.getHostAddress() != null) {
+                        localNames.add(ia.getHostAddress());
+                    }
+                }
+            }
+        }
+        final String localhost = "localhost";
+        if (!localNames.contains(localhost)) {
+            localNames.add(localhost);
+        }
+        final String localhostLocaldomain = "localhost.localdomain";
+        if (!localNames.contains(localhostLocaldomain)) {
+            localNames.add(localhostLocaldomain);
+        }
+        return localNames;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/IterableLineIterator.java b/src/main/java/org/archive/util/IterableLineIterator.java
new
file mode 100644 index 00000000..6e0d9dc8 --- /dev/null +++ b/src/main/java/org/archive/util/IterableLineIterator.java @@ -0,0 +1,26 @@ +package org.archive.util; + +import java.io.Reader; +import java.util.Iterator; + +import org.apache.commons.io.LineIterator; + +/** + * A LineIterator that also implements Iterable, so that it can be used with + * the java enhanced for-each loop syntax. + * + * @contributor nlevitt + */ +public class IterableLineIterator extends LineIterator + implements Iterable { + + public IterableLineIterator(final Reader reader) + throws IllegalArgumentException { + super(reader); + } + + @SuppressWarnings("unchecked") + public Iterator iterator() { + return this; + } +} diff --git a/src/main/java/org/archive/util/LaxHttpParser.java b/src/main/java/org/archive/util/LaxHttpParser.java new file mode 100644 index 00000000..c1f768f0 --- /dev/null +++ b/src/main/java/org/archive/util/LaxHttpParser.java @@ -0,0 +1,242 @@ +/* + * $Header: /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/java/org/apache/commons/httpclient/LaxHttpParser.java,v 1.13 2005/01/11 13:57:06 oglueck Exp $ + * $Revision$ + * $Date$ + * + * ==================================================================== + * + * Copyright 1999-2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + * + */ +/* + * + */ + +package org.archive.util; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.util.EncodingUtil; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/** + * A Modified version of HttpParser which doesn't throw exceptions on bad header lines + * + * A utility class for parsing http header values according to + * RFC-2616 Section 4 and 19.3. + * + * @author Michael Becke + * @author Oleg Kalnichevski + * + * @since 2.0beta1 + */ +public class LaxHttpParser { + + /** Log object for this class. */ + private static final Log LOG = LogFactory.getLog(LaxHttpParser.class); + + /** + * Constructor for LaxHttpParser. + */ + protected LaxHttpParser() { } + + /** + * Return byte array from an (unchunked) input stream. + * Stop reading when "\n" terminator encountered + * If the stream ends before the line terminator is found, + * the last part of the string will still be returned. + * If no input data available, null is returned. 
+ * + * @param inputStream the stream to read from + * + * @throws IOException if an I/O problem occurs + * @return a byte array from the stream + */ + public static byte[] readRawLine(InputStream inputStream) throws IOException { + LOG.trace("enter LaxHttpParser.readRawLine()"); + + ByteArrayOutputStream buf = new ByteArrayOutputStream(); + int ch; + while ((ch = inputStream.read()) >= 0) { + buf.write(ch); + if (ch == '\n') { // be tolerant (RFC-2616 Section 19.3) + break; + } + } + if (buf.size() == 0) { + return null; + } + return buf.toByteArray(); + } + + /** + * Read up to "\n" from an (unchunked) input stream. + * If the stream ends before the line terminator is found, + * the last part of the string will still be returned. + * If no input data available, null is returned. + * + * @param inputStream the stream to read from + * @param charset charset of HTTP protocol elements + * + * @throws IOException if an I/O problem occurs + * @return a line from the stream + * + * @since 3.0 + */ + public static String readLine(InputStream inputStream, String charset) throws IOException { + LOG.trace("enter LaxHttpParser.readLine(InputStream, String)"); + byte[] rawdata = readRawLine(inputStream); + if (rawdata == null) { + return null; + } + // strip CR and LF from the end + int len = rawdata.length; + int offset = 0; + if (len > 0) { + if (rawdata[len - 1] == '\n') { + offset++; + if (len > 1) { + if (rawdata[len - 2] == '\r') { + offset++; + } + } + } + } + return EncodingUtil.getString(rawdata, 0, len - offset, charset); + } + + /** + * Read up to "\n" from an (unchunked) input stream. + * If the stream ends before the line terminator is found, + * the last part of the string will still be returned. 
+ * If no input data available, null is returned + * + * @param inputStream the stream to read from + * + * @throws IOException if an I/O problem occurs + * @return a line from the stream + * + * @deprecated use #readLine(InputStream, String) + */ + + public static String readLine(InputStream inputStream) throws IOException { + LOG.trace("enter LaxHttpParser.readLine(InputStream)"); + return readLine(inputStream, "US-ASCII"); + } + + /** + * Parses headers from the given stream. Headers with the same name are not + * combined. + * + * @param is the stream to read headers from + * @param charset the charset to use for reading the data + * + * @return an array of headers in the order in which they were parsed + * + * @throws IOException if an IO error occurs while reading from the stream + * @throws HttpException if there is an error parsing a header value + * + * @since 3.0 + */ + public static Header[] parseHeaders(InputStream is, String charset) throws IOException, HttpException { + LOG.trace("enter HeaderParser.parseHeaders(InputStream, String)"); + + ArrayList

headers = new ArrayList
(); + String name = null; + StringBuffer value = null; + for (; ;) { + String line = LaxHttpParser.readLine(is, charset); + if ((line == null) || (line.trim().length() < 1)) { + break; + } + + // Parse the header name and value + // Check for folded headers first + // Detect LWS-char see HTTP/1.0 or HTTP/1.1 Section 2.2 + // discussion on folded headers + if ((line.charAt(0) == ' ') || (line.charAt(0) == '\t')) { + // we have continuation folded header + // so append value + if (value != null) { + value.append(' '); + value.append(line.trim()); + } + } else { + // make sure we save the previous name,value pair if present + if (name != null) { + headers.add(new Header(name, value.toString())); + } + + // Otherwise we should have normal HTTP header line + // Parse the header name and value + int colon = line.indexOf(":"); + + // START IA/HERITRIX change + // Don't throw an exception if can't parse. We want to keep + // going even though header is bad. Rather, create + // pseudo-header. + if (colon < 0) { + // throw new ProtocolException("Unable to parse header: " + + // line); + name = "HttpClient-Bad-Header-Line-Failed-Parse"; + value = new StringBuffer(line); + + } else { + name = line.substring(0, colon).trim(); + value = new StringBuffer(line.substring(colon + 1).trim()); + } + // END IA/HERITRIX change + } + + } + + // make sure we save the last name,value pair if present + if (name != null) { + headers.add(new Header(name, value.toString())); + } + + return (Header[]) headers.toArray(new Header[headers.size()]); + } + + /** + * Parses headers from the given stream. Headers with the same name are not + * combined. 
+ * + * @param is the stream to read headers from + * + * @return an array of headers in the order in which they were parsed + * + * @throws IOException if an IO error occurs while reading from the stream + * @throws HttpException if there is an error parsing a header value + * + * @deprecated use #parseHeaders(InputStream, String) + */ + public static Header[] parseHeaders(InputStream is) throws IOException, HttpException { + LOG.trace("enter HeaderParser.parseHeaders(InputStream, String)"); + return parseHeaders(is, "US-ASCII"); + } +} diff --git a/src/main/java/org/archive/util/MimetypeUtils.java b/src/main/java/org/archive/util/MimetypeUtils.java new file mode 100644 index 00000000..adfa1a0f --- /dev/null +++ b/src/main/java/org/archive/util/MimetypeUtils.java @@ -0,0 +1,75 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.util; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Class of mimetype utilities. + * @author stack + */ +public class MimetypeUtils { + /** + * The 'no-type' content-type. + * + * Defined in the ARC file spec at + * http://www.archive.org/web/researcher/ArcFileFormat.php. + */ + public static final String NO_TYPE_MIMETYPE = "no-type"; + + /** + * Truncation regex. 
+ */ + protected final static Pattern TRUNCATION_REGEX = Pattern.compile("^([^\\s;,]+).*"); + + + /** + * Truncate passed mimetype. + * + * Ensure no spaces. Strip encoding. Truncation required by + * ARC files. + * + *

Truncate at delimiters [;, ]. + * Truncate multi-part content type header at ';'. + * Apache httpclient collapses values of multiple instances of the + * header into one comma-separated value,therefore truncated at ','. + * Current ia_tools that work with arc files expect 5-column + * space-separated meta-lines, therefore truncate at ' '. + * + * @param contentType Raw content-type. + * + * @return Computed content-type made from passed content-type after + * running it through a set of rules. + */ + public static String truncate(String contentType) { + if (contentType == null) { + contentType = NO_TYPE_MIMETYPE; + } else { + Matcher matcher = TRUNCATION_REGEX.matcher(contentType); + if (matcher.matches()) { + contentType = matcher.group(1); + } else { + contentType = NO_TYPE_MIMETYPE; + } + } + + return contentType; + } +} diff --git a/src/main/java/org/archive/util/ProcessUtils.java b/src/main/java/org/archive/util/ProcessUtils.java new file mode 100644 index 00000000..af792981 --- /dev/null +++ b/src/main/java/org/archive/util/ProcessUtils.java @@ -0,0 +1,151 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.util; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.Arrays; +import java.util.logging.Level; +import java.util.logging.Logger; + +/** + * Class to run an external process. + * @author stack + * @version $Date$ $Revision$ + */ +public class ProcessUtils { + private static final Logger LOGGER = + Logger.getLogger(ProcessUtils.class.getName()); + + protected ProcessUtils() { + super(); + } + + /** + * Thread to gobble up an output stream. + * See http://www.javaworld.com/javaworld/jw-12-2000/jw-1229-traps.html + */ + protected class StreamGobbler extends Thread { + private final InputStream is; + private final StringBuffer sink = new StringBuffer(); + + protected StreamGobbler(InputStream is, String name) { + this.is = is; + setName(name); + } + + public void run() { + try { + BufferedReader br = + new BufferedReader(new InputStreamReader(this.is)); + for (String line = null; (line = br.readLine()) != null;) { + this.sink.append(line); + } + } catch (IOException ioe) { + ioe.printStackTrace(); + } + } + + public String getSink() { + return this.sink.toString(); + } + } + + /** + * Data structure to hold result of a process exec. 
+ * @author stack + * @version $Date$ $Revision$ + */ + public class ProcessResult { + private final String [] args; + private final int result; + private final String stdout; + private final String stderr; + + protected ProcessResult(String [] args, int result, String stdout, + String stderr) { + this.args = args; + this.result = result; + this.stderr = stderr; + this.stdout = stdout; + } + + public int getResult() { + return this.result; + } + + public String getStdout() { + return this.stdout; + } + + public String getStderr() { + return this.stderr; + } + + public String toString() { + StringBuffer sb = new StringBuffer(); + for (int i = 0; i < this.args.length; i++) { + sb.append(this.args[i]); + sb.append(", "); + } + return sb.toString() + " exit code: " + this.result + + ((this.stderr != null && this.stderr.length() > 0)? + "\nSTDERR: " + this.stderr: "") + + ((this.stdout != null && this.stdout.length() > 0)? + "\nSTDOUT: " + this.stdout: ""); + } + } + + /** + * Runs process. + * @param args List of process args. + * @return A ProcessResult data structure. + * @throws IOException If interrupted, we throw an IOException. If non-zero + * exit code, we throw an IOException (This may need to change). + */ + public static ProcessUtils.ProcessResult exec(String [] args) + throws IOException { + Process p = Runtime.getRuntime().exec(args); + ProcessUtils pu = new ProcessUtils(); + // Gobble up any output. 
+        StreamGobbler err = pu.new StreamGobbler(p.getErrorStream(), "stderr");
+        err.setDaemon(true);
+        err.start();
+        StreamGobbler out = pu.new StreamGobbler(p.getInputStream(), "stdout");
+        out.setDaemon(true);
+        out.start();
+        int exitVal;
+        try {
+            exitVal = p.waitFor();
+            // Process exit does not imply the gobbler threads have finished
+            // reading; wait for both so the sinks sampled below are complete.
+            err.join();
+            out.join();
+        } catch (InterruptedException e) {
+            // Keep the interrupt visible to callers and preserve the cause.
+            Thread.currentThread().interrupt();
+            throw new IOException("Wait on process " + Arrays.toString(args) + " interrupted: "
+                + e.getMessage(), e);
+        }
+        ProcessUtils.ProcessResult result =
+            pu.new ProcessResult(args, exitVal, out.getSink(), err.getSink());
+        if (exitVal != 0) {
+            // Non-zero exit is reported as an exception carrying args,
+            // exit code and any captured stderr/stdout.
+            throw new IOException(result.toString());
+        } else if (LOGGER.isLoggable(Level.INFO)) {
+            LOGGER.info(result.toString());
+        }
+        return result;
+    }
+}
diff --git a/src/main/java/org/archive/util/ProgressStatisticsReporter.java b/src/main/java/org/archive/util/ProgressStatisticsReporter.java
new file mode 100644
index 00000000..dc1e51f7
--- /dev/null
+++ b/src/main/java/org/archive/util/ProgressStatisticsReporter.java
@@ -0,0 +1,36 @@
+/*
+ *  This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ *  Licensed to the Internet Archive (IA) by one or more individual
+ *  contributors.
+ *
+ *  The IA licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.archive.util;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+
+public interface ProgressStatisticsReporter {
+    /**
+     * @param writer Where to write statistics.
+ * @throws IOException + */ + public void progressStatisticsLine(PrintWriter writer) throws IOException; + + /** + * @param writer Where to write statistics legend. + * @throws IOException + */ + public void progressStatisticsLegend(PrintWriter writer) throws IOException; +} diff --git a/src/main/java/org/archive/util/PropertyUtils.java b/src/main/java/org/archive/util/PropertyUtils.java new file mode 100644 index 00000000..083615f6 --- /dev/null +++ b/src/main/java/org/archive/util/PropertyUtils.java @@ -0,0 +1,114 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.util; + +import java.util.Properties; +import java.util.regex.Matcher; + +import org.apache.commons.lang.StringUtils; + +/** + * Utilities for dealing with Java Properties (incl. System Properties) + * + * @contributor stack + * @contributor gojomo + * @version $Date$ $Revision$ + */ +public class PropertyUtils { + /*** + * @param key Property key. + * @return Named property or null if the property is null or empty. + */ + public static String getPropertyOrNull(final String key) { + String value = System.getProperty(key); + return (value == null || value.length() <= 0)? null: value; + } + + /*** + * @param key Property key. + * @return Boolean value or false if null or unreadable. 
+ */ + public static boolean getBooleanProperty(final String key) { + return (getPropertyOrNull(key) == null)? + false: Boolean.valueOf(getPropertyOrNull(key)).booleanValue(); + } + + /** + * @param key Key to use looking up system property. + * @param fallback If no value found for passed key, return + * fallback. + * @return Value of property or fallback. + */ + public static int getIntProperty(final String key, final int fallback) { + return getPropertyOrNull(key) == null? + fallback: Integer.parseInt(getPropertyOrNull(key)); + } + + /** + * Given a string which may contain expressions of the form + * ${key}, replace each expression with the value corresponding to the + * given key in System Properties. If no value is present, + * the expression is replaced with the empty-string. + * + * @param original String + * @param properties Properties to try in order; first value found (if any) is used + * @return modified String + */ + public static String interpolateWithProperties(String original) { + return interpolateWithProperties(original,System.getProperties()); + } + + protected static String propRefPattern = "\\$\\{([^{}]+)\\}"; + + /** + * Given a string which may contain expressions of the form + * ${key}, replace each expression with the value corresponding to the + * given key in the supplied Properties instance. If no value is present, + * the expression is replaced with the empty-string. + * + * @param original String + * @param props Properties to try in order; first value found (if any) is used + * @return modified String + */ + public static String interpolateWithProperties(String original, + Properties... 
props) { + String result = original; + // cap number of interpolations as guard against unending loop + inter: for(int i =0; i < original.length()*2; i++) { + Matcher m = TextUtils.getMatcher(propRefPattern, result); + while(m.find()) { + String key = m.group(1); + String value = ""; + for(Properties properties : props) { + value = properties.getProperty(key, ""); + if(StringUtils.isNotEmpty(value)) { + break; + } + } + result = result.substring(0,m.start()) + + value + + result.substring(m.end()); + continue inter; + } + // we only hit here if there were no interpolations last while loop + break; + } + return result; + } +} diff --git a/src/main/java/org/archive/util/Recorder.java b/src/main/java/org/archive/util/Recorder.java new file mode 100644 index 00000000..425344bb --- /dev/null +++ b/src/main/java/org/archive/util/Recorder.java @@ -0,0 +1,593 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.util; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.nio.charset.Charset; +import java.util.HashSet; +import java.util.Set; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.zip.DeflaterInputStream; +import java.util.zip.GZIPInputStream; + +import org.apache.commons.httpclient.ChunkedInputStream; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.StringUtils; +import org.archive.io.GenericReplayCharSequence; +import org.archive.io.RecordingInputStream; +import org.archive.io.RecordingOutputStream; +import org.archive.io.ReplayCharSequence; +import org.archive.io.ReplayInputStream; + +import com.google.common.base.Charsets; + + +/** + * Pairs together a RecordingInputStream and RecordingOutputStream + * to capture exactly a single HTTP transaction. + * + * Initially only supports HTTP/1.0 (one request, one response per stream) + * + * Call {@link #markContentBegin()} to demarc the transition between HTTP + * header and body. + * + * @author gojomo + */ +public class Recorder { + protected static Logger logger = + Logger.getLogger("org.archive.util.HttpRecorder"); + + private static final int DEFAULT_OUTPUT_BUFFER_SIZE = 16384; + private static final int DEFAULT_INPUT_BUFFER_SIZE = 524288; + + private RecordingInputStream ris = null; + private RecordingOutputStream ros = null; + + /** + * Backing file basename. + * + * Keep it around so can clean up backing files left on disk. + */ + private String backingFileBasename = null; + + /** + * Backing file output stream suffix. + */ + private static final String RECORDING_OUTPUT_STREAM_SUFFIX = ".ros"; + + /** + * Backing file input stream suffix. 
+ */ + private static final String RECORDING_INPUT_STREAM_SUFFIX = ".ris"; + + /** + * recording-input (ris) content character encoding. + */ + protected String characterEncoding = null; + + /** + * Charset to use for CharSequence provision. Will be UTF-8 if no + * encoding ever requested; a Charset matching above characterEncoding + * if possible; ISO_8859 if above characterEncoding is unsatisfiable. + * TODO: unify to UTF-8 for unspecified and bad-specified cases? + * (current behavior is for consistency with our prior but perhaps not + * optimal behavior) + */ + protected Charset charset = Charsets.UTF_8; + + /** whether recording-input (ris) message-body is chunked */ + protected boolean inputIsChunked = false; + + /** recording-input (ris) entity content-encoding (eg gzip, deflate), if any */ + protected String contentEncoding = null; + + private ReplayCharSequence replayCharSequence; + + + /** + * Create an HttpRecorder. + * + * @param tempDir Directory into which we drop backing files for + * recorded input and output. + * @param backingFilenameBase Backing filename base to which we'll append + * suffices ris for recorded input stream and + * ros for recorded output stream. + * @param outBufferSize Size of output buffer to use. + * @param inBufferSize Size of input buffer to use. 
+     */
+    public Recorder(File tempDir, String backingFilenameBase,
+            int outBufferSize, int inBufferSize) {
+        this(new File(ensure(tempDir), backingFilenameBase),
+                outBufferSize, inBufferSize);
+    }
+
+
+    /**
+     * Runs {@code FileUtils.ensureWriteableDirectory} on the passed
+     * directory and hands it back, so the check can sit inside a
+     * {@code this(...)} call.
+     *
+     * @param tempDir directory to check
+     * @return the same directory
+     * @throws IllegalStateException wrapping any IOException from the check
+     */
+    private static File ensure(File tempDir) {
+        try {
+            org.archive.util.FileUtils.ensureWriteableDirectory(tempDir);
+        } catch (IOException e) {
+            throw new IllegalStateException(e);
+        }
+
+        return tempDir;
+    }
+
+    /**
+     * Create an HttpRecorder.
+     *
+     * @param file Backing file base; its absolute path gets the
+     * <code>.ris</code> and <code>.ros</code> suffixes appended.
+     * @param outBufferSize Size of output buffer to use.
+     * @param inBufferSize Size of input buffer to use.
+     */
+    public Recorder(File file, int outBufferSize, int inBufferSize) {
+        super();
+        this.backingFileBasename = file.getAbsolutePath();
+        this.ris = new RecordingInputStream(inBufferSize,
+            this.backingFileBasename + RECORDING_INPUT_STREAM_SUFFIX);
+        this.ros = new RecordingOutputStream(outBufferSize,
+            this.backingFileBasename + RECORDING_OUTPUT_STREAM_SUFFIX);
+    }
+
+    /**
+     * Create an HttpRecorder using the default buffer sizes.
+     *
+     * @param tempDir
+     *            Directory into which we drop backing files for recorded input
+     *            and output.
+     * @param backingFilenameBase
+     *            Backing filename base to which we'll append suffices
+     *            <code>ris</code> for recorded input stream and
+     *            <code>ros</code> for recorded output stream.
+     */
+    public Recorder(File tempDir, String backingFilenameBase) {
+        // Argument order of the delegate is (out, in); the defaults were
+        // previously passed the other way round.
+        this(tempDir, backingFilenameBase, DEFAULT_OUTPUT_BUFFER_SIZE,
+                DEFAULT_INPUT_BUFFER_SIZE);
+    }
+
+
+    /**
+     * Wrap the provided stream with the internal RecordingInputStream
+     *
+     * open() throws an exception if RecordingInputStream is already open.
+     *
+     * @param is InputStream to wrap.
+     *
+     * @return The input stream wrapper which itself is an input stream.
+     * Pass this in place of the passed stream so input can be recorded.
+ * + * @throws IOException + */ + public InputStream inputWrap(InputStream is) + throws IOException { + logger.fine(Thread.currentThread().getName() + " wrapping input"); + + // discard any state from previously-recorded input + this.characterEncoding = null; + this.inputIsChunked = false; + this.contentEncoding = null; + + this.ris.open(is); + return this.ris; + } + + /** + * Wrap the provided stream with the internal RecordingOutputStream + * + * open() throws an exception if RecordingOutputStream is already open. + * + * @param os The output stream to wrap. + * + * @return The output stream wrapper which is itself an output stream. + * Pass this in place of the passed stream so output can be recorded. + * + * @throws IOException + */ + public OutputStream outputWrap(OutputStream os) + throws IOException { + this.ros.open(os); + return this.ros; + } + + /** + * Close all streams. + */ + public void close() { + logger.fine(Thread.currentThread().getName() + " closing"); + try { + this.ris.close(); + } catch (IOException e) { + // TODO: Can we not let the exception out of here and report it + // higher up in the caller? + DevUtils.logger.log(Level.SEVERE, "close() ris" + + DevUtils.extraInfo(), e); + } + try { + this.ros.close(); + } catch (IOException e) { + DevUtils.logger.log(Level.SEVERE, "close() ros" + + DevUtils.extraInfo(), e); + } + } + + /** + * Return the internal RecordingInputStream + * + * @return A RIS. + */ + public RecordingInputStream getRecordedInput() { + return this.ris; + } + + /** + * @return The RecordingOutputStream. + */ + public RecordingOutputStream getRecordedOutput() { + return this.ros; + } + + /** + * Mark current position as the point where the HTTP headers end. + */ + public void markContentBegin() { + this.ris.markContentBegin(); + } + + public long getResponseContentLength() { + return this.ris.getResponseContentLength(); + } + + /** + * Close both input and output recorders. 
+     *
+     * Recorders are the output streams to which we are recording.
+     * {@link #close()} closes the stream that is being recorded and the
+     * recorder.  This method explicitly closes the recorder only.
+     */
+    public void closeRecorders() {
+        try {
+            this.ris.closeRecorder();
+            this.ros.closeRecorder();
+        } catch (IOException e) {
+            DevUtils.warnHandle(e, "Convert to runtime exception?");
+        }
+    }
+
+    /**
+     * Cleanup backing files.
+     *
+     * Call when completely done w/ recorder.  Removes any backing files that
+     * may have been dropped.
+     */
+    public void cleanup() {
+        this.close();
+        this.delete(this.backingFileBasename + RECORDING_OUTPUT_STREAM_SUFFIX);
+        this.delete(this.backingFileBasename + RECORDING_INPUT_STREAM_SUFFIX);
+    }
+
+    /**
+     * Delete file if exists.
+     *
+     * @param name Filename to delete.
+     */
+    private void delete(String name) {
+        File f = new File(name);
+        if (f.exists()) {
+            f.delete();
+        }
+    }
+
+
+    /** Per-thread Recorder, set and read by the two static accessors below. */
+    protected static ThreadLocal<Recorder> currentRecorder = new ThreadLocal<Recorder>();
+
+    /**
+     * @param httpRecorder Recorder to associate with the current thread.
+     */
+    public static void setHttpRecorder(Recorder httpRecorder) {
+        currentRecorder.set(httpRecorder);
+    }
+
+    /**
+     * Get the current threads' HttpRecorder.
+     *
+     * @return This threads' HttpRecorder.  Returns null if no HttpRecorder
+     * has been set for the current thread.
+     */
+    public static Recorder getHttpRecorder() {
+        return currentRecorder.get();
+    }
+
+    /**
+     * @param cs Charset to use when decoding the input recording.
+     */
+    public void setCharset(Charset cs) {
+        this.charset = cs;
+    }
+
+    /**
+     * @return effective Charset of input recording
+     */
+    public Charset getCharset() {
+        return this.charset;
+    }
+
+    /**
+     * @param chunked whether the message-body of the input recording is chunked.
+     */
+    public void setInputIsChunked(boolean chunked) {
+        this.inputIsChunked = chunked;
+    }
+
+    /** Lower-case content-encoding names accepted by setContentEncoding. */
+    protected static Set<String> SUPPORTED_ENCODINGS = new HashSet<String>();
+    static {
+        SUPPORTED_ENCODINGS.add("gzip");
+        SUPPORTED_ENCODINGS.add("x-gzip");
+        SUPPORTED_ENCODINGS.add("deflate");
+        SUPPORTED_ENCODINGS.add("identity");
+        SUPPORTED_ENCODINGS.add("none"); // unofficial but common
+    }
+    /**
+     * @param contentEncoding declared content-encoding of input recording;
+     * matched case-insensitively and stored lower-cased.
+     * @throws IllegalArgumentException if the encoding is not one of
+     * {@link #SUPPORTED_ENCODINGS}
+     */
+    public void setContentEncoding(String contentEncoding) {
+        // Locale.ROOT: the default locale may not map 'I' to 'i' (Turkish),
+        // which would reject "IDENTITY".
+        String lowerCoding = contentEncoding.toLowerCase(java.util.Locale.ROOT);
+        if(!SUPPORTED_ENCODINGS.contains(lowerCoding)) {
+            throw new IllegalArgumentException("contentEncoding unsupported: "+contentEncoding);
+        }
+        this.contentEncoding = lowerCoding;
+    }
+
+    /**
+     * @return Returns the content-encoding (lower-cased), or null if none set.
+     */
+    public String getContentEncoding() {
+        return this.contentEncoding;
+    }
+
+
+    /**
+     * @return same as {@link #getContentReplayCharSequence()}
+     * @throws IOException
+     * @deprecated use getContentReplayCharSequence
+     */
+    public ReplayCharSequence getReplayCharSequence() throws IOException {
+        return getContentReplayCharSequence();
+    }
+
+    /**
+     * @return A ReplayCharSequence. Caller may call
+     *         {@link ReplayCharSequence#close()} when finished. However, in
+     *         heritrix, the ReplayCharSequence is closed automatically when url
+     *         processing has finished; in that context it's preferable not
+     *         to close, so that processors can reuse the same instance.
+     * @throws IOException
+     * @see #endReplays()
+     */
+    public ReplayCharSequence getContentReplayCharSequence() throws IOException {
+        if (replayCharSequence == null || !replayCharSequence.isOpen()
+                || !replayCharSequence.getCharset().equals(charset)) {
+            if(replayCharSequence!=null && replayCharSequence.isOpen()) {
+                // existing sequence must not have matched now-configured Charset; close
+                replayCharSequence.close();
+            }
+            replayCharSequence = getContentReplayCharSequence(this.charset);
+        }
+        return replayCharSequence;
+    }
+
+
+    /**
+     * Build a new ReplayCharSequence over the content, decoded with the
+     * passed Charset. The result is not cached in this Recorder.
+     *
+     * @param requestedCharset Charset to decode the recorded content with.
+     * @return A ReplayCharSequence. Call close on returned RCS when done.
+     * @throws IOException
+     */
+    public ReplayCharSequence getContentReplayCharSequence(Charset requestedCharset) throws IOException {
+        // raw data overflows to disk; use temp file
+        InputStream ris = getContentReplayInputStream();
+        try {
+            return new GenericReplayCharSequence(
+                    ris,
+                    calcRecommendedCharBufferSize(this.getRecordedInput()),
+                    this.backingFileBasename + RECORDING_OUTPUT_STREAM_SUFFIX,
+                    requestedCharset);
+        } finally {
+            // release the replay stream on failure as well as on success
+            ris.close();
+        }
+    }
+
+    /**
+     * Calculate a recommended size for an in-memory decoded-character buffer
+     * of this content. We seek a size that is itself no larger (in 2-byte chars)
+     * than the memory already used by the RecordingInputStream's internal raw
+     * byte buffer, and also no larger than likely necessary. So, we take the
+     * minimum of the actual recorded byte size and the RecordingInputStream's
+     * max buffer size.
+ * + * @param inStream + * @return int length for in-memory decoded-character buffer + */ + static protected int calcRecommendedCharBufferSize(RecordingInputStream inStream) { + return (int) Math.min(inStream.getRecordedBufferLength()/2, inStream.getSize()); + } + + /** + * Get a raw replay of all recorded data (including, for example, HTTP + * protocol headers) + * + * @return A replay input stream. + * @throws IOException + */ + public ReplayInputStream getReplayInputStream() throws IOException { + return getRecordedInput().getReplayInputStream(); + } + + /** + * Get a raw replay of the 'message-body'. For the common case of + * HTTP, this is the raw, possibly chunked-transfer-encoded message + * contents not including the leading headers. + * + * @return A replay input stream. + * @throws IOException + */ + public ReplayInputStream getMessageBodyReplayInputStream() throws IOException { + return getRecordedInput().getMessageBodyReplayInputStream(); + } + + /** + * Get a raw replay of the 'entity'. For the common case of + * HTTP, this is the message-body after any (usually-unnecessary) + * transfer-decoding but before any content-encoding (eg gzip) decoding + * + * @return A replay input stream. + * @throws IOException + */ + public InputStream getEntityReplayInputStream() throws IOException { + if(inputIsChunked) { + return new ChunkedInputStream(getRecordedInput().getMessageBodyReplayInputStream()); + } else { + return getRecordedInput().getMessageBodyReplayInputStream(); + } + } + + /** + * Get a replay cued up for the 'content' (after all leading headers) + * + * @return A replay input stream. 
+     * @throws IOException
+     */
+    public InputStream getContentReplayInputStream() throws IOException {
+        InputStream entityStream = getEntityReplayInputStream();
+        if(StringUtils.isEmpty(contentEncoding)) {
+            return entityStream;
+        } else if ("gzip".equalsIgnoreCase(contentEncoding) || "x-gzip".equalsIgnoreCase(contentEncoding)) {
+            try {
+                return new GZIPInputStream(entityStream);
+            } catch (IOException ioe) {
+                logger.log(Level.WARNING,"gzip problem; using raw entity instead",ioe);
+                IOUtils.closeQuietly(entityStream); // close partially-read stream
+                return getEntityReplayInputStream();
+            }
+        } else if ("deflate".equalsIgnoreCase(contentEncoding)) {
+            // Decoding needs an inflater; DeflaterInputStream would compress
+            // the already-compressed entity a second time.
+            return new java.util.zip.InflaterInputStream(entityStream);
+        } else if ("identity".equalsIgnoreCase(contentEncoding) || "none".equalsIgnoreCase(contentEncoding)) {
+            return entityStream;
+        } else {
+            // shouldn't be reached given check on setContentEncoding
+            logger.log(Level.INFO,"Unknown content-encoding '"+contentEncoding+"' declared; using raw entity instead");
+            return entityStream;
+        }
+    }
+
+    /**
+     * Return a short prefix of the presumed-textual content as a String,
+     * decoded with this Recorder's current charset.
+     *
+     * @param size max length of String to return
+     * @return String prefix, or empty String (with logged exception) on any error
+     */
+    public String getContentReplayPrefixString(int size) {
+        return getContentReplayPrefixString(size, this.charset);
+    }
+
+    /**
+     * Return a short prefix of the presumed-textual content as a String.
+     *
+     * @param size max length of String to return
+     * @param cs Charset to decode the content with
+     * @return String prefix, or empty String (with logged exception) on any error
+     */
+    public String getContentReplayPrefixString(int size, Charset cs) {
+        InputStreamReader isr = null;
+        try {
+            isr = new InputStreamReader(getContentReplayInputStream(), cs);
+            char[] chars = new char[size];
+            // single read: may yield fewer than 'size' chars
+            int count = isr.read(chars);
+            if (count > 0) {
+                return new String(chars,0,count);
+            } else {
+                return "";
+            }
+        } catch (IOException e) {
+            logger.log(Level.SEVERE,"unable to get replay prefix string", e);
+            return "";
+        } finally {
+            // also reached when read() throws, so the replay stream is released
+            IOUtils.closeQuietly(isr);
+        }
+    }
+
+    /**
+     * Copy the replayed content (see getContentReplayInputStream) to a file.
+     *
+     * @param tempFile file to write the content to
+     * @throws IOException
+     */
+    public void copyContentBodyTo(File tempFile) throws IOException {
+        InputStream inStream = null;
+        OutputStream outStream = null;
+        try {
+            inStream = getContentReplayInputStream();
+            outStream = FileUtils.openOutputStream(tempFile);
+            IOUtils.copy(inStream, outStream);
+        } finally {
+            IOUtils.closeQuietly(inStream);
+            IOUtils.closeQuietly(outStream);
+        }
+    }
+
+    /**
+     * Record the input stream for later playback by an extractor, etc.
+     * This is convenience method used to setup an artificial HttpRecorder
+     * scenario used in unit tests, etc.
+     * @param dir Directory to write backing file to.
+     * @param basename of what we're recording.
+     * @param in Stream to read.
+     * @param encoding Stream encoding.
+     * @throws IOException
+     * @return An {@link org.archive.util.Recorder}.
+     */
+    public static Recorder wrapInputStreamWithHttpRecord(File dir,
+        String basename, InputStream in, String encoding)
+    throws IOException {
+        Recorder rec = new Recorder(dir, basename);
+        if (encoding != null && encoding.length() > 0) {
+            rec.setCharset(Charset.forName(encoding));
+        }
+        // Do not use FastBufferedInputStream here.  It does not
+        // support mark.
+        InputStream is = rec.inputWrap(new BufferedInputStream(in));
+        final int BUFFER_SIZE = 1024 * 4;
+        byte [] buffer = new byte[BUFFER_SIZE];
+        while(true) {
+            // Just read it all down.
+ int x = is.read(buffer); + if (x == -1) { + break; + } + } + is.close(); + return rec; + } + + public void endReplays() { + ArchiveUtils.closeQuietly(replayCharSequence); + replayCharSequence = null; + + // like closeQuietly + try { + ris.clearForReuse(); + } catch (IOException ioe) { + } + + // like closeQuietly + try { + ros.clearForReuse(); + } catch (IOException e) { + } + } +} diff --git a/src/main/java/org/archive/util/Reporter.java b/src/main/java/org/archive/util/Reporter.java new file mode 100644 index 00000000..2fcb8cd8 --- /dev/null +++ b/src/main/java/org/archive/util/Reporter.java @@ -0,0 +1,56 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.util; + +import java.io.IOException; +import java.io.PrintWriter; +import java.util.Map; + +public interface Reporter { + /** + * Make a default report to the passed-in Writer. 
Should + * be equivalent to reportTo(null, writer) + * + * @param writer to receive report + */ + public void reportTo(PrintWriter writer) throws IOException; + + /** + * Write a short single-line summary report + * + * @param writer to receive report + */ + @Deprecated + public void shortReportLineTo(PrintWriter pw) throws IOException; + + + /** + * @return Same data that's in the single line report, as key-value pairs + */ + public Map shortReportMap(); + + + /** + * Return a legend for the single-line summary report as a String. + * + * @return String single-line summary legend + */ + public String shortReportLegend(); +} diff --git a/src/main/java/org/archive/util/anvl/ANVLRecord.java b/src/main/java/org/archive/util/anvl/ANVLRecord.java new file mode 100644 index 00000000..de2d3101 --- /dev/null +++ b/src/main/java/org/archive/util/anvl/ANVLRecord.java @@ -0,0 +1,336 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.archive.util.anvl;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.archive.io.UTF8Bytes;
+
+/**
+ * An ordered {@link List} with 'data' {@link Element} values.
+ * ANVLRecords end with a blank line.
+ *
+ * @see A Name-Value
+ * Language (ANVL)
+ * @author stack
+ */
+public class ANVLRecord extends LinkedList<Element> implements UTF8Bytes {
+    private static final Logger logger =
+        Logger.getLogger(ANVLRecord.class.getName());
+
+    public static final String MIMETYPE = "application/warc-fields";
+
+    public static final ANVLRecord EMPTY_ANVL_RECORD = new ANVLRecord();
+
+    /**
+     * Arbitrary upper bound on maximum size of ANVL Record.
+     * Will throw an IOException if exceed this size.
+     */
+    public static final long MAXIMUM_SIZE = 1024 * 10;
+
+    /**
+     * An ANVL 'newline'.
+     * @see http://en.wikipedia.org/wiki/CRLF
+     */
+    protected static final String CRLF = "\r\n";
+
+    protected static final String FOLD_PREFIX = CRLF + ' ';
+
+    public ANVLRecord() {
+        super();
+    }
+
+    /**
+     * @param c Elements to copy into the new record, in iteration order.
+     */
+    public ANVLRecord(Collection<? extends Element> c) {
+        super(c);
+    }
+
+    /**
+     * @param initialCapacity ignored
+     * @deprecated the capacity argument has no effect; use the no-arg
+     * constructor.
+     */
+    public ANVLRecord(int initialCapacity) {
+        super();
+    }
+
+    /**
+     * Append a label-only Element.
+     * @param l Label text.
+     * @return result of the underlying list add.
+     */
+    public boolean addLabel(final String l) {
+        return super.add(new Element(new Label(l)));
+    }
+
+    /**
+     * Append a label/value Element.
+     * @param l Label text.
+     * @param v Value text.
+     * @return result of the underlying list add, or false (with a logged
+     * warning) if building the Element threw IllegalArgumentException.
+     */
+    public boolean addLabelValue(final String l, final String v) {
+        try {
+            return super.add(new Element(new Label(l), new Value(v)));
+        } catch (IllegalArgumentException e) {
+            logger.log(Level.WARNING, "bad label " + l + " or value " + v, e);
+            return false;
+        }
+    }
+
+    @Override
+    public String toString() {
+        // TODO: What to emit for empty ANVLRecord?
+ StringBuilder sb = new StringBuilder(); + for (final Iterator i = iterator(); i.hasNext();) { + sb.append(i.next()); + sb.append(CRLF); + } + // 'ANVL Records end in a blank line'. + sb.append(CRLF); + return sb.toString(); + } + + public Map asMap() { + Map m = new HashMap(size()); + for (final Iterator i = iterator(); i.hasNext();) { + Element e = i.next(); + m.put(e.getLabel().toString(), + e.isValue()? e.getValue().toString(): (String)null); + } + return m; + } + + @Override + public ANVLRecord clone() { + return (ANVLRecord) super.clone(); + } + + /** + * @return This ANVLRecord as UTF8 bytes. + */ + public byte [] getUTF8Bytes() + throws UnsupportedEncodingException { + return toString().getBytes(UTF8); + } + + /** + * Parses a single ANVLRecord from passed InputStream. + * Read as a single-byte stream until we get to a CRLFCRLF which + * signifies End-of-ANVLRecord. Then parse all read as a UTF-8 Stream. + * Doing it this way, while requiring a double-scan, it makes it so do not + * need to be passed a RepositionableStream or a Stream that supports + * marking. Also no danger of over-reading which can happen when we + * wrap passed Stream with an InputStreamReader for doing UTF-8 + * character conversion (See the ISR class comment). + * @param is InputStream + * @return An ANVLRecord instance. + * @throws IOException + */ + public static ANVLRecord load(final InputStream is) + throws IOException { + // It doesn't look like a CRLF sequence is possible in UTF-8 without + // it signifying CRLF: The top bits are set in multibyte characters. + // Was thinking of recording CRLF as I was running through this first + // parse but the offsets would then be incorrect if any multibyte + // characters in the intervening gaps between CRLF. 
+ boolean isCRLF = false; + boolean recordStart = false; + ByteArrayOutputStream baos = new ByteArrayOutputStream(1024); + boolean done = false; + int read = 0; + for (int c = -1, previousCharacter; !done;) { + if (read++ >= MAXIMUM_SIZE) { + throw new IOException("Read " + MAXIMUM_SIZE + + " bytes without finding \\r\\n\\r\\n " + + "End-Of-ANVLRecord"); + } + previousCharacter = c; + c = is.read(); + if (c == -1) { + throw new IOException("End-Of-Stream before \\r\\n\\r\\n " + + "End-Of-ANVLRecord:\n" + + new String(baos.toByteArray(), UTF8)); + } + if (isLF((char)c) && isCR((char)previousCharacter)) { + if (isCRLF) { + // If we just had a CRLF, then its two CRLFs and its end of + // record. We're done. + done = true; + } else { + isCRLF = true; + } + } else if (!recordStart && Character.isWhitespace(c)) { + // Skip any whitespace at start of ANVLRecord. + continue; + } else { + // Clear isCRLF flag if this character is NOT a '\r'. + if (isCRLF && !isCR((char)c)) { + isCRLF = false; + } + // Not whitespace so start record if we haven't already. + if (!recordStart) { + recordStart = true; + } + } + baos.write(c); + } + return load(new String(baos.toByteArray(), UTF8)); + } + + /** + * Parse passed String for an ANVL Record. + * Looked at writing javacc grammer but preprocessing is required to + * handle folding: See + * https://javacc.dev.java.net/servlets/BrowseList?list=users&by=thread&from=56173. + * Looked at Terence Parr's ANTLR. More capable. Can set lookahead count. + * A value of 3 would help with folding. But its a pain defining UNICODE + * grammers -- needed by ANVL -- and support seems incomplete + * anyways: http://www.doc.ic.ac.uk/lab/secondyear/Antlr/lexer.html#unicode. + * For now, go with the below hand-rolled parser. + * @param s String with an ANVLRecord. + * @return ANVLRecord parsed from passed String. 
+ * @throws IOException + */ + public static ANVLRecord load(final String s) + throws IOException { + ANVLRecord record = new ANVLRecord(); + boolean inValue = false, inLabel = false, inComment = false, + inNewLine = false; + String label = null; + StringBuilder sb = new StringBuilder(s.length()); + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + + // Assert I can do look-ahead. + if ((i + 1) > s.length()) { + throw new IOException("Premature End-of-ANVLRecord:\n" + + s.substring(i)); + } + + // If at LF of a CRLF, just go around again. Eat up the LF. + if (inNewLine && isLF(c)) { + continue; + } + + // If we're at a CRLF and we were just on one, exit. Found Record. + if (inNewLine && isCR(c) && isLF(s.charAt(i + 1))) { + break; + } + + // Check if we're on a fold inside a Value. Skip multiple white + // space after CRLF. + if (inNewLine && inValue && Character.isWhitespace(c)) { + continue; + } + + // Else set flag if we're at a CRLF. + inNewLine = isCR(c) && isLF(s.charAt(i + 1)); + + if (inNewLine) { + if (inComment) { + inComment = false; + } else if (label != null && !inValue) { + // Label only 'data element'. + record.addLabel(label); + label = null; + sb.setLength(0); + } else if (inValue) { + // Assert I can do look-ahead past current CRLF. + if ((i + 3) > s.length()) { + throw new IOException("Premature End-of-ANVLRecord " + + "(2):\n" + s.substring(i)); + } + if (!isCR(s.charAt(i + 2)) && !isLF(s.charAt(i + 3)) + && Character.isWhitespace(s.charAt(i + 2))) { + // Its a fold. Let it go around. But add in a CRLF and + // space and do it here. We don't let CRLF fall through + // to the sb.append on the end of this loop. + sb.append(CRLF); + sb.append(' '); + } else { + // Next line is a new SubElement, a new Comment or + // Label. 
+ record.addLabelValue(label, sb.toString()); + sb.setLength(0); + label = null; + inValue = false; + } + } else { + // We're whitespace between label and value or whitespace + // before we've figured whether label or comment. + } + // Don't let the '\r' or CRLF through. + continue; + } + + if (inComment) { + continue; + } else if (inLabel) { + if (c == Label.COLON) { + label = sb.toString(); + sb.setLength(0); + inLabel = false; + continue; + } + } else { + if (!inLabel && !inValue && !inComment) { + // We have no state. Figure one. + if (Character.isWhitespace(c)) { + // If no state, and whitespace, skip. Don't record. + continue; + } else if (label == null && c == '#') { + inComment = true; + // Don't record comments. + continue; + } else if (label == null) { + inLabel = true; + } else { + inValue = true; + } + } + } + sb.append(c); + } + return record; + } + + /** + * @return Count of ANVLRecord bytes. Be careful, an empty ANVLRecord is + * CRLFCRLF so is of size 4. Also, expensive, since it makes String of + * the record so it can count bytes. + */ + public synchronized int getLength() { + int length = -1; + try { + length = getUTF8Bytes().length; + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + return length; + } + + public static boolean isCROrLF(final char c) { + return isCR(c) || isLF(c); + } + + public static boolean isCR(final char c) { + return c == ANVLRecord.CRLF.charAt(0); + } + + public static boolean isLF(final char c) { + return c == ANVLRecord.CRLF.charAt(1); + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/util/anvl/Element.java b/src/main/java/org/archive/util/anvl/Element.java new file mode 100644 index 00000000..5881fa9b --- /dev/null +++ b/src/main/java/org/archive/util/anvl/Element.java @@ -0,0 +1,73 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. 
+ * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.util.anvl; + + +/** + * ANVL 'data element'. + * Made of a lone {@link Label}, or a {@link Label} plus {@link Value}. + * + * @author stack + * @see A Name-Value + * Language (ANVL) + */ +public class Element { + private final SubElement [] subElements; + + public Element(final Label l) { + this.subElements = new SubElement [] {l}; + } + + public Element(final Label l, final Value v) { + this.subElements = new SubElement [] {l, v}; + } + + public boolean isValue() { + return this.subElements.length > 1; + } + + public Label getLabel() { + return (Label)this.subElements[0]; + } + + public Value getValue() { + if (!isValue()) { + return null; + } + return (Value)this.subElements[1]; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < subElements.length; i++) { + sb.append(subElements[i].toString()); + if (i == 0) { + // Add colon after Label. + sb.append(':'); + if (isValue()) { + // Add space to intro the value. + sb.append(' '); + } + } + } + return sb.toString(); + } +} diff --git a/src/main/java/org/archive/util/anvl/Label.java b/src/main/java/org/archive/util/anvl/Label.java new file mode 100644 index 00000000..fdadb735 --- /dev/null +++ b/src/main/java/org/archive/util/anvl/Label.java @@ -0,0 +1,41 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). 
+ * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.util.anvl; + +class Label extends SubElement { + public static final char COLON = ':'; + + @SuppressWarnings("unused") + private Label() { + this(null); + } + + public Label(final String s) { + super(s); + } + + @Override + protected void checkCharacter(char c, String srcStr, int index) { + super.checkCharacter(c, srcStr, index); + if (c == COLON) { + throw new IllegalArgumentException("Label cannot contain " + COLON); + } + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/util/anvl/SubElement.java b/src/main/java/org/archive/util/anvl/SubElement.java new file mode 100644 index 00000000..33b9e9bb --- /dev/null +++ b/src/main/java/org/archive/util/anvl/SubElement.java @@ -0,0 +1,78 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.util.anvl; + +/** + * Abstract ANVL 'data element' sub-part. + * Subclass to make a Comment, a Label, or a Value. + * @author stack + */ +abstract class SubElement { + private final String e; + + protected SubElement() { + this(null); + } + + public SubElement(final String s) { + this.e = baseCheck(s); + } + + protected String baseCheck(final String s) { + // Check for null. + if (s == null) { + throw new IllegalArgumentException("Can't be null"); + } + // Check for CRLF. + for (int i = 0; i < s.length(); i++) { + checkCharacter(s.charAt(i), s, i); + } + return s; + } + + protected void checkCharacter(final char c, final String srcStr, + final int index) { + checkControlCharacter(c, srcStr, index); + checkCRLF(c, srcStr, index); + } + + protected void checkControlCharacter(final char c, final String srcStr, + final int index) { + if (Character.isISOControl(c) && !Character.isWhitespace(c) || + !Character.isValidCodePoint(c)) { + throw new IllegalArgumentException(srcStr + + " contains a control character(s) or invalid code point: 0x" + + Integer.toHexString(c)); + } + } + + protected void checkCRLF(final char c, final String srcStr, + final int index) { + if (ANVLRecord.isCROrLF(c)) { + throw new IllegalArgumentException(srcStr + + " contains disallowed CRLF control character(s): 0x" + + Integer.toHexString(c)); + } + } + + @Override + public String toString() { + return e; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/util/anvl/Value.java b/src/main/java/org/archive/util/anvl/Value.java new file 
mode 100644 index 00000000..2a650ba2 --- /dev/null +++ b/src/main/java/org/archive/util/anvl/Value.java @@ -0,0 +1,71 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.util.anvl; + +/** + * TODO: Now values 'fold' but should but perhaps they shouldn't be stored + * folded. Only when we serialize should we fold (But how to know where + * to fold?). + * @author stack + * @version $Date$ $Version$ + */ +class Value extends SubElement { + + private StringBuilder sb; + private boolean folding = false; + + @SuppressWarnings("unused") + private Value() { + this(null); + } + + public Value(final String s) { + super(s); + } + + protected String baseCheck(String s) { + this.sb = new StringBuilder(s.length() * 2); + super.baseCheck(s); + return sb.toString(); + } + + @Override + protected void checkCharacter(char c, String srcStr, int index) { + checkControlCharacter(c, srcStr, index); + // Now, rewrite the value String with folding (If CR or LF or CRLF + // present. + if (ANVLRecord.isCR(c)) { + this.folding = true; + this.sb.append(ANVLRecord.FOLD_PREFIX); + } else if (ANVLRecord.isLF(c)) { + if (!this.folding) { + this.folding = true; + this.sb.append(ANVLRecord.FOLD_PREFIX); + } else { + // Previous character was a CR. Fold prefix has been added. 
+ } + } else if (this.folding && Character.isWhitespace(c)) { + // Only write out one whitespace character. Skip. + } else { + this.folding = false; + this.sb.append(c); + } + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/util/anvl/package.html b/src/main/java/org/archive/util/anvl/package.html new file mode 100644 index 00000000..4a2a8963 --- /dev/null +++ b/src/main/java/org/archive/util/anvl/package.html @@ -0,0 +1,42 @@ + + + +org.archive.util.anvl package + + +Parsers and Writers for the (expired) Internet-Draft A Name-Value +Language (ANVL). Use {@link org.archive.util.anvl.ANVLRecord} +to create new instances of ANVL Records and for parsing. + +

Implementation Details

+

The ANVL Internet-Draft of 14 February 2005 is unspecific as to the +definition of 'blank line' and 'newline'. This parser implementation +assumes CRLF (a carriage return followed by a line feed). +

+

The draft says "An element consists of a label, a colon, and an optional value". +Should that be: "An element consists of a label and an optional value, or a +comment"?

+ +

The specification is unclear regarding CR or NL in a label or +comment (this implementation disallows CR or NL in labels but lets +them pass in comments).

+ +

A grammar would help. Here is RFC822: +

+     field       =  field-name ":" [ field-body ] CRLF
+     
+     field-name  =  1*<any CHAR, excluding CTLs, SPACE, and ":">
+     
+     field-body  =  field-body-contents
+                    [CRLF LWSP-char field-body]
+     
+     field-body-contents =
+                   <the ASCII characters making up the field-body, as
+                    defined in the following sections, and consisting
+                    of combinations of atom, quoted-string, and
+                    specials tokens, or else consisting of texts>
+
+

+ + diff --git a/src/main/java/org/archive/util/binsearch/SeekableLineReaderIterator.java b/src/main/java/org/archive/util/binsearch/SeekableLineReaderIterator.java index ca443ad4..991553c8 100644 --- a/src/main/java/org/archive/util/binsearch/SeekableLineReaderIterator.java +++ b/src/main/java/org/archive/util/binsearch/SeekableLineReaderIterator.java @@ -26,7 +26,7 @@ public String getNextInner() { next = slr.readLine(); } catch (IOException e) { if (propagateException) { - throw new RuntimeIOException(); + throw new RuntimeIOException(e.toString()); } } } diff --git a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReader.java b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReader.java index 63eab9b4..d686a5e2 100644 --- a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReader.java +++ b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReader.java @@ -32,6 +32,8 @@ public int getStatus() protected boolean noKeepAlive; protected String cookie; protected String connectedUrl; + protected String errHeader; + protected String saveErrHeader; public abstract String getUrl(); @@ -76,4 +78,20 @@ public String getConnectedUrl() { return connectedUrl; } + + public String getSaveErrHeader() { + return saveErrHeader; + } + + public void setSaveErrHeader(String saveErrHeader) { + this.saveErrHeader = saveErrHeader; + } + + public String getErrHeader() { + return errHeader; + } + + public void setErrHeader(String errHeader) { + this.errHeader = errHeader; + } } \ No newline at end of file diff --git a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java index c1fa6fb6..b4a23db0 100644 --- a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java +++ b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java @@ -4,6 +4,7 @@ import 
org.archive.util.binsearch.SeekableLineReaderFactory; import org.archive.util.binsearch.impl.http.ApacheHttp31SLRFactory; +import org.archive.util.binsearch.impl.http.ApacheHttp43SLRFactory; import org.archive.util.binsearch.impl.http.HTTPURLConnSLRFactory; public abstract class HTTPSeekableLineReaderFactory implements SeekableLineReaderFactory { @@ -20,6 +21,7 @@ protected HTTPSeekableLineReaderFactory() public enum HttpLibs { APACHE_31, + APACHE_43, URLCONN, } @@ -50,6 +52,10 @@ public static HTTPSeekableLineReaderFactory getHttpFactory(HttpLibs type, String case URLCONN: factory = new HTTPURLConnSLRFactory(); break; + + case APACHE_43: + factory = new ApacheHttp43SLRFactory(); + break; } if (factory == null) { diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java index 0857bfd6..c4fdbba8 100644 --- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java +++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java @@ -8,6 +8,7 @@ import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.HttpMethod; +import org.apache.commons.httpclient.cookie.CookiePolicy; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.methods.HeadMethod; import org.apache.commons.io.input.CountingInputStream; @@ -121,22 +122,28 @@ protected InputStream doSeekLoad(long offset, int maxLength) throws IOException } if (this.getCookie() != null) { + activeMethod.getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES); activeMethod.setRequestHeader("Cookie", this.getCookie()); } int code = http.executeMethod(activeMethod); + connectedUrl = activeMethod.getURI().toString(); + if ((code != 206) && (code != 200)) { - throw new BadHttpStatusException(code, url + " " + rangeHeader); + throw new BadHttpStatusException(code, connectedUrl + " " + 
rangeHeader); } - connectedUrl = activeMethod.getURI().toString(); - InputStream is = activeMethod.getResponseBodyAsStream(); cin = new CountingInputStream(is); return cin; } catch (IOException io) { + if (saveErrHeader != null) { + errHeader = getHeaderValue(saveErrHeader); + } + + connectedUrl = activeMethod.getURI().toString(); doClose(); throw io; } diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java index 52e73a94..9bd7542b 100644 --- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java +++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java @@ -3,12 +3,12 @@ import java.io.IOException; import java.text.SimpleDateFormat; import java.util.Date; -import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler; import org.apache.commons.httpclient.HostConfiguration; import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.HttpConnectionManager; import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; import org.apache.commons.httpclient.params.HttpClientParams; import org.archive.util.binsearch.impl.HTTPSeekableLineReader; @@ -17,7 +17,7 @@ public class ApacheHttp31SLRFactory extends HTTPSeekableLineReaderFactory { private final static Logger LOGGER = Logger.getLogger(ApacheHttp31SLRFactory.class.getName()); - private MultiThreadedHttpConnectionManager connectionManager = null; + private HttpConnectionManager connectionManager = null; private HostConfiguration hostConfiguration = null; private HttpClient http = null; @@ -27,6 +27,7 @@ public ApacheHttp31SLRFactory(String uriString) { public ApacheHttp31SLRFactory() { connectionManager = new MultiThreadedHttpConnectionManager(); + //connectionManager = new ThreadLocalHttpConnectionManager(); hostConfiguration = new 
HostConfiguration(); HttpClientParams params = new HttpClientParams(); http = new HttpClient(params,connectionManager); @@ -35,15 +36,16 @@ public ApacheHttp31SLRFactory() { public void close() throws IOException { - connectionManager.deleteClosedConnections(); + //connectionManager.deleteClosedConnections(); + connectionManager.closeIdleConnections(0); } @Override public ApacheHttp31SLR get(String url) throws IOException { - if (LOGGER.isLoggable(Level.FINEST)) { - LOGGER.finest("Connections: " + connectionManager.getConnectionsInPool(hostConfiguration)); - } +// if (LOGGER.isLoggable(Level.FINEST)) { +// LOGGER.finest("Connections: " + connectionManager.getConnectionsInPool(hostConfiguration)); +// } return new ApacheHttp31SLR(http, url); } diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java new file mode 100644 index 00000000..ef206bb1 --- /dev/null +++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java @@ -0,0 +1,214 @@ +package org.archive.util.binsearch.impl.http; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.InetSocketAddress; +import java.net.Socket; +import java.net.SocketAddress; +import java.net.URL; + +import org.apache.http.Header; +import org.apache.http.HttpException; +import org.apache.http.HttpRequest; +import org.apache.http.HttpResponse; +import org.apache.http.HttpVersion; +import org.apache.http.impl.DefaultBHttpClientConnection; +import org.apache.http.message.BasicHttpRequest; +import org.apache.http.util.EntityUtils; +import org.archive.util.binsearch.impl.HTTPSeekableLineReader; +import org.archive.util.zip.GZIPMembersInputStream; + +public class ApacheHttp43SLR extends HTTPSeekableLineReader { + + private String urlString; + + private int connectTimeout = 0; + private int readTimeout = 0; + + private Socket socket = null; + private 
DefaultBHttpClientConnection activeConn = null; + private HttpResponse response = null; + + private final static int BUFF_SIZE = 8192; + + public ApacheHttp43SLR(String url) + { + urlString = url; + } + + public ApacheHttp43SLR(String url, int connectTimeout, int readTimeout) + { + this.urlString = url; + this.connectTimeout = connectTimeout; + this.readTimeout = readTimeout; + } + + @Override + public String getUrl() { + return urlString; + } + + @Override + public long getSize() throws IOException { + if (response == null) { + return 0; + } + + return response.getEntity().getContentLength(); + } + + @Override + public String getHeaderValue(String headerName) { + if (response == null) { + return null; + } + + Header header = response.getFirstHeader(headerName); + if (header == null) { + return null; + } + + return header.getValue(); + } + + protected static int getPort(URL url) + { + int port = url.getPort(); + + if (port > 0) { + return port; + } + + return url.getDefaultPort(); + } + + protected InputStream doSeekLoad(long offset, int maxLength, URL url) + throws IOException { + + try { + SocketAddress endpoint = new InetSocketAddress(url.getHost(), getPort(url)); + + socket = new Socket(); + socket.connect(endpoint, connectTimeout); + + activeConn = new DefaultBHttpClientConnection(BUFF_SIZE); + activeConn.bind(socket); + activeConn.setSocketTimeout(readTimeout); + + HttpRequest request = new BasicHttpRequest("GET", url.getFile(), HttpVersion.HTTP_1_1); + + String rangeHeader = makeRangeHeader(offset, maxLength); + + if (rangeHeader != null) { + request.setHeader("Range", rangeHeader); + } + + if (this.isNoKeepAlive()) { + request.setHeader("Connection", "close"); + } else { + request.setHeader("Connection", "keep-alive"); + } + + if (this.getCookie() != null) { + request.setHeader("Cookie", this.getCookie()); + } + + request.setHeader("Accept", "*/*"); + request.setHeader("Host", url.getHost()); + + activeConn.sendRequestHeader(request); + activeConn.flush(); 
+ + response = activeConn.receiveResponseHeader(); + + int code = response.getStatusLine().getStatusCode(); + + connectedUrl = url.toString(); + + if (code > 300 && code < 400) { + Header header = response.getFirstHeader("Location"); + + doClose(); + + if (header != null) { + URL redirectURL = new URL(header.getValue()); + return doSeekLoad(offset, maxLength, redirectURL); + } + } + + if (code != 200 && code != 206) { + throw new BadHttpStatusException(code, connectedUrl + " " + rangeHeader); + } + + activeConn.receiveResponseEntity(response); + + return response.getEntity().getContent(); + + } catch (HttpException e) { + doClose(); + throw new IOException(e); + + } catch (IOException io) { + + if (saveErrHeader != null) { + errHeader = getHeaderValue(saveErrHeader); + } + + connectedUrl = url.toString(); + + doClose(); + throw io; + } + } + + @Override + public void seekWithMaxRead(long offset, boolean gzip, int maxLength) throws IOException + { + if (closed) { + throw new IOException("Seek after close()"); + } + + br = null; + + try { + doSeekLoad(offset, maxLength); + + if (bufferFully && (maxLength > 0)) { + byte[] buffer = EntityUtils.toByteArray(response.getEntity()); + + doClose(); + + is = new ByteArrayInputStream(buffer); + } + + if (gzip) { + is = new GZIPMembersInputStream(is, blockSize); + } + + } catch (IOException io) { + doClose(); + throw io; + } + } + + @Override + protected void doClose() throws IOException { + if (activeConn != null) { + activeConn.close(); + activeConn = null; + socket = null; + } else if (socket != null) { + socket.close(); + socket = null; + } + response = null; + } + + @Override + protected InputStream doSeekLoad(long offset, int maxLength) + throws IOException { + + return doSeekLoad(offset, maxLength, new URL(urlString)); + } +} diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLRFactory.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLRFactory.java new file mode 100644 
index 00000000..5e3bb3ed --- /dev/null +++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLRFactory.java @@ -0,0 +1,100 @@ +package org.archive.util.binsearch.impl.http; + +import java.io.IOException; + +import org.archive.util.binsearch.impl.HTTPSeekableLineReader; +import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory; + +public class ApacheHttp43SLRFactory extends HTTPSeekableLineReaderFactory { + + private int readTimeout = 0; + private int connectTimeout = 0; + + public ApacheHttp43SLRFactory() + { + + } + + @Override + public HTTPSeekableLineReader get(String url) throws IOException { + return new ApacheHttp43SLR(url, connectTimeout, readTimeout); + } + + @Override + public void close() throws IOException { + // TODO Auto-generated method stub + } + + @Override + public void setProxyHostPort(String hostPort) { + // TODO Auto-generated method stub + + } + + @Override + public void setMaxTotalConnections(int maxTotalConnections) { + // TODO Auto-generated method stub + + } + + @Override + public int getMaxTotalConnections() { + // TODO Auto-generated method stub + return 0; + } + + @Override + public void setMaxHostConnections(int maxHostConnections) { + // TODO Auto-generated method stub + + } + + @Override + public int getMaxHostConnections() { + // TODO Auto-generated method stub + return 0; + } + + @Override + public int getConnectionTimeoutMS() { + return connectTimeout; + } + + @Override + public void setConnectionTimeoutMS(int connectionTimeoutMS) { + connectTimeout = connectionTimeoutMS; + + } + + @Override + public int getSocketTimeoutMS() { + return readTimeout; + } + + @Override + public void setSocketTimeoutMS(int socketTimeoutMS) { + readTimeout = socketTimeoutMS; + } + + @Override + public void setStaleChecking(boolean enabled) { + + } + + @Override + public boolean isStaleChecking() { + // TODO Auto-generated method stub + return false; + } + + @Override + public long getModTime() { + // TODO Auto-generated 
method stub + return 0; + } + + @Override + public void setNumRetries(int numRetries) { + // TODO Auto-generated method stub + } +} diff --git a/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java b/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java index f21437f7..6d618e43 100644 --- a/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java +++ b/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java @@ -76,13 +76,12 @@ protected InputStream doSeekLoad(long offset, int maxLength) httpUrlConn.connect(); int code = httpUrlConn.getResponseCode(); + connectedUrl = httpUrlConn.getURL().toString(); if ((code != 206) && (code != 200)) { - throw new BadHttpStatusException(code, url + " " + rangeHeader); + throw new BadHttpStatusException(code, connectedUrl + " " + rangeHeader); } - connectedUrl = httpUrlConn.getURL().toString(); - InputStream is = httpUrlConn.getInputStream(); cin = new CountingInputStream(is); return cin; diff --git a/src/main/java/org/archive/util/io/RuntimeIOException.java b/src/main/java/org/archive/util/io/RuntimeIOException.java index b6efbf74..1d74f79c 100644 --- a/src/main/java/org/archive/util/io/RuntimeIOException.java +++ b/src/main/java/org/archive/util/io/RuntimeIOException.java @@ -3,13 +3,36 @@ public class RuntimeIOException extends RuntimeException { private static final long serialVersionUID = 4762025404760379497L; + private int status = 503; + public RuntimeIOException() { } + public RuntimeIOException(String message) + { + super(message); + } + + public RuntimeIOException(int status) + { + this.status = status; + } + public RuntimeIOException(Throwable cause) { super(cause); - } + } + + public RuntimeIOException(int status, Throwable cause) + { + super(cause); + this.status = status; + } + + public int getStatus() + { + return status; + } } diff --git a/src/main/java/org/archive/util/iterator/CloseableCompositeIterator.java 
b/src/main/java/org/archive/util/iterator/CloseableCompositeIterator.java new file mode 100644 index 00000000..b9f632e2 --- /dev/null +++ b/src/main/java/org/archive/util/iterator/CloseableCompositeIterator.java @@ -0,0 +1,73 @@ +package org.archive.util.iterator; + +import java.io.IOException; +import java.util.Iterator; +import java.util.LinkedList; + +public class CloseableCompositeIterator implements CloseableIterator { + + protected LinkedList> iters; + protected Iterator> iterPtr; + protected CloseableIterator currIter; + + public CloseableCompositeIterator() + { + iters = new LinkedList>(); + } + + public void addFirst(CloseableIterator e) + { + iters.addFirst(e); + } + + public void addLast(CloseableIterator e) + { + iters.addLast(e); + } + + @Override + public boolean hasNext() { + + if (iterPtr == null) { + iterPtr = iters.iterator(); + currIter = iterPtr.next(); + } + + if (currIter == null) { + return false; + } + + while (currIter != null) { + if (currIter.hasNext()) { + return true; + } + + currIter = (iterPtr.hasNext() ? 
iterPtr.next() : null); + } + + return false; + } + + @Override + public E next() { + return currIter.next(); + } + + @Override + public void remove() { + currIter.remove(); + } + + @Override + public void close() throws IOException { + for (CloseableIterator e : iters) { + if (e != null) { + try { + e.close(); + } catch (IOException io) { + + } + } + } + } +} diff --git a/src/main/java/org/archive/util/iterator/CloseableIteratorWrapper.java b/src/main/java/org/archive/util/iterator/CloseableIteratorWrapper.java new file mode 100644 index 00000000..f35c85e5 --- /dev/null +++ b/src/main/java/org/archive/util/iterator/CloseableIteratorWrapper.java @@ -0,0 +1,42 @@ +package org.archive.util.iterator; + +import java.io.IOException; +import java.util.Iterator; + +/** + * Wrap a regular Iterator to create a CloseableIterator where the close() is a no-op + * @author ilya + * + * @param + */ + +public class CloseableIteratorWrapper implements CloseableIterator +{ + protected Iterator iter; + + public CloseableIteratorWrapper(Iterator iter) + { + this.iter = iter; + } + + @Override + public boolean hasNext() { + return this.iter.hasNext(); + } + + @Override + public S next() { + return this.iter.next(); + } + + @Override + public void remove() { + this.iter.remove(); + + } + + @Override + public void close() throws IOException { + //No Op + } +} \ No newline at end of file diff --git a/src/main/resources/effective_tld_names.dat b/src/main/resources/effective_tld_names.dat index 2c201312..7c4a0860 100644 --- a/src/main/resources/effective_tld_names.dat +++ b/src/main/resources/effective_tld_names.dat @@ -1,44 +1,6 @@ -// ***** BEGIN LICENSE BLOCK ***** -// Version: MPL 1.1/GPL 2.0/LGPL 2.1 -// -// The contents of this file are subject to the Mozilla Public License Version -// 1.1 (the "License"); you may not use this file except in compliance with -// the License. 
You may obtain a copy of the License at -// http://www.mozilla.org/MPL/ -// -// Software distributed under the License is distributed on an "AS IS" basis, -// WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License -// for the specific language governing rights and limitations under the -// License. -// -// The Original Code is the Public Suffix List. -// -// The Initial Developer of the Original Code is -// Jo Hermans . -// Portions created by the Initial Developer are Copyright (C) 2007 -// the Initial Developer. All Rights Reserved. -// -// Contributor(s): -// Ruben Arakelyan -// Gervase Markham -// Pamela Greene -// David Triendl -// Jothan Frakes -// The kind representatives of many TLD registries -// -// Alternatively, the contents of this file may be used under the terms of -// either the GNU General Public License Version 2 or later (the "GPL"), or -// the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), -// in which case the provisions of the GPL or the LGPL are applicable instead -// of those above. If you wish to allow use of your version of this file only -// under the terms of either the GPL or the LGPL, and not to allow others to -// use your version of this file under the terms of the MPL, indicate your -// decision by deleting the provisions above and replace them with the notice -// and other provisions required by the GPL or the LGPL. If you do not delete -// the provisions above, a recipient may use your version of this file under -// the terms of any one of the MPL, the GPL or the LGPL. -// -// ***** END LICENSE BLOCK ***** +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. 
// ===BEGIN ICANN DOMAINS=== @@ -257,9 +219,9 @@ net.au org.au edu.au gov.au -csiro.au asn.au id.au +csiro.au // Historic 2LDs (closed to new registration, but sites still exist) info.au conf.au @@ -453,13 +415,13 @@ b.br bio.br blog.br bmd.br -can.br cim.br cng.br cnt.br com.br coop.br ecn.br +eco.br edu.br emp.br eng.br @@ -480,6 +442,7 @@ ind.br inf.br jor.br jus.br +leg.br lel.br mat.br med.br @@ -729,6 +692,14 @@ inf.cu // cv : http://en.wikipedia.org/wiki/.cv cv +// cw : http://www.una.cw/cw_registry/ +// Confirmed by registry 2013-03-26 +cw +com.cw +edu.cw +net.cw +org.cw + // cx : http://en.wikipedia.org/wiki/.cx // list of other 2nd level tlds ? cx @@ -987,9 +958,15 @@ gov.gr // gs : http://en.wikipedia.org/wiki/.gs gs -// gt : http://www.gt/politicas.html -*.gt -!www.gt +// gt : http://www.gt/politicas_de_registro.html +gt +com.gt +edu.gt +gob.gt +ind.gt +mil.gt +net.gt +org.gt // gu : http://gadao.gov.gu/registration.txt *.gu @@ -1103,13 +1080,14 @@ tozsde.hu utazas.hu video.hu -// id : http://en.wikipedia.org/wiki/.id -// see also: https://register.pandi.or.id/ +// id : https://register.pandi.or.id/ id ac.id +biz.id co.id go.id mil.id +my.id net.id or.id sch.id @@ -1511,10 +1489,9 @@ jobs // jp : http://en.wikipedia.org/wiki/.jp // http://jprs.co.jp/en/jpdomain.html -// Submitted by registry 2008-06-11 -// Updated by registry 2008-12-04 +// Updated by registry 2012-05-28 jp -// jp organizational type names +// jp organizational type names ac.jp ad.jp co.jp @@ -1524,125 +1501,1750 @@ gr.jp lg.jp ne.jp or.jp +// jp preficture type names +aichi.jp +akita.jp +aomori.jp +chiba.jp +ehime.jp +fukui.jp +fukuoka.jp +fukushima.jp +gifu.jp +gunma.jp +hiroshima.jp +hokkaido.jp +hyogo.jp +ibaraki.jp +ishikawa.jp +iwate.jp +kagawa.jp +kagoshima.jp +kanagawa.jp +kochi.jp +kumamoto.jp +kyoto.jp +mie.jp +miyagi.jp +miyazaki.jp +nagano.jp +nagasaki.jp +nara.jp +niigata.jp +oita.jp +okayama.jp +okinawa.jp +osaka.jp +saga.jp +saitama.jp +shiga.jp +shimane.jp +shizuoka.jp 
+tochigi.jp +tokushima.jp +tokyo.jp +tottori.jp +toyama.jp +wakayama.jp +yamagata.jp +yamaguchi.jp +yamanashi.jp // jp geographic type names // http://jprs.jp/doc/rule/saisoku-1.html -*.aichi.jp -*.akita.jp -*.aomori.jp -*.chiba.jp -*.ehime.jp -*.fukui.jp -*.fukuoka.jp -*.fukushima.jp -*.gifu.jp -*.gunma.jp -*.hiroshima.jp -*.hokkaido.jp -*.hyogo.jp -*.ibaraki.jp -*.ishikawa.jp -*.iwate.jp -*.kagawa.jp -*.kagoshima.jp -*.kanagawa.jp *.kawasaki.jp *.kitakyushu.jp *.kobe.jp -*.kochi.jp -*.kumamoto.jp -*.kyoto.jp -*.mie.jp -*.miyagi.jp -*.miyazaki.jp -*.nagano.jp -*.nagasaki.jp *.nagoya.jp -*.nara.jp -*.niigata.jp -*.oita.jp -*.okayama.jp -*.okinawa.jp -*.osaka.jp -*.saga.jp -*.saitama.jp *.sapporo.jp *.sendai.jp -*.shiga.jp -*.shimane.jp -*.shizuoka.jp -*.tochigi.jp -*.tokushima.jp -*.tokyo.jp -*.tottori.jp -*.toyama.jp -*.wakayama.jp -*.yamagata.jp -*.yamaguchi.jp -*.yamanashi.jp *.yokohama.jp -!metro.tokyo.jp -!pref.aichi.jp -!pref.akita.jp -!pref.aomori.jp -!pref.chiba.jp -!pref.ehime.jp -!pref.fukui.jp -!pref.fukuoka.jp -!pref.fukushima.jp -!pref.gifu.jp -!pref.gunma.jp -!pref.hiroshima.jp -!pref.hokkaido.jp -!pref.hyogo.jp -!pref.ibaraki.jp -!pref.ishikawa.jp -!pref.iwate.jp -!pref.kagawa.jp -!pref.kagoshima.jp -!pref.kanagawa.jp -!pref.kochi.jp -!pref.kumamoto.jp -!pref.kyoto.jp -!pref.mie.jp -!pref.miyagi.jp -!pref.miyazaki.jp -!pref.nagano.jp -!pref.nagasaki.jp -!pref.nara.jp -!pref.niigata.jp -!pref.oita.jp -!pref.okayama.jp -!pref.okinawa.jp -!pref.osaka.jp -!pref.saga.jp -!pref.saitama.jp -!pref.shiga.jp -!pref.shimane.jp -!pref.shizuoka.jp -!pref.tochigi.jp -!pref.tokushima.jp -!pref.tottori.jp -!pref.toyama.jp -!pref.wakayama.jp -!pref.yamagata.jp -!pref.yamaguchi.jp -!pref.yamanashi.jp -!city.chiba.jp -!city.fukuoka.jp -!city.hiroshima.jp !city.kawasaki.jp !city.kitakyushu.jp !city.kobe.jp -!city.kyoto.jp !city.nagoya.jp -!city.niigata.jp -!city.okayama.jp -!city.osaka.jp -!city.saitama.jp !city.sapporo.jp !city.sendai.jp -!city.shizuoka.jp 
!city.yokohama.jp +// 4th level registration +aisai.aichi.jp +ama.aichi.jp +anjo.aichi.jp +asuke.aichi.jp +chiryu.aichi.jp +chita.aichi.jp +fuso.aichi.jp +gamagori.aichi.jp +handa.aichi.jp +hazu.aichi.jp +hekinan.aichi.jp +higashiura.aichi.jp +ichinomiya.aichi.jp +inazawa.aichi.jp +inuyama.aichi.jp +isshiki.aichi.jp +iwakura.aichi.jp +kanie.aichi.jp +kariya.aichi.jp +kasugai.aichi.jp +kira.aichi.jp +kiyosu.aichi.jp +komaki.aichi.jp +konan.aichi.jp +kota.aichi.jp +mihama.aichi.jp +miyoshi.aichi.jp +nagakute.aichi.jp +nishio.aichi.jp +nisshin.aichi.jp +obu.aichi.jp +oguchi.aichi.jp +oharu.aichi.jp +okazaki.aichi.jp +owariasahi.aichi.jp +seto.aichi.jp +shikatsu.aichi.jp +shinshiro.aichi.jp +shitara.aichi.jp +tahara.aichi.jp +takahama.aichi.jp +tobishima.aichi.jp +toei.aichi.jp +togo.aichi.jp +tokai.aichi.jp +tokoname.aichi.jp +toyoake.aichi.jp +toyohashi.aichi.jp +toyokawa.aichi.jp +toyone.aichi.jp +toyota.aichi.jp +tsushima.aichi.jp +yatomi.aichi.jp +akita.akita.jp +daisen.akita.jp +fujisato.akita.jp +gojome.akita.jp +hachirogata.akita.jp +happou.akita.jp +higashinaruse.akita.jp +honjo.akita.jp +honjyo.akita.jp +ikawa.akita.jp +kamikoani.akita.jp +kamioka.akita.jp +katagami.akita.jp +kazuno.akita.jp +kitaakita.akita.jp +kosaka.akita.jp +kyowa.akita.jp +misato.akita.jp +mitane.akita.jp +moriyoshi.akita.jp +nikaho.akita.jp +noshiro.akita.jp +odate.akita.jp +oga.akita.jp +ogata.akita.jp +semboku.akita.jp +yokote.akita.jp +yurihonjo.akita.jp +aomori.aomori.jp +gonohe.aomori.jp +hachinohe.aomori.jp +hashikami.aomori.jp +hiranai.aomori.jp +hirosaki.aomori.jp +itayanagi.aomori.jp +kuroishi.aomori.jp +misawa.aomori.jp +mutsu.aomori.jp +nakadomari.aomori.jp +noheji.aomori.jp +oirase.aomori.jp +owani.aomori.jp +rokunohe.aomori.jp +sannohe.aomori.jp +shichinohe.aomori.jp +shingo.aomori.jp +takko.aomori.jp +towada.aomori.jp +tsugaru.aomori.jp +tsuruta.aomori.jp +abiko.chiba.jp +asahi.chiba.jp +chonan.chiba.jp +chosei.chiba.jp +choshi.chiba.jp +chuo.chiba.jp +funabashi.chiba.jp 
+futtsu.chiba.jp +hanamigawa.chiba.jp +ichihara.chiba.jp +ichikawa.chiba.jp +ichinomiya.chiba.jp +inzai.chiba.jp +isumi.chiba.jp +kamagaya.chiba.jp +kamogawa.chiba.jp +kashiwa.chiba.jp +katori.chiba.jp +katsuura.chiba.jp +kimitsu.chiba.jp +kisarazu.chiba.jp +kozaki.chiba.jp +kujukuri.chiba.jp +kyonan.chiba.jp +matsudo.chiba.jp +midori.chiba.jp +mihama.chiba.jp +minamiboso.chiba.jp +mobara.chiba.jp +mutsuzawa.chiba.jp +nagara.chiba.jp +nagareyama.chiba.jp +narashino.chiba.jp +narita.chiba.jp +noda.chiba.jp +oamishirasato.chiba.jp +omigawa.chiba.jp +onjuku.chiba.jp +otaki.chiba.jp +sakae.chiba.jp +sakura.chiba.jp +shimofusa.chiba.jp +shirako.chiba.jp +shiroi.chiba.jp +shisui.chiba.jp +sodegaura.chiba.jp +sosa.chiba.jp +tako.chiba.jp +tateyama.chiba.jp +togane.chiba.jp +tohnosho.chiba.jp +tomisato.chiba.jp +urayasu.chiba.jp +yachimata.chiba.jp +yachiyo.chiba.jp +yokaichiba.chiba.jp +yokoshibahikari.chiba.jp +yotsukaido.chiba.jp +ainan.ehime.jp +honai.ehime.jp +ikata.ehime.jp +imabari.ehime.jp +iyo.ehime.jp +kamijima.ehime.jp +kihoku.ehime.jp +kumakogen.ehime.jp +masaki.ehime.jp +matsuno.ehime.jp +matsuyama.ehime.jp +namikata.ehime.jp +niihama.ehime.jp +ozu.ehime.jp +saijo.ehime.jp +seiyo.ehime.jp +shikokuchuo.ehime.jp +tobe.ehime.jp +toon.ehime.jp +uchiko.ehime.jp +uwajima.ehime.jp +yawatahama.ehime.jp +echizen.fukui.jp +eiheiji.fukui.jp +fukui.fukui.jp +ikeda.fukui.jp +katsuyama.fukui.jp +mihama.fukui.jp +minamiechizen.fukui.jp +obama.fukui.jp +ohi.fukui.jp +ono.fukui.jp +sabae.fukui.jp +sakai.fukui.jp +takahama.fukui.jp +tsuruga.fukui.jp +wakasa.fukui.jp +ashiya.fukuoka.jp +buzen.fukuoka.jp +chikugo.fukuoka.jp +chikuho.fukuoka.jp +chikujo.fukuoka.jp +chikushino.fukuoka.jp +chikuzen.fukuoka.jp +chuo.fukuoka.jp +dazaifu.fukuoka.jp +fukuchi.fukuoka.jp +hakata.fukuoka.jp +higashi.fukuoka.jp +hirokawa.fukuoka.jp +hisayama.fukuoka.jp +iizuka.fukuoka.jp +inatsuki.fukuoka.jp +kaho.fukuoka.jp +kasuga.fukuoka.jp +kasuya.fukuoka.jp +kawara.fukuoka.jp +keisen.fukuoka.jp 
+koga.fukuoka.jp +kurate.fukuoka.jp +kurogi.fukuoka.jp +kurume.fukuoka.jp +minami.fukuoka.jp +miyako.fukuoka.jp +miyama.fukuoka.jp +miyawaka.fukuoka.jp +mizumaki.fukuoka.jp +munakata.fukuoka.jp +nakagawa.fukuoka.jp +nakama.fukuoka.jp +nishi.fukuoka.jp +nogata.fukuoka.jp +ogori.fukuoka.jp +okagaki.fukuoka.jp +okawa.fukuoka.jp +oki.fukuoka.jp +omuta.fukuoka.jp +onga.fukuoka.jp +onojo.fukuoka.jp +oto.fukuoka.jp +saigawa.fukuoka.jp +sasaguri.fukuoka.jp +shingu.fukuoka.jp +shinyoshitomi.fukuoka.jp +shonai.fukuoka.jp +soeda.fukuoka.jp +sue.fukuoka.jp +tachiarai.fukuoka.jp +tagawa.fukuoka.jp +takata.fukuoka.jp +toho.fukuoka.jp +toyotsu.fukuoka.jp +tsuiki.fukuoka.jp +ukiha.fukuoka.jp +umi.fukuoka.jp +usui.fukuoka.jp +yamada.fukuoka.jp +yame.fukuoka.jp +yanagawa.fukuoka.jp +yukuhashi.fukuoka.jp +aizubange.fukushima.jp +aizumisato.fukushima.jp +aizuwakamatsu.fukushima.jp +asakawa.fukushima.jp +bandai.fukushima.jp +date.fukushima.jp +fukushima.fukushima.jp +furudono.fukushima.jp +futaba.fukushima.jp +hanawa.fukushima.jp +higashi.fukushima.jp +hirata.fukushima.jp +hirono.fukushima.jp +iitate.fukushima.jp +inawashiro.fukushima.jp +ishikawa.fukushima.jp +iwaki.fukushima.jp +izumizaki.fukushima.jp +kagamiishi.fukushima.jp +kaneyama.fukushima.jp +kawamata.fukushima.jp +kitakata.fukushima.jp +kitashiobara.fukushima.jp +koori.fukushima.jp +koriyama.fukushima.jp +kunimi.fukushima.jp +miharu.fukushima.jp +mishima.fukushima.jp +namie.fukushima.jp +nango.fukushima.jp +nishiaizu.fukushima.jp +nishigo.fukushima.jp +okuma.fukushima.jp +omotego.fukushima.jp +ono.fukushima.jp +otama.fukushima.jp +samegawa.fukushima.jp +shimogo.fukushima.jp +shirakawa.fukushima.jp +showa.fukushima.jp +soma.fukushima.jp +sukagawa.fukushima.jp +taishin.fukushima.jp +tamakawa.fukushima.jp +tanagura.fukushima.jp +tenei.fukushima.jp +yabuki.fukushima.jp +yamato.fukushima.jp +yamatsuri.fukushima.jp +yanaizu.fukushima.jp +yugawa.fukushima.jp +anpachi.gifu.jp +ena.gifu.jp +gifu.gifu.jp +ginan.gifu.jp +godo.gifu.jp 
+gujo.gifu.jp +hashima.gifu.jp +hichiso.gifu.jp +hida.gifu.jp +higashishirakawa.gifu.jp +ibigawa.gifu.jp +ikeda.gifu.jp +kakamigahara.gifu.jp +kani.gifu.jp +kasahara.gifu.jp +kasamatsu.gifu.jp +kawaue.gifu.jp +kitagata.gifu.jp +mino.gifu.jp +minokamo.gifu.jp +mitake.gifu.jp +mizunami.gifu.jp +motosu.gifu.jp +nakatsugawa.gifu.jp +ogaki.gifu.jp +sakahogi.gifu.jp +seki.gifu.jp +sekigahara.gifu.jp +shirakawa.gifu.jp +tajimi.gifu.jp +takayama.gifu.jp +tarui.gifu.jp +toki.gifu.jp +tomika.gifu.jp +wanouchi.gifu.jp +yamagata.gifu.jp +yaotsu.gifu.jp +yoro.gifu.jp +annaka.gunma.jp +chiyoda.gunma.jp +fujioka.gunma.jp +higashiagatsuma.gunma.jp +isesaki.gunma.jp +itakura.gunma.jp +kanna.gunma.jp +kanra.gunma.jp +katashina.gunma.jp +kawaba.gunma.jp +kiryu.gunma.jp +kusatsu.gunma.jp +maebashi.gunma.jp +meiwa.gunma.jp +midori.gunma.jp +minakami.gunma.jp +naganohara.gunma.jp +nakanojo.gunma.jp +nanmoku.gunma.jp +numata.gunma.jp +oizumi.gunma.jp +ora.gunma.jp +ota.gunma.jp +shibukawa.gunma.jp +shimonita.gunma.jp +shinto.gunma.jp +showa.gunma.jp +takasaki.gunma.jp +takayama.gunma.jp +tamamura.gunma.jp +tatebayashi.gunma.jp +tomioka.gunma.jp +tsukiyono.gunma.jp +tsumagoi.gunma.jp +ueno.gunma.jp +yoshioka.gunma.jp +asaminami.hiroshima.jp +daiwa.hiroshima.jp +etajima.hiroshima.jp +fuchu.hiroshima.jp +fukuyama.hiroshima.jp +hatsukaichi.hiroshima.jp +higashihiroshima.hiroshima.jp +hongo.hiroshima.jp +jinsekikogen.hiroshima.jp +kaita.hiroshima.jp +kui.hiroshima.jp +kumano.hiroshima.jp +kure.hiroshima.jp +mihara.hiroshima.jp +miyoshi.hiroshima.jp +naka.hiroshima.jp +onomichi.hiroshima.jp +osakikamijima.hiroshima.jp +otake.hiroshima.jp +saka.hiroshima.jp +sera.hiroshima.jp +seranishi.hiroshima.jp +shinichi.hiroshima.jp +shobara.hiroshima.jp +takehara.hiroshima.jp +abashiri.hokkaido.jp +abira.hokkaido.jp +aibetsu.hokkaido.jp +akabira.hokkaido.jp +akkeshi.hokkaido.jp +asahikawa.hokkaido.jp +ashibetsu.hokkaido.jp +ashoro.hokkaido.jp +assabu.hokkaido.jp +atsuma.hokkaido.jp +bibai.hokkaido.jp 
+biei.hokkaido.jp +bifuka.hokkaido.jp +bihoro.hokkaido.jp +biratori.hokkaido.jp +chippubetsu.hokkaido.jp +chitose.hokkaido.jp +date.hokkaido.jp +ebetsu.hokkaido.jp +embetsu.hokkaido.jp +eniwa.hokkaido.jp +erimo.hokkaido.jp +esan.hokkaido.jp +esashi.hokkaido.jp +fukagawa.hokkaido.jp +fukushima.hokkaido.jp +furano.hokkaido.jp +furubira.hokkaido.jp +haboro.hokkaido.jp +hakodate.hokkaido.jp +hamatonbetsu.hokkaido.jp +hidaka.hokkaido.jp +higashikagura.hokkaido.jp +higashikawa.hokkaido.jp +hiroo.hokkaido.jp +hokuryu.hokkaido.jp +hokuto.hokkaido.jp +honbetsu.hokkaido.jp +horokanai.hokkaido.jp +horonobe.hokkaido.jp +ikeda.hokkaido.jp +imakane.hokkaido.jp +ishikari.hokkaido.jp +iwamizawa.hokkaido.jp +iwanai.hokkaido.jp +kamifurano.hokkaido.jp +kamikawa.hokkaido.jp +kamishihoro.hokkaido.jp +kamisunagawa.hokkaido.jp +kamoenai.hokkaido.jp +kayabe.hokkaido.jp +kembuchi.hokkaido.jp +kikonai.hokkaido.jp +kimobetsu.hokkaido.jp +kitahiroshima.hokkaido.jp +kitami.hokkaido.jp +kiyosato.hokkaido.jp +koshimizu.hokkaido.jp +kunneppu.hokkaido.jp +kuriyama.hokkaido.jp +kuromatsunai.hokkaido.jp +kushiro.hokkaido.jp +kutchan.hokkaido.jp +kyowa.hokkaido.jp +mashike.hokkaido.jp +matsumae.hokkaido.jp +mikasa.hokkaido.jp +minamifurano.hokkaido.jp +mombetsu.hokkaido.jp +moseushi.hokkaido.jp +mukawa.hokkaido.jp +muroran.hokkaido.jp +naie.hokkaido.jp +nakagawa.hokkaido.jp +nakasatsunai.hokkaido.jp +nakatombetsu.hokkaido.jp +nanae.hokkaido.jp +nanporo.hokkaido.jp +nayoro.hokkaido.jp +nemuro.hokkaido.jp +niikappu.hokkaido.jp +niki.hokkaido.jp +nishiokoppe.hokkaido.jp +noboribetsu.hokkaido.jp +numata.hokkaido.jp +obihiro.hokkaido.jp +obira.hokkaido.jp +oketo.hokkaido.jp +okoppe.hokkaido.jp +otaru.hokkaido.jp +otobe.hokkaido.jp +otofuke.hokkaido.jp +otoineppu.hokkaido.jp +oumu.hokkaido.jp +ozora.hokkaido.jp +pippu.hokkaido.jp +rankoshi.hokkaido.jp +rebun.hokkaido.jp +rikubetsu.hokkaido.jp +rishiri.hokkaido.jp +rishirifuji.hokkaido.jp +saroma.hokkaido.jp +sarufutsu.hokkaido.jp +shakotan.hokkaido.jp 
+shari.hokkaido.jp +shibecha.hokkaido.jp +shibetsu.hokkaido.jp +shikabe.hokkaido.jp +shikaoi.hokkaido.jp +shimamaki.hokkaido.jp +shimizu.hokkaido.jp +shimokawa.hokkaido.jp +shinshinotsu.hokkaido.jp +shintoku.hokkaido.jp +shiranuka.hokkaido.jp +shiraoi.hokkaido.jp +shiriuchi.hokkaido.jp +sobetsu.hokkaido.jp +sunagawa.hokkaido.jp +taiki.hokkaido.jp +takasu.hokkaido.jp +takikawa.hokkaido.jp +takinoue.hokkaido.jp +teshikaga.hokkaido.jp +tobetsu.hokkaido.jp +tohma.hokkaido.jp +tomakomai.hokkaido.jp +tomari.hokkaido.jp +toya.hokkaido.jp +toyako.hokkaido.jp +toyotomi.hokkaido.jp +toyoura.hokkaido.jp +tsubetsu.hokkaido.jp +tsukigata.hokkaido.jp +urakawa.hokkaido.jp +urausu.hokkaido.jp +uryu.hokkaido.jp +utashinai.hokkaido.jp +wakkanai.hokkaido.jp +wassamu.hokkaido.jp +yakumo.hokkaido.jp +yoichi.hokkaido.jp +aioi.hyogo.jp +akashi.hyogo.jp +ako.hyogo.jp +amagasaki.hyogo.jp +aogaki.hyogo.jp +asago.hyogo.jp +ashiya.hyogo.jp +awaji.hyogo.jp +fukusaki.hyogo.jp +goshiki.hyogo.jp +harima.hyogo.jp +himeji.hyogo.jp +ichikawa.hyogo.jp +inagawa.hyogo.jp +itami.hyogo.jp +kakogawa.hyogo.jp +kamigori.hyogo.jp +kamikawa.hyogo.jp +kasai.hyogo.jp +kasuga.hyogo.jp +kawanishi.hyogo.jp +miki.hyogo.jp +minamiawaji.hyogo.jp +nishinomiya.hyogo.jp +nishiwaki.hyogo.jp +ono.hyogo.jp +sanda.hyogo.jp +sannan.hyogo.jp +sasayama.hyogo.jp +sayo.hyogo.jp +shingu.hyogo.jp +shinonsen.hyogo.jp +shiso.hyogo.jp +sumoto.hyogo.jp +taishi.hyogo.jp +taka.hyogo.jp +takarazuka.hyogo.jp +takasago.hyogo.jp +takino.hyogo.jp +tamba.hyogo.jp +tatsuno.hyogo.jp +toyooka.hyogo.jp +yabu.hyogo.jp +yashiro.hyogo.jp +yoka.hyogo.jp +yokawa.hyogo.jp +ami.ibaraki.jp +asahi.ibaraki.jp +bando.ibaraki.jp +chikusei.ibaraki.jp +daigo.ibaraki.jp +fujishiro.ibaraki.jp +hitachi.ibaraki.jp +hitachinaka.ibaraki.jp +hitachiomiya.ibaraki.jp +hitachiota.ibaraki.jp +ibaraki.ibaraki.jp +ina.ibaraki.jp +inashiki.ibaraki.jp +itako.ibaraki.jp +iwama.ibaraki.jp +joso.ibaraki.jp +kamisu.ibaraki.jp +kasama.ibaraki.jp +kashima.ibaraki.jp 
+kasumigaura.ibaraki.jp +koga.ibaraki.jp +miho.ibaraki.jp +mito.ibaraki.jp +moriya.ibaraki.jp +naka.ibaraki.jp +namegata.ibaraki.jp +oarai.ibaraki.jp +ogawa.ibaraki.jp +omitama.ibaraki.jp +ryugasaki.ibaraki.jp +sakai.ibaraki.jp +sakuragawa.ibaraki.jp +shimodate.ibaraki.jp +shimotsuma.ibaraki.jp +shirosato.ibaraki.jp +sowa.ibaraki.jp +suifu.ibaraki.jp +takahagi.ibaraki.jp +tamatsukuri.ibaraki.jp +tokai.ibaraki.jp +tomobe.ibaraki.jp +tone.ibaraki.jp +toride.ibaraki.jp +tsuchiura.ibaraki.jp +tsukuba.ibaraki.jp +uchihara.ibaraki.jp +ushiku.ibaraki.jp +yachiyo.ibaraki.jp +yamagata.ibaraki.jp +yawara.ibaraki.jp +yuki.ibaraki.jp +anamizu.ishikawa.jp +hakui.ishikawa.jp +hakusan.ishikawa.jp +kaga.ishikawa.jp +kahoku.ishikawa.jp +kanazawa.ishikawa.jp +kawakita.ishikawa.jp +komatsu.ishikawa.jp +nakanoto.ishikawa.jp +nanao.ishikawa.jp +nomi.ishikawa.jp +nonoichi.ishikawa.jp +noto.ishikawa.jp +shika.ishikawa.jp +suzu.ishikawa.jp +tsubata.ishikawa.jp +tsurugi.ishikawa.jp +uchinada.ishikawa.jp +wajima.ishikawa.jp +fudai.iwate.jp +fujisawa.iwate.jp +hanamaki.iwate.jp +hiraizumi.iwate.jp +hirono.iwate.jp +ichinohe.iwate.jp +ichinoseki.iwate.jp +iwaizumi.iwate.jp +iwate.iwate.jp +joboji.iwate.jp +kamaishi.iwate.jp +kanegasaki.iwate.jp +karumai.iwate.jp +kawai.iwate.jp +kitakami.iwate.jp +kuji.iwate.jp +kunohe.iwate.jp +kuzumaki.iwate.jp +miyako.iwate.jp +mizusawa.iwate.jp +morioka.iwate.jp +ninohe.iwate.jp +noda.iwate.jp +ofunato.iwate.jp +oshu.iwate.jp +otsuchi.iwate.jp +rikuzentakata.iwate.jp +shiwa.iwate.jp +shizukuishi.iwate.jp +sumita.iwate.jp +takizawa.iwate.jp +tanohata.iwate.jp +tono.iwate.jp +yahaba.iwate.jp +yamada.iwate.jp +ayagawa.kagawa.jp +higashikagawa.kagawa.jp +kanonji.kagawa.jp +kotohira.kagawa.jp +manno.kagawa.jp +marugame.kagawa.jp +mitoyo.kagawa.jp +naoshima.kagawa.jp +sanuki.kagawa.jp +tadotsu.kagawa.jp +takamatsu.kagawa.jp +tonosho.kagawa.jp +uchinomi.kagawa.jp +utazu.kagawa.jp +zentsuji.kagawa.jp +akune.kagoshima.jp +amami.kagoshima.jp +hioki.kagoshima.jp 
+isa.kagoshima.jp +isen.kagoshima.jp +izumi.kagoshima.jp +kagoshima.kagoshima.jp +kanoya.kagoshima.jp +kawanabe.kagoshima.jp +kinko.kagoshima.jp +kouyama.kagoshima.jp +makurazaki.kagoshima.jp +matsumoto.kagoshima.jp +minamitane.kagoshima.jp +nakatane.kagoshima.jp +nishinoomote.kagoshima.jp +satsumasendai.kagoshima.jp +soo.kagoshima.jp +tarumizu.kagoshima.jp +yusui.kagoshima.jp +aikawa.kanagawa.jp +atsugi.kanagawa.jp +ayase.kanagawa.jp +chigasaki.kanagawa.jp +ebina.kanagawa.jp +fujisawa.kanagawa.jp +hadano.kanagawa.jp +hakone.kanagawa.jp +hiratsuka.kanagawa.jp +isehara.kanagawa.jp +kaisei.kanagawa.jp +kamakura.kanagawa.jp +kiyokawa.kanagawa.jp +matsuda.kanagawa.jp +minamiashigara.kanagawa.jp +miura.kanagawa.jp +nakai.kanagawa.jp +ninomiya.kanagawa.jp +odawara.kanagawa.jp +oi.kanagawa.jp +oiso.kanagawa.jp +sagamihara.kanagawa.jp +samukawa.kanagawa.jp +tsukui.kanagawa.jp +yamakita.kanagawa.jp +yamato.kanagawa.jp +yokosuka.kanagawa.jp +yugawara.kanagawa.jp +zama.kanagawa.jp +zushi.kanagawa.jp +aki.kochi.jp +geisei.kochi.jp +hidaka.kochi.jp +higashitsuno.kochi.jp +ino.kochi.jp +kagami.kochi.jp +kami.kochi.jp +kitagawa.kochi.jp +kochi.kochi.jp +mihara.kochi.jp +motoyama.kochi.jp +muroto.kochi.jp +nahari.kochi.jp +nakamura.kochi.jp +nankoku.kochi.jp +nishitosa.kochi.jp +niyodogawa.kochi.jp +ochi.kochi.jp +okawa.kochi.jp +otoyo.kochi.jp +otsuki.kochi.jp +sakawa.kochi.jp +sukumo.kochi.jp +susaki.kochi.jp +tosa.kochi.jp +tosashimizu.kochi.jp +toyo.kochi.jp +tsuno.kochi.jp +umaji.kochi.jp +yasuda.kochi.jp +yusuhara.kochi.jp +amakusa.kumamoto.jp +arao.kumamoto.jp +aso.kumamoto.jp +choyo.kumamoto.jp +gyokuto.kumamoto.jp +hitoyoshi.kumamoto.jp +kamiamakusa.kumamoto.jp +kashima.kumamoto.jp +kikuchi.kumamoto.jp +kosa.kumamoto.jp +kumamoto.kumamoto.jp +mashiki.kumamoto.jp +mifune.kumamoto.jp +minamata.kumamoto.jp +minamioguni.kumamoto.jp +nagasu.kumamoto.jp +nishihara.kumamoto.jp +oguni.kumamoto.jp +ozu.kumamoto.jp +sumoto.kumamoto.jp +takamori.kumamoto.jp +uki.kumamoto.jp 
+uto.kumamoto.jp +yamaga.kumamoto.jp +yamato.kumamoto.jp +yatsushiro.kumamoto.jp +ayabe.kyoto.jp +fukuchiyama.kyoto.jp +higashiyama.kyoto.jp +ide.kyoto.jp +ine.kyoto.jp +joyo.kyoto.jp +kameoka.kyoto.jp +kamo.kyoto.jp +kita.kyoto.jp +kizu.kyoto.jp +kumiyama.kyoto.jp +kyotamba.kyoto.jp +kyotanabe.kyoto.jp +kyotango.kyoto.jp +maizuru.kyoto.jp +minami.kyoto.jp +minamiyamashiro.kyoto.jp +miyazu.kyoto.jp +muko.kyoto.jp +nagaokakyo.kyoto.jp +nakagyo.kyoto.jp +nantan.kyoto.jp +oyamazaki.kyoto.jp +sakyo.kyoto.jp +seika.kyoto.jp +tanabe.kyoto.jp +uji.kyoto.jp +ujitawara.kyoto.jp +wazuka.kyoto.jp +yamashina.kyoto.jp +yawata.kyoto.jp +asahi.mie.jp +inabe.mie.jp +ise.mie.jp +kameyama.mie.jp +kawagoe.mie.jp +kiho.mie.jp +kisosaki.mie.jp +kiwa.mie.jp +komono.mie.jp +kumano.mie.jp +kuwana.mie.jp +matsusaka.mie.jp +meiwa.mie.jp +mihama.mie.jp +minamiise.mie.jp +misugi.mie.jp +miyama.mie.jp +nabari.mie.jp +shima.mie.jp +suzuka.mie.jp +tado.mie.jp +taiki.mie.jp +taki.mie.jp +tamaki.mie.jp +toba.mie.jp +tsu.mie.jp +udono.mie.jp +ureshino.mie.jp +watarai.mie.jp +yokkaichi.mie.jp +furukawa.miyagi.jp +higashimatsushima.miyagi.jp +ishinomaki.miyagi.jp +iwanuma.miyagi.jp +kakuda.miyagi.jp +kami.miyagi.jp +kawasaki.miyagi.jp +kesennuma.miyagi.jp +marumori.miyagi.jp +matsushima.miyagi.jp +minamisanriku.miyagi.jp +misato.miyagi.jp +murata.miyagi.jp +natori.miyagi.jp +ogawara.miyagi.jp +ohira.miyagi.jp +onagawa.miyagi.jp +osaki.miyagi.jp +rifu.miyagi.jp +semine.miyagi.jp +shibata.miyagi.jp +shichikashuku.miyagi.jp +shikama.miyagi.jp +shiogama.miyagi.jp +shiroishi.miyagi.jp +tagajo.miyagi.jp +taiwa.miyagi.jp +tome.miyagi.jp +tomiya.miyagi.jp +wakuya.miyagi.jp +watari.miyagi.jp +yamamoto.miyagi.jp +zao.miyagi.jp +aya.miyazaki.jp +ebino.miyazaki.jp +gokase.miyazaki.jp +hyuga.miyazaki.jp +kadogawa.miyazaki.jp +kawaminami.miyazaki.jp +kijo.miyazaki.jp +kitagawa.miyazaki.jp +kitakata.miyazaki.jp +kitaura.miyazaki.jp +kobayashi.miyazaki.jp +kunitomi.miyazaki.jp +kushima.miyazaki.jp 
+mimata.miyazaki.jp +miyakonojo.miyazaki.jp +miyazaki.miyazaki.jp +morotsuka.miyazaki.jp +nichinan.miyazaki.jp +nishimera.miyazaki.jp +nobeoka.miyazaki.jp +saito.miyazaki.jp +shiiba.miyazaki.jp +shintomi.miyazaki.jp +takaharu.miyazaki.jp +takanabe.miyazaki.jp +takazaki.miyazaki.jp +tsuno.miyazaki.jp +achi.nagano.jp +agematsu.nagano.jp +anan.nagano.jp +aoki.nagano.jp +asahi.nagano.jp +azumino.nagano.jp +chikuhoku.nagano.jp +chikuma.nagano.jp +chino.nagano.jp +fujimi.nagano.jp +hakuba.nagano.jp +hara.nagano.jp +hiraya.nagano.jp +iida.nagano.jp +iijima.nagano.jp +iiyama.nagano.jp +iizuna.nagano.jp +ikeda.nagano.jp +ikusaka.nagano.jp +ina.nagano.jp +karuizawa.nagano.jp +kawakami.nagano.jp +kiso.nagano.jp +kisofukushima.nagano.jp +kitaaiki.nagano.jp +komagane.nagano.jp +komoro.nagano.jp +matsukawa.nagano.jp +matsumoto.nagano.jp +miasa.nagano.jp +minamiaiki.nagano.jp +minamimaki.nagano.jp +minamiminowa.nagano.jp +minowa.nagano.jp +miyada.nagano.jp +miyota.nagano.jp +mochizuki.nagano.jp +nagano.nagano.jp +nagawa.nagano.jp +nagiso.nagano.jp +nakagawa.nagano.jp +nakano.nagano.jp +nozawaonsen.nagano.jp +obuse.nagano.jp +ogawa.nagano.jp +okaya.nagano.jp +omachi.nagano.jp +omi.nagano.jp +ookuwa.nagano.jp +ooshika.nagano.jp +otaki.nagano.jp +otari.nagano.jp +sakae.nagano.jp +sakaki.nagano.jp +saku.nagano.jp +sakuho.nagano.jp +shimosuwa.nagano.jp +shinanomachi.nagano.jp +shiojiri.nagano.jp +suwa.nagano.jp +suzaka.nagano.jp +takagi.nagano.jp +takamori.nagano.jp +takayama.nagano.jp +tateshina.nagano.jp +tatsuno.nagano.jp +togakushi.nagano.jp +togura.nagano.jp +tomi.nagano.jp +ueda.nagano.jp +wada.nagano.jp +yamagata.nagano.jp +yamanouchi.nagano.jp +yasaka.nagano.jp +yasuoka.nagano.jp +chijiwa.nagasaki.jp +futsu.nagasaki.jp +goto.nagasaki.jp +hasami.nagasaki.jp +hirado.nagasaki.jp +iki.nagasaki.jp +isahaya.nagasaki.jp +kawatana.nagasaki.jp +kuchinotsu.nagasaki.jp +matsuura.nagasaki.jp +nagasaki.nagasaki.jp +obama.nagasaki.jp +omura.nagasaki.jp +oseto.nagasaki.jp +saikai.nagasaki.jp 
+sasebo.nagasaki.jp +seihi.nagasaki.jp +shimabara.nagasaki.jp +shinkamigoto.nagasaki.jp +togitsu.nagasaki.jp +tsushima.nagasaki.jp +unzen.nagasaki.jp +ando.nara.jp +gose.nara.jp +heguri.nara.jp +higashiyoshino.nara.jp +ikaruga.nara.jp +ikoma.nara.jp +kamikitayama.nara.jp +kanmaki.nara.jp +kashiba.nara.jp +kashihara.nara.jp +katsuragi.nara.jp +kawai.nara.jp +kawakami.nara.jp +kawanishi.nara.jp +koryo.nara.jp +kurotaki.nara.jp +mitsue.nara.jp +miyake.nara.jp +nara.nara.jp +nosegawa.nara.jp +oji.nara.jp +ouda.nara.jp +oyodo.nara.jp +sakurai.nara.jp +sango.nara.jp +shimoichi.nara.jp +shimokitayama.nara.jp +shinjo.nara.jp +soni.nara.jp +takatori.nara.jp +tawaramoto.nara.jp +tenkawa.nara.jp +tenri.nara.jp +uda.nara.jp +yamatokoriyama.nara.jp +yamatotakada.nara.jp +yamazoe.nara.jp +yoshino.nara.jp +aga.niigata.jp +agano.niigata.jp +gosen.niigata.jp +itoigawa.niigata.jp +izumozaki.niigata.jp +joetsu.niigata.jp +kamo.niigata.jp +kariwa.niigata.jp +kashiwazaki.niigata.jp +minamiuonuma.niigata.jp +mitsuke.niigata.jp +muika.niigata.jp +murakami.niigata.jp +myoko.niigata.jp +nagaoka.niigata.jp +niigata.niigata.jp +ojiya.niigata.jp +omi.niigata.jp +sado.niigata.jp +sanjo.niigata.jp +seiro.niigata.jp +seirou.niigata.jp +sekikawa.niigata.jp +shibata.niigata.jp +tagami.niigata.jp +tainai.niigata.jp +tochio.niigata.jp +tokamachi.niigata.jp +tsubame.niigata.jp +tsunan.niigata.jp +uonuma.niigata.jp +yahiko.niigata.jp +yoita.niigata.jp +yuzawa.niigata.jp +beppu.oita.jp +bungoono.oita.jp +bungotakada.oita.jp +hasama.oita.jp +hiji.oita.jp +himeshima.oita.jp +hita.oita.jp +kamitsue.oita.jp +kokonoe.oita.jp +kuju.oita.jp +kunisaki.oita.jp +kusu.oita.jp +oita.oita.jp +saiki.oita.jp +taketa.oita.jp +tsukumi.oita.jp +usa.oita.jp +usuki.oita.jp +yufu.oita.jp +akaiwa.okayama.jp +asakuchi.okayama.jp +bizen.okayama.jp +hayashima.okayama.jp +ibara.okayama.jp +kagamino.okayama.jp +kasaoka.okayama.jp +kibichuo.okayama.jp +kumenan.okayama.jp +kurashiki.okayama.jp +maniwa.okayama.jp +misaki.okayama.jp 
+nagi.okayama.jp +niimi.okayama.jp +nishiawakura.okayama.jp +okayama.okayama.jp +satosho.okayama.jp +setouchi.okayama.jp +shinjo.okayama.jp +shoo.okayama.jp +soja.okayama.jp +takahashi.okayama.jp +tamano.okayama.jp +tsuyama.okayama.jp +wake.okayama.jp +yakage.okayama.jp +aguni.okinawa.jp +ginowan.okinawa.jp +ginoza.okinawa.jp +gushikami.okinawa.jp +haebaru.okinawa.jp +higashi.okinawa.jp +hirara.okinawa.jp +iheya.okinawa.jp +ishigaki.okinawa.jp +ishikawa.okinawa.jp +itoman.okinawa.jp +izena.okinawa.jp +kadena.okinawa.jp +kin.okinawa.jp +kitadaito.okinawa.jp +kitanakagusuku.okinawa.jp +kumejima.okinawa.jp +kunigami.okinawa.jp +minamidaito.okinawa.jp +motobu.okinawa.jp +nago.okinawa.jp +naha.okinawa.jp +nakagusuku.okinawa.jp +nakijin.okinawa.jp +nanjo.okinawa.jp +nishihara.okinawa.jp +ogimi.okinawa.jp +okinawa.okinawa.jp +onna.okinawa.jp +shimoji.okinawa.jp +taketomi.okinawa.jp +tarama.okinawa.jp +tokashiki.okinawa.jp +tomigusuku.okinawa.jp +tonaki.okinawa.jp +urasoe.okinawa.jp +uruma.okinawa.jp +yaese.okinawa.jp +yomitan.okinawa.jp +yonabaru.okinawa.jp +yonaguni.okinawa.jp +zamami.okinawa.jp +abeno.osaka.jp +chihayaakasaka.osaka.jp +chuo.osaka.jp +daito.osaka.jp +fujiidera.osaka.jp +habikino.osaka.jp +hannan.osaka.jp +higashiosaka.osaka.jp +higashisumiyoshi.osaka.jp +higashiyodogawa.osaka.jp +hirakata.osaka.jp +ibaraki.osaka.jp +ikeda.osaka.jp +izumi.osaka.jp +izumiotsu.osaka.jp +izumisano.osaka.jp +kadoma.osaka.jp +kaizuka.osaka.jp +kanan.osaka.jp +kashiwara.osaka.jp +katano.osaka.jp +kawachinagano.osaka.jp +kishiwada.osaka.jp +kita.osaka.jp +kumatori.osaka.jp +matsubara.osaka.jp +minato.osaka.jp +minoh.osaka.jp +misaki.osaka.jp +moriguchi.osaka.jp +neyagawa.osaka.jp +nishi.osaka.jp +nose.osaka.jp +osakasayama.osaka.jp +sakai.osaka.jp +sayama.osaka.jp +sennan.osaka.jp +settsu.osaka.jp +shijonawate.osaka.jp +shimamoto.osaka.jp +suita.osaka.jp +tadaoka.osaka.jp +taishi.osaka.jp +tajiri.osaka.jp +takaishi.osaka.jp +takatsuki.osaka.jp +tondabayashi.osaka.jp 
+toyonaka.osaka.jp +toyono.osaka.jp +yao.osaka.jp +ariake.saga.jp +arita.saga.jp +fukudomi.saga.jp +genkai.saga.jp +hamatama.saga.jp +hizen.saga.jp +imari.saga.jp +kamimine.saga.jp +kanzaki.saga.jp +karatsu.saga.jp +kashima.saga.jp +kitagata.saga.jp +kitahata.saga.jp +kiyama.saga.jp +kouhoku.saga.jp +kyuragi.saga.jp +nishiarita.saga.jp +ogi.saga.jp +omachi.saga.jp +ouchi.saga.jp +saga.saga.jp +shiroishi.saga.jp +taku.saga.jp +tara.saga.jp +tosu.saga.jp +yoshinogari.saga.jp +arakawa.saitama.jp +asaka.saitama.jp +chichibu.saitama.jp +fujimi.saitama.jp +fujimino.saitama.jp +fukaya.saitama.jp +hanno.saitama.jp +hanyu.saitama.jp +hasuda.saitama.jp +hatogaya.saitama.jp +hatoyama.saitama.jp +hidaka.saitama.jp +higashichichibu.saitama.jp +higashimatsuyama.saitama.jp +honjo.saitama.jp +ina.saitama.jp +iruma.saitama.jp +iwatsuki.saitama.jp +kamiizumi.saitama.jp +kamikawa.saitama.jp +kamisato.saitama.jp +kasukabe.saitama.jp +kawagoe.saitama.jp +kawaguchi.saitama.jp +kawajima.saitama.jp +kazo.saitama.jp +kitamoto.saitama.jp +koshigaya.saitama.jp +kounosu.saitama.jp +kuki.saitama.jp +kumagaya.saitama.jp +matsubushi.saitama.jp +minano.saitama.jp +misato.saitama.jp +miyashiro.saitama.jp +miyoshi.saitama.jp +moroyama.saitama.jp +nagatoro.saitama.jp +namegawa.saitama.jp +niiza.saitama.jp +ogano.saitama.jp +ogawa.saitama.jp +ogose.saitama.jp +okegawa.saitama.jp +omiya.saitama.jp +otaki.saitama.jp +ranzan.saitama.jp +ryokami.saitama.jp +saitama.saitama.jp +sakado.saitama.jp +satte.saitama.jp +sayama.saitama.jp +shiki.saitama.jp +shiraoka.saitama.jp +soka.saitama.jp +sugito.saitama.jp +toda.saitama.jp +tokigawa.saitama.jp +tokorozawa.saitama.jp +tsurugashima.saitama.jp +urawa.saitama.jp +warabi.saitama.jp +yashio.saitama.jp +yokoze.saitama.jp +yono.saitama.jp +yorii.saitama.jp +yoshida.saitama.jp +yoshikawa.saitama.jp +yoshimi.saitama.jp +aisho.shiga.jp +gamo.shiga.jp +higashiomi.shiga.jp +hikone.shiga.jp +koka.shiga.jp +konan.shiga.jp +kosei.shiga.jp +koto.shiga.jp +kusatsu.shiga.jp 
+maibara.shiga.jp +moriyama.shiga.jp +nagahama.shiga.jp +nishiazai.shiga.jp +notogawa.shiga.jp +omihachiman.shiga.jp +otsu.shiga.jp +ritto.shiga.jp +ryuoh.shiga.jp +takashima.shiga.jp +takatsuki.shiga.jp +torahime.shiga.jp +toyosato.shiga.jp +yasu.shiga.jp +akagi.shimane.jp +ama.shimane.jp +gotsu.shimane.jp +hamada.shimane.jp +higashiizumo.shimane.jp +hikawa.shimane.jp +hikimi.shimane.jp +izumo.shimane.jp +kakinoki.shimane.jp +masuda.shimane.jp +matsue.shimane.jp +misato.shimane.jp +nishinoshima.shimane.jp +ohda.shimane.jp +okinoshima.shimane.jp +okuizumo.shimane.jp +shimane.shimane.jp +tamayu.shimane.jp +tsuwano.shimane.jp +unnan.shimane.jp +yakumo.shimane.jp +yasugi.shimane.jp +yatsuka.shimane.jp +arai.shizuoka.jp +atami.shizuoka.jp +fuji.shizuoka.jp +fujieda.shizuoka.jp +fujikawa.shizuoka.jp +fujinomiya.shizuoka.jp +fukuroi.shizuoka.jp +gotemba.shizuoka.jp +haibara.shizuoka.jp +hamamatsu.shizuoka.jp +higashiizu.shizuoka.jp +ito.shizuoka.jp +iwata.shizuoka.jp +izu.shizuoka.jp +izunokuni.shizuoka.jp +kakegawa.shizuoka.jp +kannami.shizuoka.jp +kawanehon.shizuoka.jp +kawazu.shizuoka.jp +kikugawa.shizuoka.jp +kosai.shizuoka.jp +makinohara.shizuoka.jp +matsuzaki.shizuoka.jp +minamiizu.shizuoka.jp +mishima.shizuoka.jp +morimachi.shizuoka.jp +nishiizu.shizuoka.jp +numazu.shizuoka.jp +omaezaki.shizuoka.jp +shimada.shizuoka.jp +shimizu.shizuoka.jp +shimoda.shizuoka.jp +shizuoka.shizuoka.jp +susono.shizuoka.jp +yaizu.shizuoka.jp +yoshida.shizuoka.jp +ashikaga.tochigi.jp +bato.tochigi.jp +haga.tochigi.jp +ichikai.tochigi.jp +iwafune.tochigi.jp +kaminokawa.tochigi.jp +kanuma.tochigi.jp +karasuyama.tochigi.jp +kuroiso.tochigi.jp +mashiko.tochigi.jp +mibu.tochigi.jp +moka.tochigi.jp +motegi.tochigi.jp +nasu.tochigi.jp +nasushiobara.tochigi.jp +nikko.tochigi.jp +nishikata.tochigi.jp +nogi.tochigi.jp +ohira.tochigi.jp +ohtawara.tochigi.jp +oyama.tochigi.jp +sakura.tochigi.jp +sano.tochigi.jp +shimotsuke.tochigi.jp +shioya.tochigi.jp +takanezawa.tochigi.jp +tochigi.tochigi.jp 
+tsuga.tochigi.jp +ujiie.tochigi.jp +utsunomiya.tochigi.jp +yaita.tochigi.jp +aizumi.tokushima.jp +anan.tokushima.jp +ichiba.tokushima.jp +itano.tokushima.jp +kainan.tokushima.jp +komatsushima.tokushima.jp +matsushige.tokushima.jp +mima.tokushima.jp +minami.tokushima.jp +miyoshi.tokushima.jp +mugi.tokushima.jp +nakagawa.tokushima.jp +naruto.tokushima.jp +sanagochi.tokushima.jp +shishikui.tokushima.jp +tokushima.tokushima.jp +wajiki.tokushima.jp +adachi.tokyo.jp +akiruno.tokyo.jp +akishima.tokyo.jp +aogashima.tokyo.jp +arakawa.tokyo.jp +bunkyo.tokyo.jp +chiyoda.tokyo.jp +chofu.tokyo.jp +chuo.tokyo.jp +edogawa.tokyo.jp +fuchu.tokyo.jp +fussa.tokyo.jp +hachijo.tokyo.jp +hachioji.tokyo.jp +hamura.tokyo.jp +higashikurume.tokyo.jp +higashimurayama.tokyo.jp +higashiyamato.tokyo.jp +hino.tokyo.jp +hinode.tokyo.jp +hinohara.tokyo.jp +inagi.tokyo.jp +itabashi.tokyo.jp +katsushika.tokyo.jp +kita.tokyo.jp +kiyose.tokyo.jp +kodaira.tokyo.jp +koganei.tokyo.jp +kokubunji.tokyo.jp +komae.tokyo.jp +koto.tokyo.jp +kouzushima.tokyo.jp +kunitachi.tokyo.jp +machida.tokyo.jp +meguro.tokyo.jp +minato.tokyo.jp +mitaka.tokyo.jp +mizuho.tokyo.jp +musashimurayama.tokyo.jp +musashino.tokyo.jp +nakano.tokyo.jp +nerima.tokyo.jp +ogasawara.tokyo.jp +okutama.tokyo.jp +ome.tokyo.jp +oshima.tokyo.jp +ota.tokyo.jp +setagaya.tokyo.jp +shibuya.tokyo.jp +shinagawa.tokyo.jp +shinjuku.tokyo.jp +suginami.tokyo.jp +sumida.tokyo.jp +tachikawa.tokyo.jp +taito.tokyo.jp +tama.tokyo.jp +toshima.tokyo.jp +chizu.tottori.jp +hino.tottori.jp +kawahara.tottori.jp +koge.tottori.jp +kotoura.tottori.jp +misasa.tottori.jp +nanbu.tottori.jp +nichinan.tottori.jp +sakaiminato.tottori.jp +tottori.tottori.jp +wakasa.tottori.jp +yazu.tottori.jp +yonago.tottori.jp +asahi.toyama.jp +fuchu.toyama.jp +fukumitsu.toyama.jp +funahashi.toyama.jp +himi.toyama.jp +imizu.toyama.jp +inami.toyama.jp +johana.toyama.jp +kamiichi.toyama.jp +kurobe.toyama.jp +nakaniikawa.toyama.jp +namerikawa.toyama.jp +nanto.toyama.jp +nyuzen.toyama.jp 
+oyabe.toyama.jp +taira.toyama.jp +takaoka.toyama.jp +tateyama.toyama.jp +toga.toyama.jp +tonami.toyama.jp +toyama.toyama.jp +unazuki.toyama.jp +uozu.toyama.jp +yamada.toyama.jp +arida.wakayama.jp +aridagawa.wakayama.jp +gobo.wakayama.jp +hashimoto.wakayama.jp +hidaka.wakayama.jp +hirogawa.wakayama.jp +inami.wakayama.jp +iwade.wakayama.jp +kainan.wakayama.jp +kamitonda.wakayama.jp +katsuragi.wakayama.jp +kimino.wakayama.jp +kinokawa.wakayama.jp +kitayama.wakayama.jp +koya.wakayama.jp +koza.wakayama.jp +kozagawa.wakayama.jp +kudoyama.wakayama.jp +kushimoto.wakayama.jp +mihama.wakayama.jp +misato.wakayama.jp +nachikatsuura.wakayama.jp +shingu.wakayama.jp +shirahama.wakayama.jp +taiji.wakayama.jp +tanabe.wakayama.jp +wakayama.wakayama.jp +yuasa.wakayama.jp +yura.wakayama.jp +asahi.yamagata.jp +funagata.yamagata.jp +higashine.yamagata.jp +iide.yamagata.jp +kahoku.yamagata.jp +kaminoyama.yamagata.jp +kaneyama.yamagata.jp +kawanishi.yamagata.jp +mamurogawa.yamagata.jp +mikawa.yamagata.jp +murayama.yamagata.jp +nagai.yamagata.jp +nakayama.yamagata.jp +nanyo.yamagata.jp +nishikawa.yamagata.jp +obanazawa.yamagata.jp +oe.yamagata.jp +oguni.yamagata.jp +ohkura.yamagata.jp +oishida.yamagata.jp +sagae.yamagata.jp +sakata.yamagata.jp +sakegawa.yamagata.jp +shinjo.yamagata.jp +shirataka.yamagata.jp +shonai.yamagata.jp +takahata.yamagata.jp +tendo.yamagata.jp +tozawa.yamagata.jp +tsuruoka.yamagata.jp +yamagata.yamagata.jp +yamanobe.yamagata.jp +yonezawa.yamagata.jp +yuza.yamagata.jp +abu.yamaguchi.jp +hagi.yamaguchi.jp +hikari.yamaguchi.jp +hofu.yamaguchi.jp +iwakuni.yamaguchi.jp +kudamatsu.yamaguchi.jp +mitou.yamaguchi.jp +nagato.yamaguchi.jp +oshima.yamaguchi.jp +shimonoseki.yamaguchi.jp +shunan.yamaguchi.jp +tabuse.yamaguchi.jp +tokuyama.yamaguchi.jp +toyota.yamaguchi.jp +ube.yamaguchi.jp +yuu.yamaguchi.jp +chuo.yamanashi.jp +doshi.yamanashi.jp +fuefuki.yamanashi.jp +fujikawa.yamanashi.jp +fujikawaguchiko.yamanashi.jp +fujiyoshida.yamanashi.jp +hayakawa.yamanashi.jp 
+hokuto.yamanashi.jp +ichikawamisato.yamanashi.jp +kai.yamanashi.jp +kofu.yamanashi.jp +koshu.yamanashi.jp +kosuge.yamanashi.jp +minami-alps.yamanashi.jp +minobu.yamanashi.jp +nakamichi.yamanashi.jp +nanbu.yamanashi.jp +narusawa.yamanashi.jp +nirasaki.yamanashi.jp +nishikatsura.yamanashi.jp +oshino.yamanashi.jp +otsuki.yamanashi.jp +showa.yamanashi.jp +tabayama.yamanashi.jp +tsuru.yamanashi.jp +uenohara.yamanashi.jp +yamanakako.yamanashi.jp +yamanashi.yamanashi.jp // ke : http://www.kenic.or.ke/index.php?option=com_content&task=view&id=117&Itemid=145 *.ke @@ -2579,6 +4181,7 @@ name.my // mz : http://www.gobin.info/domainname/mz-template.doc *.mz +!teledata.mz // na : http://www.na-nic.com.na/ // http://www.info.na/domain/ @@ -3714,6 +5317,9 @@ org.pn edu.pn net.pn +// post : http://en.wikipedia.org/wiki/.post +post + // pr : http://www.nic.pr/index.asp?f=1 pr com.pr @@ -3772,8 +5378,16 @@ ed.pw go.pw belau.pw -// py : http://www.nic.py/faq_a.html#faq_b -*.py +// py : http://www.nic.py/pautas.html#seccion_9 +// Confirmed by registry 2012-10-03 +py +com.py +coop.py +edu.py +gov.py +mil.py +net.py +org.py // qa : http://domains.qa/en/ qa @@ -4004,6 +5618,7 @@ net.sd org.sd edu.sd med.sd +tv.sd gov.sd info.sd @@ -4051,7 +5666,7 @@ x.se y.se z.se -// sg : http://www.nic.net.sg/sub_policies_agreement/2ld.html +// sg : http://www.nic.net.sg/page/registration-policies-procedures-and-guidelines sg com.sg net.sg @@ -4060,9 +5675,13 @@ gov.sg edu.sg per.sg -// sh : http://www.nic.sh/rules.html -// list of 2nd level domains ? 
+// sh : http://www.nic.sh/registrar.html sh +com.sh +net.sh +gov.sh +org.sh +mil.sh // si : http://en.wikipedia.org/wiki/.si si @@ -4126,6 +5745,11 @@ su // sv : http://www.svnet.org.sv/svpolicy.html *.sv +// sx : http://en.wikipedia.org/wiki/.sx +// Confirmed by registry 2012-05-31 +sx +gov.sx + // sy : http://en.wikipedia.org/wiki/.sy // see also: http://www.gobin.info/domainname/sy.doc sy @@ -4157,8 +5781,7 @@ tel tf // tg : http://en.wikipedia.org/wiki/.tg -// http://www.nic.tg/nictg/index.php implies no reserved 2nd-level domains, -// although this contradicts wikipedia. +// http://www.nic.tg/ tg // th : http://en.wikipedia.org/wiki/.th @@ -4172,7 +5795,7 @@ mi.th net.th or.th -// tj : http://www.nic.tj/policy.htm +// tj : http://www.nic.tj/policy.html tj ac.tj biz.tj @@ -4197,9 +5820,16 @@ tk tl gov.tl -// tm : http://www.nic.tm/rules.html -// list of 2nd level tlds ? +// tm : http://www.nic.tm/local.html tm +com.tm +co.tm +org.tm +net.tm +nom.tm +gov.tm +mil.tm +edu.tm // tn : http://en.wikipedia.org/wiki/.tn // http://whois.ati.tn/ @@ -4286,101 +5916,133 @@ club.tw 組織.tw 商業.tw -// tz : http://en.wikipedia.org/wiki/.tz -// Submitted by registry 2008-06-17 -// Updated from http://www.tznic.or.tz/index.php/domains.html 2010-10-25 +// tz : http://www.tznic.or.tz/index.php/domains +// Confirmed by registry 2013-01-22 ac.tz co.tz go.tz +hotel.tz +info.tz +me.tz mil.tz +mobi.tz ne.tz or.tz sc.tz +tv.tz -// ua : http://www.nic.net.ua/ +// ua : https://hostmaster.ua/policy/?ua +// Submitted by registry 2012-04-27 ua +// ua 2LD com.ua edu.ua gov.ua in.ua net.ua org.ua -// ua geo-names +// ua geographic names +// https://hostmaster.ua/2ld/ cherkassy.ua +cherkasy.ua chernigov.ua +chernihiv.ua +chernivtsi.ua chernovtsy.ua ck.ua cn.ua +cr.ua crimea.ua cv.ua dn.ua dnepropetrovsk.ua +dnipropetrovsk.ua +dominic.ua donetsk.ua dp.ua if.ua ivano-frankivsk.ua kh.ua +kharkiv.ua kharkov.ua kherson.ua khmelnitskiy.ua +khmelnytskyi.ua kiev.ua kirovograd.ua km.ua kr.ua +krym.ua 
ks.ua kv.ua +kyiv.ua lg.ua +lt.ua lugansk.ua lutsk.ua +lv.ua lviv.ua mk.ua +mykolaiv.ua nikolaev.ua od.ua +odesa.ua odessa.ua pl.ua poltava.ua +rivne.ua rovno.ua rv.ua +sb.ua sebastopol.ua +sevastopol.ua +sm.ua sumy.ua te.ua ternopil.ua +uz.ua uzhgorod.ua vinnica.ua +vinnytsia.ua vn.ua +volyn.ua +yalta.ua zaporizhzhe.ua -zp.ua +zaporizhzhia.ua zhitomir.ua +zhytomyr.ua +zp.ua zt.ua // Private registries in .ua co.ua pp.ua -// ug : http://www.registry.co.ug/ +// ug : https://www.registry.co.ug/ ug co.ug +or.ug ac.ug sc.ug go.ug ne.ug -or.ug +com.ug +org.ug // uk : http://en.wikipedia.org/wiki/.uk +// Submitted by registry 2012-10-02 +// and tweaked by us pending further consultation. *.uk *.sch.uk !bl.uk !british-library.uk -!icnet.uk !jet.uk !mod.uk +!national-library-scotland.uk !nel.uk -!nhs.uk !nic.uk !nls.uk -!national-library-scotland.uk !parliament.uk -!police.uk // us : http://en.wikipedia.org/wiki/.us us @@ -4627,14 +6289,21 @@ pvt.k12.ma.us chtr.k12.ma.us paroch.k12.ma.us -// uy : http://www.antel.com.uy/ -*.uy +// uy : http://www.nic.org.uy/ +uy +com.uy +edu.uy +gub.uy +mil.uy +net.uy +org.uy -// uz : http://www.reg.uz/registerr.html -// are there other 2nd level tlds ? 
+// uz : http://www.reg.uz/ uz -com.uz co.uz +com.uz +net.uz +org.uz // va : http://en.wikipedia.org/wiki/.va va @@ -4649,8 +6318,19 @@ gov.vc mil.vc edu.vc -// ve : http://registro.nic.ve/nicve/registro/index.html -*.ve +// ve : https://registro.nic.ve/ +// Confirmed by registry 2012-10-04 +ve +co.ve +com.ve +e12.ve +edu.ve +gov.ve +info.ve +mil.ve +net.ve +org.ve +web.ve // vg : http://en.wikipedia.org/wiki/.vg vg @@ -4708,7 +6388,7 @@ yt // // xn--mgbaam7a8h ("Emerat" Arabic) : AE -//http://nic.ae/english/arabicdomain/rules.jsp +// http://nic.ae/english/arabicdomain/rules.jsp امارات // xn--54b7fta0cc ("Bangla" Bangla) : BD @@ -4772,9 +6452,9 @@ yt // xn--mgba3a4fra ("Iran" Arabic) : IR ايران -//xn--mgbayh7gpa ("al-Ordon" Arabic) JO -//National Information Technology Center (NITC) -//Royal Scientific Society, Al-Jubeiha +// xn--mgbayh7gpa ("al-Ordon" Arabic) : JO +// National Information Technology Center (NITC) +// Royal Scientific Society, Al-Jubeiha الاردن // xn--3e0b707e ("Republic of Korea" Hangul) : KR @@ -4878,27 +6558,75 @@ xxx // ===END ICANN DOMAINS=== // ===BEGIN PRIVATE DOMAINS=== -// info.at : http://www.info.at/ -biz.at -info.at - -// priv.at : http://www.nic.priv.at/ -// Submitted by registry 2008-06-09 -priv.at - -// co.ca : http://registry.co.ca -co.ca +// Amazon CloudFront : https://aws.amazon.com/cloudfront/ +// Requested by Donavan Miller 2013-03-22 +cloudfront.net + +// Amazon Elastic Compute Cloud: https://aws.amazon.com/ec2/ +// Requested by Osman Surkatty 2013-04-02 +compute.amazonaws.com +us-east-1.amazonaws.com +compute-1.amazonaws.com +z-1.compute-1.amazonaws.com +z-2.compute-1.amazonaws.com +ap-northeast-1.compute.amazonaws.com +ap-southeast-1.compute.amazonaws.com +ap-southeast-2.compute.amazonaws.com +eu-west-1.compute.amazonaws.com +sa-east-1.compute.amazonaws.com +us-gov-west-1.compute.amazonaws.com +us-west-1.compute.amazonaws.com +us-west-2.compute.amazonaws.com + +// Amazon Elastic Beanstalk : 
https://aws.amazon.com/elasticbeanstalk/ +// Requested by Adam Stein 2013-04-02 +elasticbeanstalk.com + +// Amazon Elastic Load Balancing : https://aws.amazon.com/elasticloadbalancing/ +// Requested by Scott Vidmar 2013-03-27 +elb.amazonaws.com + +// Amazon S3 : https://aws.amazon.com/s3/ +// Requested by Courtney Eckhardt 2013-03-22 +s3.amazonaws.com +s3-us-west-2.amazonaws.com +s3-us-west-1.amazonaws.com +s3-eu-west-1.amazonaws.com +s3-ap-southeast-1.amazonaws.com +s3-ap-southeast-2.amazonaws.com +s3-ap-northeast-1.amazonaws.com +s3-sa-east-1.amazonaws.com +s3-us-gov-west-1.amazonaws.com +s3-fips-us-gov-west-1.amazonaws.com +s3-website-us-east-1.amazonaws.com +s3-website-us-west-2.amazonaws.com +s3-website-us-west-1.amazonaws.com +s3-website-eu-west-1.amazonaws.com +s3-website-ap-southeast-1.amazonaws.com +s3-website-ap-southeast-2.amazonaws.com +s3-website-ap-northeast-1.amazonaws.com +s3-website-sa-east-1.amazonaws.com +s3-website-us-gov-west-1.amazonaws.com + +// BetaInABox +// Requested by adrian@betainabox.com 2012-09-13 +betainabox.com // CentralNic : http://www.centralnic.com/names/domains -// Confirmed by registry 2008-06-09 +// Requested by registry 2012-09-27 +ae.org ar.com br.com cn.com +com.de de.com eu.com gb.com +gb.net gr.com hu.com +hu.net +jp.net jpn.com kr.com no.com @@ -4906,44 +6634,32 @@ qc.com ru.com sa.com se.com +se.net uk.com +uk.net us.com +us.org uy.com za.com -gb.net -jp.net -se.net -uk.net -ae.org -us.org -com.de - -// Opera Software, A.S.A. -// Requested by Yngve Pettersen 2009-11-26 -operaunite.com - -// Google, Inc. -// Requested by Eduardo Vela 2010-09-06 -appspot.com - -// iki.fi : Submitted by Hannu Aronsson 2009-11-05 -iki.fi // c.la : http://www.c.la/ c.la -// ZaNiC : http://www.za.net/ -// Confirmed by registry 2009-10-03 -za.net -za.org +// cloudControl : https://www.cloudcontrol.com/ +// Requested by Tobias Wilken 2013-07-23 +cloudcontrolled.com +cloudcontrolapp.com + +// co.ca : http://registry.co.ca/ +co.ca // CoDNS B.V. 
-// Added 2010-05-23. co.nl co.no -// Mainseek Sp. z o.o. : http://www.co.pl/ -co.pl +// DreamHost : http://www.dreamhost.com/ +// Requested by Andrew Farmer 2012-10-02 +dreamhosters.com // DynDNS.com : http://www.dyndns.com/services/dns/dyndns/ dyndns-at-home.com @@ -5226,4 +6942,104 @@ webhop.org worse-than.tv writesthisblog.com +// Fastly Inc. http://www.fastly.com/ +// Requested by Vladimir Vuksan 2013-05-31 +a.ssl.fastly.net +b.ssl.fastly.net +global.ssl.fastly.net +a.prod.fastly.net +global.prod.fastly.net + +// GitHub, Inc. +// Requested by Ben Toews 2013-04-18 +github.io + +// GlobeHosting, Inc. +// Requested by Zoltan Egresi 2013-07-12 +ro.com + +// Google, Inc. +// Requested by Eduardo Vela 2012-10-24 +appspot.com +blogspot.be +blogspot.bj +blogspot.ca +blogspot.cf +blogspot.ch +blogspot.co.at +blogspot.co.il +blogspot.co.nz +blogspot.co.uk +blogspot.com +blogspot.com.ar +blogspot.com.au +blogspot.com.br +blogspot.com.es +blogspot.cv +blogspot.cz +blogspot.de +blogspot.dk +blogspot.fi +blogspot.fr +blogspot.gr +blogspot.hk +blogspot.hu +blogspot.ie +blogspot.in +blogspot.it +blogspot.jp +blogspot.kr +blogspot.mr +blogspot.mx +blogspot.nl +blogspot.no +blogspot.pt +blogspot.re +blogspot.ro +blogspot.se +blogspot.sg +blogspot.sk +blogspot.td +blogspot.tw +codespot.com +googleapis.com +googlecode.com + +// Heroku : https://www.heroku.com/ +// Requested by Tom Maher 2013-05-02 +herokuapp.com +herokussl.com + +// iki.fi +// Requested by Hannu Aronsson 2009-11-05 +iki.fi + +// info.at : http://www.info.at/ +biz.at +info.at + +// Michau Enterprises Limited : http://www.co.pl/ +co.pl + +// NYC.mn : http://www.information.nyc.mn +// Requested by Matthew Brown 2013-03-11 +nyc.mn + +// Opera Software, A.S.A. +// Requested by Yngve Pettersen 2009-11-26 +operaunite.com + +// Red Hat, Inc. 
OpenShift : https://openshift.redhat.com/ +// Requested by Tim Kramer 2012-10-24 +rhcloud.com + +// priv.at : http://www.nic.priv.at/ +// Requested by registry 2008-06-09 +priv.at + +// ZaNiC : http://www.za.net/ +// Requested by registry 2009-10-03 +za.net +za.org + // ===END PRIVATE DOMAINS=== diff --git a/src/test/java/org/archive/url/PublicSuffixesTest.java b/src/test/java/org/archive/net/PublicSuffixesTest.java similarity index 94% rename from src/test/java/org/archive/url/PublicSuffixesTest.java rename to src/test/java/org/archive/net/PublicSuffixesTest.java index e2bb288a..b88acb6d 100644 --- a/src/test/java/org/archive/url/PublicSuffixesTest.java +++ b/src/test/java/org/archive/net/PublicSuffixesTest.java @@ -17,7 +17,7 @@ * limitations under the License. */ -package org.archive.url; +package org.archive.net; import java.io.PrintWriter; import java.io.StringWriter; @@ -26,7 +26,7 @@ import junit.framework.TestCase; -import org.archive.url.PublicSuffixes.Node; +import org.archive.net.PublicSuffixes.Node; /** * Test cases for PublicSuffixes utility. 
Confirm expected matches/nonmatches @@ -132,9 +132,9 @@ public void testBasics() { matchPrefix("uk,co,virgin,", "uk,co,virgin,"); matchPrefix("au,com,example,www,", "au,com,example,"); matchPrefix("au,com,example,", "au,com,example,"); - matchPrefix("jp,tokyo,public,assigned,www,", - "jp,tokyo,public,assigned,"); - matchPrefix("jp,tokyo,public,assigned,", "jp,tokyo,public,assigned,"); + matchPrefix("jp,yokohama,public,assigned,www,", + "jp,yokohama,public,assigned,"); + matchPrefix("jp,yokohama,public,assigned,", "jp,yokohama,public,assigned,"); } public void testDomainWithDash() { @@ -161,8 +161,8 @@ public void testIPV6() { public void testExceptions() { matchPrefix("uk,bl,www,", "uk,bl,"); matchPrefix("uk,bl,", "uk,bl,"); - matchPrefix("jp,tokyo,metro,subdomain,", "jp,tokyo,metro,"); - matchPrefix("jp,tokyo,metro,", "jp,tokyo,metro,"); + matchPrefix("jp,tokyo,city,subdomain,", "jp,tokyo,city,"); + matchPrefix("jp,tokyo,city,", "jp,tokyo,city,"); } public void testFakeTLD() {