From 28c299518367808b86d92713b4119bcfc7acd9c9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 3 Oct 2013 16:21:11 -0700 Subject: [PATCH 01/27] ZipNum: fixes: cached locations: don't throw RunTimeException if required (handled later), add a locCacheMaxDuration to cache only if below threshold --- .../format/gzip/zipnum/ZipNumBlockLoader.java | 7 +- .../format/gzip/zipnum/ZipNumCluster.java | 69 +++++++++++-------- 2 files changed, 44 insertions(+), 32 deletions(-) diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java index a1682818..91a822a2 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java @@ -15,7 +15,6 @@ import org.archive.util.binsearch.impl.HTTPSeekableLineReader; import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory; import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory.HttpLibs; -import org.archive.util.io.RuntimeIOException; public class ZipNumBlockLoader { @@ -169,9 +168,9 @@ public SeekableLineReader attemptLoadBlock(String location, long startOffset, in currReader = null; } - if (isRequired) { - throw new RuntimeIOException(io); - } +// if (isRequired) { +// throw new RuntimeIOException(io); +// } } return currReader; diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java index 5e91c507..09e58064 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java @@ -122,7 +122,15 @@ class BlockSize protected boolean newIsDisabled = false; protected boolean disabled = false; - final static int DEFAULT_LOC_CACHE_EXPIRE_MILLIS = 5000; + //final static int DEFAULT_LOC_CACHE_EXPIRE_MILLIS = 120000; + + protected ConcurrentHashMap locCacheMap; + + protected 
boolean cacheRemoteLoc = false; + + protected int locCacheExpireMillis = 120000; + + protected int locCacheMaxDuration = 1000; class LocCacheEntry { @@ -151,14 +159,7 @@ public boolean equals(Object obj) return false; } - } - - protected ConcurrentHashMap locCacheMap; - - protected boolean cacheRemoteLoc = false; - - protected int locCacheExpireMillis = DEFAULT_LOC_CACHE_EXPIRE_MILLIS; - + } @Override public void init() throws IOException @@ -287,6 +288,14 @@ public void setLocCacheExpireMillis(int locCacheExpireMillis) { this.locCacheExpireMillis = locCacheExpireMillis; } + public int getLocCacheMaxDuration() { + return locCacheMaxDuration; + } + + public void setLocCacheMaxDuration(int locCacheMaxDuration) { + this.locCacheMaxDuration = locCacheMaxDuration; + } + public boolean isCacheRemoteLoc() { return cacheRemoteLoc; } @@ -525,25 +534,19 @@ SeekableLineReader doBlockLoad(String partId, long startOffset, int totalLength) } // Attempt cached load for http - if (cacheRemoteLoc && (locCacheMap != null)) { - // Non-http requests follow standard load path - if ((locations.length > 0) && GeneralURIStreamFactory.isHttp(locations[0])) { - reader = loadCachedBalancedReader(partId, locations, startOffset, totalLength); - } - } - - if (reader != null) { - return reader; - } - - for (String location : locations) { - reader = blockLoader.attemptLoadBlock(location, startOffset, totalLength, true, isRequired()); - if (reader != null) { - return reader; + if (cacheRemoteLoc && (locCacheMap != null) && (locations.length > 0) && GeneralURIStreamFactory.isHttp(locations[0])) { + reader = loadCachedBalancedReader(partId, locations, startOffset, totalLength); + } else { + // Standard block load path + for (String location : locations) { + reader = blockLoader.attemptLoadBlock(location, startOffset, totalLength, true, isRequired()); + if (reader != null) { + return reader; + } } } - return null; + return reader; } protected String locCacheGet(String key) @@ -574,12 +577,18 @@ 
SeekableLineReader loadCachedBalancedReader(String partId, String[] locations, l String cachedUrl = locCacheGet(partId); if (cachedUrl != null) { + long start = System.currentTimeMillis(); + reader = blockLoader.attemptLoadBlock(cachedUrl, offset, length, true, isRequired()); + long duration = System.currentTimeMillis() - start; + + if ((reader == null) || (duration > locCacheMaxDuration)) { + locCacheMap.remove(partId, cachedUrl); + } + if (reader != null) { return reader; - } else { - locCacheMap.remove(partId, cachedUrl); } } @@ -593,12 +602,16 @@ SeekableLineReader loadCachedBalancedReader(String partId, String[] locations, l } for (int index : indexs) { + long start = System.currentTimeMillis(); + reader = blockLoader.attemptLoadBlock(locations[index], offset, length, true, isRequired()); + long duration = System.currentTimeMillis() - start; + if (reader != null) { String connectedUrl = ((HTTPSeekableLineReader)reader).getConnectedUrl(); - if (connectedUrl != null) { + if ((duration < locCacheMaxDuration) && (connectedUrl != null)) { locCachePut(partId, connectedUrl); } From 41d3a58a0d03a4cac3b99f2fb05eebb7e569afab Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 3 Oct 2013 19:06:06 -0700 Subject: [PATCH 02/27] CDX: Add CloseableCompositeIterator which iterates in sequence, optimization for zipnum clusters to be loaded sequentially when only looking for last line --- .../format/cdx/MultiCDXInputSource.java | 28 +++++++ .../format/gzip/zipnum/ZipNumParams.java | 9 +++ .../iterator/CloseableCompositeIterator.java | 73 +++++++++++++++++++ 3 files changed, 110 insertions(+) create mode 100644 src/main/java/org/archive/util/iterator/CloseableCompositeIterator.java diff --git a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java index 66367077..7f1ff002 100644 --- a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java +++ 
b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java @@ -8,6 +8,7 @@ import org.archive.format.gzip.zipnum.ZipNumIndex; import org.archive.format.gzip.zipnum.ZipNumParams; +import org.archive.util.iterator.CloseableCompositeIterator; import org.archive.util.iterator.CloseableIterator; import org.archive.util.iterator.SortedCompositeIterator; @@ -70,9 +71,36 @@ public CloseableIterator getCDXIterator(String key, String prefix, boole return scitr; } + public CloseableIterator createSeqIterator(String key, String start, String end, ZipNumParams params) + { + CloseableCompositeIterator composite = new CloseableCompositeIterator(); + CloseableIterator iter = null; + + for (CDXInputSource cdxReader : cdx) { + try { + iter = cdxReader.getCDXIterator(key, start, end, params); + + if (!params.isReverse()) { + composite.addLast(iter); + } else { + composite.addFirst(iter); + } + + } catch (IOException io) { + LOGGER.warning(io.toString()); + } + } + + return composite; + } + public CloseableIterator getCDXIterator(String key, String start, String end, ZipNumParams params) throws IOException { + if (params.isSequential()) { + return this.createSeqIterator(key, start, end, params); + } + SortedCompositeIterator scitr = new SortedCompositeIterator(cdx.size(), params.isReverse() ? 
reverseComparator : comparator); CloseableIterator iter = null; diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumParams.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumParams.java index 15e22e1d..668743ae 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumParams.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumParams.java @@ -6,6 +6,7 @@ public class ZipNumParams protected int timestampDedupLength = 0; protected int maxBlocks = 0; private boolean reverse = false; + private boolean sequential = false; public ZipNumParams() { @@ -56,4 +57,12 @@ public boolean isReverse() { public void setReverse(boolean reverse) { this.reverse = reverse; } + + public boolean isSequential() { + return sequential; + } + + public void setSequential(boolean sequential) { + this.sequential = sequential; + } } \ No newline at end of file diff --git a/src/main/java/org/archive/util/iterator/CloseableCompositeIterator.java b/src/main/java/org/archive/util/iterator/CloseableCompositeIterator.java new file mode 100644 index 00000000..b9f632e2 --- /dev/null +++ b/src/main/java/org/archive/util/iterator/CloseableCompositeIterator.java @@ -0,0 +1,73 @@ +package org.archive.util.iterator; + +import java.io.IOException; +import java.util.Iterator; +import java.util.LinkedList; + +public class CloseableCompositeIterator implements CloseableIterator { + + protected LinkedList> iters; + protected Iterator> iterPtr; + protected CloseableIterator currIter; + + public CloseableCompositeIterator() + { + iters = new LinkedList>(); + } + + public void addFirst(CloseableIterator e) + { + iters.addFirst(e); + } + + public void addLast(CloseableIterator e) + { + iters.addLast(e); + } + + @Override + public boolean hasNext() { + + if (iterPtr == null) { + iterPtr = iters.iterator(); + currIter = iterPtr.next(); + } + + if (currIter == null) { + return false; + } + + while (currIter != null) { + if (currIter.hasNext()) { + return true; + } + + currIter = 
(iterPtr.hasNext() ? iterPtr.next() : null); + } + + return false; + } + + @Override + public E next() { + return currIter.next(); + } + + @Override + public void remove() { + currIter.remove(); + } + + @Override + public void close() throws IOException { + for (CloseableIterator e : iters) { + if (e != null) { + try { + e.close(); + } catch (IOException io) { + + } + } + } + } +} From b4f639de2493d5cf09dae449c9bab5e464a208eb Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 6 Oct 2013 13:07:04 -0700 Subject: [PATCH 03/27] ZIP & SLR improvements: store connectedUrl immediately on connection, print connectedUrl on error, skip cached url on fallback to location list --- .../format/gzip/zipnum/ZipNumBlockLoader.java | 12 +++++++++++- .../archive/format/gzip/zipnum/ZipNumCluster.java | 5 +++++ .../java/org/archive/url/UrlSurtRangeComputer.java | 2 +- .../util/binsearch/impl/http/ApacheHttp31SLR.java | 4 ++-- .../util/binsearch/impl/http/HTTPURLConnSLR.java | 3 +-- .../org/archive/util/io/RuntimeIOException.java | 14 +++++++++++++- 6 files changed, 33 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java index 91a822a2..2144bd30 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java @@ -155,8 +155,18 @@ public SeekableLineReader attemptLoadBlock(String location, long startOffset, in } catch (IOException io) { Level level = (isRequired ? 
Level.SEVERE : Level.WARNING); + String actualLocation = null; + + if (currReader instanceof HTTPSeekableLineReader) { + actualLocation = ((HTTPSeekableLineReader)currReader).getConnectedUrl(); + } + + if (actualLocation == null) { + actualLocation = location; + } + if (LOGGER.isLoggable(level)) { - LOGGER.log(level, io.toString() + " -- -r " + startOffset + "-" + (startOffset + totalLength - 1) + " " + location + " req? " + isRequired); + LOGGER.log(level, io.toString() + " -- -r " + startOffset + "-" + (startOffset + totalLength - 1) + " " + actualLocation + " req? " + isRequired); } if (currReader != null) { diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java index 09e58064..fbe88033 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java @@ -602,6 +602,11 @@ SeekableLineReader loadCachedBalancedReader(String partId, String[] locations, l } for (int index : indexs) { + // Skip failed cached url + if (cachedUrl != null && locations[index].equals(cachedUrl)) { + continue; + } + long start = System.currentTimeMillis(); reader = blockLoader.attemptLoadBlock(locations[index], offset, length, true, isRequired()); diff --git a/src/main/java/org/archive/url/UrlSurtRangeComputer.java b/src/main/java/org/archive/url/UrlSurtRangeComputer.java index 74057117..2b960e16 100644 --- a/src/main/java/org/archive/url/UrlSurtRangeComputer.java +++ b/src/main/java/org/archive/url/UrlSurtRangeComputer.java @@ -112,7 +112,7 @@ public String[] determineRange(String url, MatchType match, String from, String return new String[]{startKey, endKey, host}; } - protected String incLastChar(String input) + public static String incLastChar(String input) { StringBuilder sb = new StringBuilder(input); sb.setCharAt(sb.length() - 1, (char)(sb.charAt(sb.length() - 1) + 1)); diff --git 
a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java index 0857bfd6..e09c02f9 100644 --- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java +++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java @@ -126,12 +126,12 @@ protected InputStream doSeekLoad(long offset, int maxLength) throws IOException int code = http.executeMethod(activeMethod); + connectedUrl = activeMethod.getURI().toString(); + if ((code != 206) && (code != 200)) { throw new BadHttpStatusException(code, url + " " + rangeHeader); } - connectedUrl = activeMethod.getURI().toString(); - InputStream is = activeMethod.getResponseBodyAsStream(); cin = new CountingInputStream(is); return cin; diff --git a/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java b/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java index f21437f7..c811ef68 100644 --- a/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java +++ b/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java @@ -76,13 +76,12 @@ protected InputStream doSeekLoad(long offset, int maxLength) httpUrlConn.connect(); int code = httpUrlConn.getResponseCode(); + connectedUrl = httpUrlConn.getURL().toString(); if ((code != 206) && (code != 200)) { throw new BadHttpStatusException(code, url + " " + rangeHeader); } - connectedUrl = httpUrlConn.getURL().toString(); - InputStream is = httpUrlConn.getInputStream(); cin = new CountingInputStream(is); return cin; diff --git a/src/main/java/org/archive/util/io/RuntimeIOException.java b/src/main/java/org/archive/util/io/RuntimeIOException.java index b6efbf74..e93e5639 100644 --- a/src/main/java/org/archive/util/io/RuntimeIOException.java +++ b/src/main/java/org/archive/util/io/RuntimeIOException.java @@ -3,13 +3,25 @@ public class RuntimeIOException extends RuntimeException { private static final long serialVersionUID 
= 4762025404760379497L; + private int status = 503; + public RuntimeIOException() { } + public RuntimeIOException(int status) + { + this.status = status; + } + public RuntimeIOException(Throwable cause) { super(cause); - } + } + + public int getStatus() + { + return status; + } } From 612cdebd1c5b359da57c3b83e818f4491b535e22 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 14 Oct 2013 09:36:50 -0700 Subject: [PATCH 04/27] ADD: CloseableIteratorWrapper utility class for wrapping regular iterators FIX ZIPNUM: Flush cache, if any, when reloading locations --- .../format/gzip/zipnum/ZipNumCluster.java | 4 ++ .../iterator/CloseableIteratorWrapper.java | 42 +++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 src/main/java/org/archive/util/iterator/CloseableIteratorWrapper.java diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java index fbe88033..70e21029 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java @@ -243,6 +243,10 @@ protected void syncLoad(long newModTime) locRoot = newLocRoot; } + if (this.locCacheMap != null) { + locCacheMap.clear(); + } + closeExistingFiles(filesToClose); lastModTime = newModTime; diff --git a/src/main/java/org/archive/util/iterator/CloseableIteratorWrapper.java b/src/main/java/org/archive/util/iterator/CloseableIteratorWrapper.java new file mode 100644 index 00000000..f35c85e5 --- /dev/null +++ b/src/main/java/org/archive/util/iterator/CloseableIteratorWrapper.java @@ -0,0 +1,42 @@ +package org.archive.util.iterator; + +import java.io.IOException; +import java.util.Iterator; + +/** + * Wrap a regular Iterator to create a CloseableIterator where the close() is a no-op + * @author ilya + * + * @param + */ + +public class CloseableIteratorWrapper implements CloseableIterator +{ + protected Iterator iter; + + public 
CloseableIteratorWrapper(Iterator iter) + { + this.iter = iter; + } + + @Override + public boolean hasNext() { + return this.iter.hasNext(); + } + + @Override + public S next() { + return this.iter.next(); + } + + @Override + public void remove() { + this.iter.remove(); + + } + + @Override + public void close() throws IOException { + //No Op + } +} \ No newline at end of file From c5dbc67e6e0aad654658b04a9688179f97bfc980 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 14 Oct 2013 12:47:57 -0700 Subject: [PATCH 05/27] ZipNumBlock Loader: * add info on which shard failed to RuntimIOException * use connectedUrl in all exceptions messages * attemptBlockLoad: use SEVERE only on last retry of required cluster which will lead to a 503, use WARNING otherwise --- .../format/gzip/zipnum/SummaryBlockIterator.java | 2 +- .../archive/format/gzip/zipnum/ZipNumBlockLoader.java | 4 ---- .../org/archive/format/gzip/zipnum/ZipNumCluster.java | 10 +++++++--- .../util/binsearch/impl/http/ApacheHttp31SLR.java | 2 +- .../util/binsearch/impl/http/HTTPURLConnSLR.java | 2 +- .../java/org/archive/util/io/RuntimeIOException.java | 5 +++++ 6 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/archive/format/gzip/zipnum/SummaryBlockIterator.java b/src/main/java/org/archive/format/gzip/zipnum/SummaryBlockIterator.java index 0046625c..cbf947f6 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/SummaryBlockIterator.java +++ b/src/main/java/org/archive/format/gzip/zipnum/SummaryBlockIterator.java @@ -120,7 +120,7 @@ public CloseableIterator getNextInner() { SeekableLineReader currReader = zipnumIndex.doBlockLoad(currPartId, startOffset, totalLength); if ((currReader == null) && zipnumIndex.isRequired()) { - throw new RuntimeIOException(); + throw new RuntimeIOException("Failed to load shards for: " + currPartId); } if (currReader != null) { diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java 
b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java index 2144bd30..9a6c459d 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java @@ -177,10 +177,6 @@ public SeekableLineReader attemptLoadBlock(String location, long startOffset, in } currReader = null; } - -// if (isRequired) { -// throw new RuntimeIOException(io); -// } } return currReader; diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java index 70e21029..892dfbc0 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java @@ -75,7 +75,7 @@ public void run() { Thread.sleep(checkInterval); if (summary != null) { - summary.reloadFactory(); + summary.reloadFactory(); } } @@ -583,7 +583,7 @@ SeekableLineReader loadCachedBalancedReader(String partId, String[] locations, l if (cachedUrl != null) { long start = System.currentTimeMillis(); - reader = blockLoader.attemptLoadBlock(cachedUrl, offset, length, true, isRequired()); + reader = blockLoader.attemptLoadBlock(cachedUrl, offset, length, true, false); long duration = System.currentTimeMillis() - start; @@ -605,6 +605,8 @@ SeekableLineReader loadCachedBalancedReader(String partId, String[] locations, l Collections.shuffle(indexs); } + final int lastIndex = locations.length - 1; + for (int index : indexs) { // Skip failed cached url if (cachedUrl != null && locations[index].equals(cachedUrl)) { @@ -613,7 +615,9 @@ SeekableLineReader loadCachedBalancedReader(String partId, String[] locations, l long start = System.currentTimeMillis(); - reader = blockLoader.attemptLoadBlock(locations[index], offset, length, true, isRequired()); + boolean required = (isRequired() && (index == lastIndex)); + + reader = blockLoader.attemptLoadBlock(locations[index], offset, length, true, 
required); long duration = System.currentTimeMillis() - start; diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java index e09c02f9..5964e268 100644 --- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java +++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java @@ -129,7 +129,7 @@ protected InputStream doSeekLoad(long offset, int maxLength) throws IOException connectedUrl = activeMethod.getURI().toString(); if ((code != 206) && (code != 200)) { - throw new BadHttpStatusException(code, url + " " + rangeHeader); + throw new BadHttpStatusException(code, connectedUrl + " " + rangeHeader); } InputStream is = activeMethod.getResponseBodyAsStream(); diff --git a/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java b/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java index c811ef68..6d618e43 100644 --- a/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java +++ b/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java @@ -79,7 +79,7 @@ protected InputStream doSeekLoad(long offset, int maxLength) connectedUrl = httpUrlConn.getURL().toString(); if ((code != 206) && (code != 200)) { - throw new BadHttpStatusException(code, url + " " + rangeHeader); + throw new BadHttpStatusException(code, connectedUrl + " " + rangeHeader); } InputStream is = httpUrlConn.getInputStream(); diff --git a/src/main/java/org/archive/util/io/RuntimeIOException.java b/src/main/java/org/archive/util/io/RuntimeIOException.java index e93e5639..9b1d4a1a 100644 --- a/src/main/java/org/archive/util/io/RuntimeIOException.java +++ b/src/main/java/org/archive/util/io/RuntimeIOException.java @@ -10,6 +10,11 @@ public RuntimeIOException() } + public RuntimeIOException(String message) + { + super(message); + } + public RuntimeIOException(int status) { this.status = status; From 
1fc60191669b3e48da3b30fb595e1250228f9581 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 14 Oct 2013 14:57:26 -0700 Subject: [PATCH 06/27] ZIPNUMLoader: attempt better error msgs by propagating full error in exception --- .../archive/format/gzip/zipnum/ZipNumBlockLoader.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java index 9a6c459d..2247eda4 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java @@ -15,6 +15,7 @@ import org.archive.util.binsearch.impl.HTTPSeekableLineReader; import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory; import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory.HttpLibs; +import org.archive.util.io.RuntimeIOException; public class ZipNumBlockLoader { @@ -165,8 +166,10 @@ public SeekableLineReader attemptLoadBlock(String location, long startOffset, in actualLocation = location; } + String msg = io.toString() + " -- -r " + startOffset + "-" + (startOffset + totalLength - 1) + " " + actualLocation; + if (LOGGER.isLoggable(level)) { - LOGGER.log(level, io.toString() + " -- -r " + startOffset + "-" + (startOffset + totalLength - 1) + " " + actualLocation + " req? " + isRequired); + LOGGER.log(level, msg); } if (currReader != null) { @@ -177,6 +180,10 @@ public SeekableLineReader attemptLoadBlock(String location, long startOffset, in } currReader = null; } + + if (isRequired) { + throw new RuntimeIOException(msg); + } } return currReader; From 936fb4934b724d767c5338c9557431c1b49bb970 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 14 Oct 2013 17:45:02 -0700 Subject: [PATCH 07/27] FIX: ApacheHttp31SLR save the connected url even on error! 
ZIPNUM: track 2nd attempt to load correctly --- .../org/archive/format/gzip/zipnum/ZipNumCluster.java | 9 ++++++--- .../util/binsearch/impl/http/ApacheHttp31SLR.java | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java index 892dfbc0..b30cf489 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java @@ -607,15 +607,18 @@ SeekableLineReader loadCachedBalancedReader(String partId, String[] locations, l final int lastIndex = locations.length - 1; - for (int index : indexs) { + for (int i = 0; i < indexs.size(); i++) { + + int index = indexs.get(i); + // Skip failed cached url - if (cachedUrl != null && locations[index].equals(cachedUrl)) { + if ((cachedUrl != null) && locations[index].equals(cachedUrl)) { continue; } long start = System.currentTimeMillis(); - boolean required = (isRequired() && (index == lastIndex)); + boolean required = (isRequired() && (i == lastIndex)); reader = blockLoader.attemptLoadBlock(locations[index], offset, length, true, required); diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java index 5964e268..ad92f3cb 100644 --- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java +++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java @@ -137,6 +137,7 @@ protected InputStream doSeekLoad(long offset, int maxLength) throws IOException return cin; } catch (IOException io) { + connectedUrl = activeMethod.getURI().toString(); doClose(); throw io; } From 04744417eb97c9856c33892cc87476052ceb1e65 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 17 Oct 2013 09:08:06 -0700 Subject: [PATCH 08/27] FIX: ApacheSLR: turn off cookies when using manual cookie --- 
.../org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java index ad92f3cb..0f2e102d 100644 --- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java +++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java @@ -8,6 +8,7 @@ import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.HttpMethod; +import org.apache.commons.httpclient.cookie.CookiePolicy; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.methods.HeadMethod; import org.apache.commons.io.input.CountingInputStream; @@ -121,6 +122,7 @@ protected InputStream doSeekLoad(long offset, int maxLength) throws IOException } if (this.getCookie() != null) { + activeMethod.getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES); activeMethod.setRequestHeader("Cookie", this.getCookie()); } From f82aead0ae91c881485ba24a6694965e707c1c90 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 17 Oct 2013 10:53:59 -0700 Subject: [PATCH 09/27] HttpSLR: add optional error header which can be saved RuntimeIOException: add status --- .../binsearch/impl/HTTPSeekableLineReader.java | 18 ++++++++++++++++++ .../binsearch/impl/http/ApacheHttp31SLR.java | 4 ++++ .../archive/util/io/RuntimeIOException.java | 8 +++++++- 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReader.java b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReader.java index 63eab9b4..d686a5e2 100644 --- a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReader.java +++ b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReader.java @@ -32,6 +32,8 @@ public int getStatus() protected boolean 
noKeepAlive; protected String cookie; protected String connectedUrl; + protected String errHeader; + protected String saveErrHeader; public abstract String getUrl(); @@ -76,4 +78,20 @@ public String getConnectedUrl() { return connectedUrl; } + + public String getSaveErrHeader() { + return saveErrHeader; + } + + public void setSaveErrHeader(String saveErrHeader) { + this.saveErrHeader = saveErrHeader; + } + + public String getErrHeader() { + return errHeader; + } + + public void setErrHeader(String errHeader) { + this.errHeader = errHeader; + } } \ No newline at end of file diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java index 0f2e102d..c4fdbba8 100644 --- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java +++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java @@ -139,6 +139,10 @@ protected InputStream doSeekLoad(long offset, int maxLength) throws IOException return cin; } catch (IOException io) { + if (saveErrHeader != null) { + errHeader = getHeaderValue(saveErrHeader); + } + connectedUrl = activeMethod.getURI().toString(); doClose(); throw io; diff --git a/src/main/java/org/archive/util/io/RuntimeIOException.java b/src/main/java/org/archive/util/io/RuntimeIOException.java index 9b1d4a1a..1d74f79c 100644 --- a/src/main/java/org/archive/util/io/RuntimeIOException.java +++ b/src/main/java/org/archive/util/io/RuntimeIOException.java @@ -23,7 +23,13 @@ public RuntimeIOException(int status) public RuntimeIOException(Throwable cause) { super(cause); - } + } + + public RuntimeIOException(int status, Throwable cause) + { + super(cause); + this.status = status; + } public int getStatus() { From 647adec66cd34bac20d2732671606f3e237a2bc5 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 17 Oct 2013 23:03:19 -0700 Subject: [PATCH 10/27] MultiCDXInputSource: Make comparator public --- 
.../java/org/archive/format/cdx/MultiCDXInputSource.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java index 7f1ff002..35bb9043 100644 --- a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java +++ b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java @@ -41,18 +41,22 @@ public void setCdxUris(List cdxUris) throws IOException { } - Comparator comparator = new Comparator() { + public final static Comparator defaultComparator = new Comparator() { public int compare(String s1, String s2) { return s1.compareTo(s2); } }; - Comparator reverseComparator = new Comparator() { + public final static Comparator defaultReverseComparator = new Comparator() { public int compare(String s1, String s2) { return -s1.compareTo(s2); } }; + protected Comparator comparator = defaultComparator; + protected Comparator reverseComparator = defaultReverseComparator; + + public CloseableIterator getCDXIterator(String key, String prefix, boolean exact, ZipNumParams params) throws IOException { SortedCompositeIterator scitr = new SortedCompositeIterator(cdx.size(), params.isReverse() ? 
reverseComparator : comparator); From 7d4fbf451ed3ed8e09b0eb077b95411928a98a3c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 18 Oct 2013 13:10:58 -0700 Subject: [PATCH 11/27] FIX: MultiCDXInputSource: add optimization for output lazy initing of sequential cluster load --- .../format/cdx/MultiCDXInputSource.java | 80 ++++++++++++++++++- 1 file changed, 78 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java index 35bb9043..aa44a887 100644 --- a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java +++ b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java @@ -75,14 +75,90 @@ public CloseableIterator getCDXIterator(String key, String prefix, boole return scitr; } + // A special iterator which initializes on actual first use + protected static class LazyInitIterator implements CloseableIterator + { + CDXInputSource source; + CloseableIterator iter; + boolean failed = false; + + String key, start, end; + ZipNumParams params; + + protected LazyInitIterator(CDXInputSource source, String key, String start, String end, ZipNumParams params) + { + this.key = key; + this.start = start; + this.end = end; + + this.params = params; + + this.source = source; + } + + protected void initIter() + { + if (iter != null) { + return; + } + + try { + iter = source.getCDXIterator(key, start, end, params); + } catch (IOException io) { + LOGGER.warning(io.toString()); + iter = null; + } + } + + @Override + public boolean hasNext() { + initIter(); + + if (iter == null) { + return false; + } + + return iter.hasNext(); + } + + @Override + public String next() { + initIter(); + + if (iter == null) { + return null; + } + + return iter.next(); + } + + @Override + public void remove() { + + } + + @Override + public void close() throws IOException { + if (iter != null) { + iter.close(); + } + } + } + public CloseableIterator createSeqIterator(String key, 
String start, String end, ZipNumParams params) { CloseableCompositeIterator composite = new CloseableCompositeIterator(); CloseableIterator iter = null; - for (CDXInputSource cdxReader : cdx) { + for (int i = 0; i < cdx.size(); i++) { try { - iter = cdxReader.getCDXIterator(key, start, end, params); + CDXInputSource cdxReader = cdx.get(i); + + if (i == (cdx.size() - 1)) { + iter = cdxReader.getCDXIterator(key, start, end, params); + } else { + iter = new LazyInitIterator(cdxReader, key, start, end, params); + } if (!params.isReverse()) { composite.addLast(iter); From 2d2d30e75b95717605580e7e48dd2dca430f8b0e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 23 Oct 2013 16:51:47 -0700 Subject: [PATCH 12/27] FEATURE: add getTotalCount() to cdx input sources --- src/main/java/org/archive/format/cdx/CDXFile.java | 6 ++++++ .../java/org/archive/format/cdx/CDXInputSource.java | 2 ++ .../org/archive/format/cdx/MultiCDXInputSource.java | 11 +++++++++++ .../org/archive/format/gzip/zipnum/ZipNumCluster.java | 6 +++++- .../org/archive/format/gzip/zipnum/ZipNumIndex.java | 8 ++++++++ 5 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/archive/format/cdx/CDXFile.java b/src/main/java/org/archive/format/cdx/CDXFile.java index 0c3a777a..7dca0464 100644 --- a/src/main/java/org/archive/format/cdx/CDXFile.java +++ b/src/main/java/org/archive/format/cdx/CDXFile.java @@ -97,4 +97,10 @@ public static BufferedReader createStreamingLineReader(String uri, boolean gzipp BufferedReader reader = new BufferedReader(new InputStreamReader(input)); return reader; } + + @Override + public long getTotalLines() { + //TODO: Implement + return 0; + } } diff --git a/src/main/java/org/archive/format/cdx/CDXInputSource.java b/src/main/java/org/archive/format/cdx/CDXInputSource.java index 0a926ebc..34abde53 100644 --- a/src/main/java/org/archive/format/cdx/CDXInputSource.java +++ b/src/main/java/org/archive/format/cdx/CDXInputSource.java @@ -9,4 +9,6 @@ public interface 
CDXInputSource { public CloseableIterator getCDXIterator(String key, String prefix, boolean exact, ZipNumParams params) throws IOException; public CloseableIterator getCDXIterator(String key, String start, String startEndUrl, ZipNumParams params) throws IOException; + + public long getTotalLines(); } diff --git a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java index aa44a887..cbf70c0e 100644 --- a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java +++ b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java @@ -196,4 +196,15 @@ public CloseableIterator getCDXIterator(String key, String start, String return scitr; } + + @Override + public long getTotalLines() { + long sum = 0; + + for (CDXInputSource cdxReader : cdx) { + sum += cdxReader.getTotalLines(); + } + + return sum; + } } diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java index b30cf489..bc773a58 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java @@ -190,6 +190,7 @@ public void init() throws IOException startDate = newStartDate; endDate = newEndDate; locRoot = newLocRoot; + this.cdxLinesTotalCount = computeTotalLines(); if (!disabled) { this.loadLastBlockSizes(blockSizesFile); @@ -241,6 +242,8 @@ protected void syncLoad(long newModTime) endDate = newEndDate; disabled = newIsDisabled; locRoot = newLocRoot; + + this.cdxLinesTotalCount = computeTotalLines(); } if (this.locCacheMap != null) { @@ -484,8 +487,9 @@ public long getLastBlockDiff(String startKey, int startPart, int endPart) { return diff; } + // Adjust from shorter blocks, if loaded - public long getTotalLines() + public long computeTotalLines() { long numLines = 0; diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumIndex.java 
b/src/main/java/org/archive/format/gzip/zipnum/ZipNumIndex.java index 7860be36..ad8c9297 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumIndex.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumIndex.java @@ -29,6 +29,9 @@ public class ZipNumIndex implements CDXInputSource { // Used only for reference / user info protected int cdxLinesPerBlock = 3000; + + protected long cdxLinesTotalCount = 0; + //protected HashMap locMap = null; protected final static boolean DEFAULT_USE_NIO = true; @@ -528,4 +531,9 @@ public boolean isRequired() { public void setRequired(boolean required) { this.required = required; } + + @Override + public long getTotalLines() { + return cdxLinesTotalCount; + } } From baf1ad8921efb472be893e9ee92d48e49e56229e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 24 Oct 2013 11:19:22 -0700 Subject: [PATCH 13/27] log RuntimeIOException --- .../org/archive/util/binsearch/SeekableLineReaderIterator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/archive/util/binsearch/SeekableLineReaderIterator.java b/src/main/java/org/archive/util/binsearch/SeekableLineReaderIterator.java index ca443ad4..991553c8 100644 --- a/src/main/java/org/archive/util/binsearch/SeekableLineReaderIterator.java +++ b/src/main/java/org/archive/util/binsearch/SeekableLineReaderIterator.java @@ -26,7 +26,7 @@ public String getNextInner() { next = slr.readLine(); } catch (IOException e) { if (propagateException) { - throw new RuntimeIOException(); + throw new RuntimeIOException(e.toString()); } } } From 154386c3d0c741a2750bb16e5720a72468b7407e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 1 Nov 2013 14:58:19 -0700 Subject: [PATCH 14/27] add ThreadLocalHttpConnectionManager (from heritrix-commons) Switch to use ThreadLocalHttpConnectionManager as default for ApacheSLR! 
--- .../impl/http/ApacheHttp31SLRFactory.java | 18 +- .../io/ThreadLocalHttpConnectionManager.java | 291 ++++++++++++++++++ 2 files changed, 301 insertions(+), 8 deletions(-) create mode 100644 src/main/java/org/archive/util/io/ThreadLocalHttpConnectionManager.java diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java index 52e73a94..1f37b365 100644 --- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java +++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java @@ -3,21 +3,21 @@ import java.io.IOException; import java.text.SimpleDateFormat; import java.util.Date; -import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler; import org.apache.commons.httpclient.HostConfiguration; import org.apache.commons.httpclient.HttpClient; -import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; +import org.apache.commons.httpclient.HttpConnectionManager; import org.apache.commons.httpclient.params.HttpClientParams; import org.archive.util.binsearch.impl.HTTPSeekableLineReader; import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory; +import org.archive.util.httpclient.ThreadLocalHttpConnectionManager; public class ApacheHttp31SLRFactory extends HTTPSeekableLineReaderFactory { private final static Logger LOGGER = Logger.getLogger(ApacheHttp31SLRFactory.class.getName()); - private MultiThreadedHttpConnectionManager connectionManager = null; + private HttpConnectionManager connectionManager = null; private HostConfiguration hostConfiguration = null; private HttpClient http = null; @@ -26,7 +26,8 @@ public ApacheHttp31SLRFactory(String uriString) { } public ApacheHttp31SLRFactory() { - connectionManager = new MultiThreadedHttpConnectionManager(); + //connectionManager = new 
MultiThreadedHttpConnectionManager(); + connectionManager = new ThreadLocalHttpConnectionManager(); hostConfiguration = new HostConfiguration(); HttpClientParams params = new HttpClientParams(); http = new HttpClient(params,connectionManager); @@ -35,15 +36,16 @@ public ApacheHttp31SLRFactory() { public void close() throws IOException { - connectionManager.deleteClosedConnections(); + //connectionManager.deleteClosedConnections(); + connectionManager.closeIdleConnections(0); } @Override public ApacheHttp31SLR get(String url) throws IOException { - if (LOGGER.isLoggable(Level.FINEST)) { - LOGGER.finest("Connections: " + connectionManager.getConnectionsInPool(hostConfiguration)); - } +// if (LOGGER.isLoggable(Level.FINEST)) { +// LOGGER.finest("Connections: " + connectionManager.getConnectionsInPool(hostConfiguration)); +// } return new ApacheHttp31SLR(http, url); } diff --git a/src/main/java/org/archive/util/io/ThreadLocalHttpConnectionManager.java b/src/main/java/org/archive/util/io/ThreadLocalHttpConnectionManager.java new file mode 100644 index 00000000..83555584 --- /dev/null +++ b/src/main/java/org/archive/util/io/ThreadLocalHttpConnectionManager.java @@ -0,0 +1,291 @@ +/** + * ==================================================================== + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ==================================================================== + * + */ +package org.archive.util.io; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.HostConfiguration; +import org.apache.commons.httpclient.HttpConnection; +import org.apache.commons.httpclient.HttpConnectionManager; +import org.apache.commons.httpclient.params.HttpConnectionManagerParams; + +/** + * A simple, but thread-safe HttpClient {@link HttpConnectionManager}. + * Based on {@link org.apache.commons.httpclient.SimpleHttpConnectionManager}. + * + * Java >= 1.4 is recommended. + * + * @author Christian Kohlschuetter + */ +public final class ThreadLocalHttpConnectionManager implements + HttpConnectionManager { + + private static final CloserThread closer = new CloserThread(); + private static final Logger logger = Logger + .getLogger(ThreadLocalHttpConnectionManager.class.getName()); + + private final ThreadLocal tl = new ThreadLocal() { + protected synchronized ConnectionInfo initialValue() { + return new ConnectionInfo(); + } + }; + + private ConnectionInfo getConnectionInfo() { + return (ConnectionInfo) tl.get(); + } + + private static final class ConnectionInfo { + /** The http connection */ + private HttpConnection conn = null; + + /** + * The time the connection was made idle. + */ + private long idleStartTime = Long.MAX_VALUE; + } + + public ThreadLocalHttpConnectionManager() { + } + + /** + * Since the same connection is about to be reused, make sure the + * previous request was completely processed, and if not + * consume it now. 
+ * @param conn The connection + * @return true, if the connection is reusable + */ + private static boolean finishLastResponse(final HttpConnection conn) { + InputStream lastResponse = conn.getLastResponseInputStream(); + if(lastResponse != null) { + conn.setLastResponseInputStream(null); + try { + lastResponse.close(); + return true; + } catch (IOException ioe) { + // force reconnect. + return false; + } + } else { + return false; + } + } + + /** + * Collection of parameters associated with this connection manager. + */ + private HttpConnectionManagerParams params = new HttpConnectionManagerParams(); + + /** + * @see HttpConnectionManager#getConnection(HostConfiguration) + */ + public HttpConnection getConnection( + final HostConfiguration hostConfiguration) { + return getConnection(hostConfiguration, 0); + } + + /** + * Gets the staleCheckingEnabled value to be set on HttpConnections that are created. + * + * @return true if stale checking will be enabled on HttpConections + * + * @see HttpConnection#isStaleCheckingEnabled() + * + * @deprecated Use {@link HttpConnectionManagerParams#isStaleCheckingEnabled()}, + * {@link HttpConnectionManager#getParams()}. + */ + public boolean isConnectionStaleCheckingEnabled() { + return this.params.isStaleCheckingEnabled(); + } + + /** + * Sets the staleCheckingEnabled value to be set on HttpConnections that are created. + * + * @param connectionStaleCheckingEnabled true if stale checking will be enabled + * on HttpConections + * + * @see HttpConnection#setStaleCheckingEnabled(boolean) + * + * @deprecated Use {@link HttpConnectionManagerParams#setStaleCheckingEnabled(boolean)}, + * {@link HttpConnectionManager#getParams()}. 
+ */ + public void setConnectionStaleCheckingEnabled( + final boolean connectionStaleCheckingEnabled) { + this.params.setStaleCheckingEnabled(connectionStaleCheckingEnabled); + } + + /** + * @see HttpConnectionManager#getConnectionWithTimeout(HostConfiguration, long) + * + * @since 3.0 + */ + public HttpConnection getConnectionWithTimeout( + final HostConfiguration hostConfiguration, final long timeout) { + + final ConnectionInfo ci = getConnectionInfo(); + HttpConnection httpConnection = ci.conn; + + // make sure the host and proxy are correct for this connection + // close it and set the values if they are not + if(httpConnection == null || !finishLastResponse(httpConnection) + || !hostConfiguration.hostEquals(httpConnection) + || !hostConfiguration.proxyEquals(httpConnection)) { + + if(httpConnection != null && httpConnection.isOpen()) { + closer.closeConnection(httpConnection); + } + + httpConnection = new HttpConnection(hostConfiguration); + httpConnection.setHttpConnectionManager(this); + httpConnection.getParams().setDefaults(this.params); + ci.conn = httpConnection; + + httpConnection.setHost(hostConfiguration.getHost()); + httpConnection.setPort(hostConfiguration.getPort()); + httpConnection.setProtocol(hostConfiguration.getProtocol()); + httpConnection.setLocalAddress(hostConfiguration.getLocalAddress()); + + httpConnection.setProxyHost(hostConfiguration.getProxyHost()); + httpConnection.setProxyPort(hostConfiguration.getProxyPort()); + } + + // remove the connection from the timeout handler + ci.idleStartTime = Long.MAX_VALUE; + + return httpConnection; + } + + /** + * @see HttpConnectionManager#getConnection(HostConfiguration, long) + * + * @deprecated Use #getConnectionWithTimeout(HostConfiguration, long) + */ + public HttpConnection getConnection( + final HostConfiguration hostConfiguration, final long timeout) { + return getConnectionWithTimeout(hostConfiguration, timeout); + } + + /** + * @see 
HttpConnectionManager#releaseConnection(org.apache.commons.httpclient.HttpConnection) + */ + public void releaseConnection(final HttpConnection conn) { + final ConnectionInfo ci = getConnectionInfo(); + HttpConnection httpConnection = ci.conn; + + if(conn != httpConnection) { + throw new IllegalStateException( + "Unexpected release of an unknown connection."); + } + + finishLastResponse(httpConnection); + + // track the time the connection was made idle + ci.idleStartTime = System.currentTimeMillis(); + } + + /** + * Returns {@link HttpConnectionManagerParams parameters} associated + * with this connection manager. + * + * @since 2.1 + * + * @see HttpConnectionManagerParams + */ + public HttpConnectionManagerParams getParams() { + return this.params; + } + + /** + * Assigns {@link HttpConnectionManagerParams parameters} for this + * connection manager. + * + * @since 2.1 + * + * @see HttpConnectionManagerParams + */ + public void setParams(final HttpConnectionManagerParams p) { + if(p == null) { + throw new IllegalArgumentException("Parameters may not be null"); + } + this.params = p; + } + + /** + * @since 3.0 + */ + public void closeIdleConnections(final long idleTimeout) { + long maxIdleTime = System.currentTimeMillis() - idleTimeout; + + final ConnectionInfo ci = getConnectionInfo(); + + if(ci.idleStartTime <= maxIdleTime) { + ci.conn.close(); + } + } + + private static final class CloserThread extends Thread { + private List connections + = new ArrayList(); + + private static final int SLEEP_INTERVAL = 5000; + + public CloserThread() { + super("HttpConnection closer"); + // Make this a daemon thread so it can't be responsible for the JVM + // not shutting down. 
+ setDaemon(true); + start(); + } + + public void closeConnection(final HttpConnection conn) { + synchronized (connections) { + connections.add(conn); + } + } + + public void run() { + try { + while (!Thread.interrupted()) { + Thread.sleep(SLEEP_INTERVAL); + + List s; + synchronized (connections) { + s = connections; + connections = new ArrayList(); + } + logger.log(Level.INFO, "Closing " + s.size() + + " HttpConnections"); + for(final Iterator it = s.iterator(); + it.hasNext();) { + HttpConnection conn = it.next(); + conn.close(); + conn.setHttpConnectionManager(null); + it.remove(); + } + } + } catch (InterruptedException e) { + return; + } + } + } +} From cd4310101050d10e00053f66aed0740b0f46c315 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 1 Nov 2013 22:03:34 +0000 Subject: [PATCH 15/27] fix path for ThreadLocalHttpConnectionManager --- .../util/{io => httpclient}/ThreadLocalHttpConnectionManager.java | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/main/java/org/archive/util/{io => httpclient}/ThreadLocalHttpConnectionManager.java (100%) diff --git a/src/main/java/org/archive/util/io/ThreadLocalHttpConnectionManager.java b/src/main/java/org/archive/util/httpclient/ThreadLocalHttpConnectionManager.java similarity index 100% rename from src/main/java/org/archive/util/io/ThreadLocalHttpConnectionManager.java rename to src/main/java/org/archive/util/httpclient/ThreadLocalHttpConnectionManager.java From 1c58cba6d8d210028187dd089528af72fec77d23 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 1 Nov 2013 22:05:56 +0000 Subject: [PATCH 16/27] fix package --- .../util/httpclient/ThreadLocalHttpConnectionManager.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/archive/util/httpclient/ThreadLocalHttpConnectionManager.java b/src/main/java/org/archive/util/httpclient/ThreadLocalHttpConnectionManager.java index 83555584..d3101e5f 100644 --- 
a/src/main/java/org/archive/util/httpclient/ThreadLocalHttpConnectionManager.java +++ b/src/main/java/org/archive/util/httpclient/ThreadLocalHttpConnectionManager.java @@ -15,7 +15,7 @@ * ==================================================================== * */ -package org.archive.util.io; +package org.archive.util.httpclient; import java.io.IOException; import java.io.InputStream; From da379775ff2f3482c83fe730f89e6d18ab813e1c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 1 Nov 2013 15:48:14 -0700 Subject: [PATCH 17/27] move ThreadLocalHttpConnectionManager to original package org.archive.httpclient --- .../{util => }/httpclient/ThreadLocalHttpConnectionManager.java | 2 +- .../util/binsearch/impl/http/ApacheHttp31SLRFactory.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename src/main/java/org/archive/{util => }/httpclient/ThreadLocalHttpConnectionManager.java (99%) diff --git a/src/main/java/org/archive/util/httpclient/ThreadLocalHttpConnectionManager.java b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java similarity index 99% rename from src/main/java/org/archive/util/httpclient/ThreadLocalHttpConnectionManager.java rename to src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java index d3101e5f..91e850ea 100644 --- a/src/main/java/org/archive/util/httpclient/ThreadLocalHttpConnectionManager.java +++ b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java @@ -15,7 +15,7 @@ * ==================================================================== * */ -package org.archive.util.httpclient; +package org.archive.httpclient; import java.io.IOException; import java.io.InputStream; diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java index 1f37b365..cffd0ebf 100644 --- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java +++ 
b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java @@ -10,9 +10,9 @@ import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpConnectionManager; import org.apache.commons.httpclient.params.HttpClientParams; +import org.archive.httpclient.ThreadLocalHttpConnectionManager; import org.archive.util.binsearch.impl.HTTPSeekableLineReader; import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory; -import org.archive.util.httpclient.ThreadLocalHttpConnectionManager; public class ApacheHttp31SLRFactory extends HTTPSeekableLineReaderFactory { private final static Logger LOGGER = Logger.getLogger(ApacheHttp31SLRFactory.class.getName()); From 68dab84a2d6e2a250055fb2847e7645d9555235f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 1 Nov 2013 18:10:15 -0700 Subject: [PATCH 18/27] FEATURE: add support for httpcore 4.3 for SeekableLineReader! --- pom.xml | 7 +- .../impl/HTTPSeekableLineReaderFactory.java | 6 + .../impl/http/ApacheHttp31SLRFactory.java | 6 +- .../binsearch/impl/http/ApacheHttp43SLR.java | 180 ++++++++++++++++++ .../impl/http/ApacheHttp43SLRFactory.java | 100 ++++++++++ 5 files changed, 295 insertions(+), 4 deletions(-) create mode 100644 src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java create mode 100644 src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLRFactory.java diff --git a/pom.xml b/pom.xml index db1efecc..03b1240d 100644 --- a/pom.xml +++ b/pom.xml @@ -118,7 +118,12 @@ mg4j 1.0.1 compile - + + + org.apache.httpcomponents + httpcore + 4.3 + diff --git a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java index c1fa6fb6..b4a23db0 100644 --- a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java +++ b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java @@ -4,6 +4,7 @@ import 
org.archive.util.binsearch.SeekableLineReaderFactory; import org.archive.util.binsearch.impl.http.ApacheHttp31SLRFactory; +import org.archive.util.binsearch.impl.http.ApacheHttp43SLRFactory; import org.archive.util.binsearch.impl.http.HTTPURLConnSLRFactory; public abstract class HTTPSeekableLineReaderFactory implements SeekableLineReaderFactory { @@ -20,6 +21,7 @@ protected HTTPSeekableLineReaderFactory() public enum HttpLibs { APACHE_31, + APACHE_43, URLCONN, } @@ -50,6 +52,10 @@ public static HTTPSeekableLineReaderFactory getHttpFactory(HttpLibs type, String case URLCONN: factory = new HTTPURLConnSLRFactory(); break; + + case APACHE_43: + factory = new ApacheHttp43SLRFactory(); + break; } if (factory == null) { diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java index cffd0ebf..9bd7542b 100644 --- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java +++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java @@ -9,8 +9,8 @@ import org.apache.commons.httpclient.HostConfiguration; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpConnectionManager; +import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; import org.apache.commons.httpclient.params.HttpClientParams; -import org.archive.httpclient.ThreadLocalHttpConnectionManager; import org.archive.util.binsearch.impl.HTTPSeekableLineReader; import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory; @@ -26,8 +26,8 @@ public ApacheHttp31SLRFactory(String uriString) { } public ApacheHttp31SLRFactory() { - //connectionManager = new MultiThreadedHttpConnectionManager(); - connectionManager = new ThreadLocalHttpConnectionManager(); + connectionManager = new MultiThreadedHttpConnectionManager(); + //connectionManager = new ThreadLocalHttpConnectionManager(); hostConfiguration = new 
HostConfiguration(); HttpClientParams params = new HttpClientParams(); http = new HttpClient(params,connectionManager); diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java new file mode 100644 index 00000000..85a460da --- /dev/null +++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java @@ -0,0 +1,180 @@ +package org.archive.util.binsearch.impl.http; + +import java.io.IOException; +import java.io.InputStream; +import java.net.InetSocketAddress; +import java.net.Socket; +import java.net.SocketAddress; +import java.net.URL; + +import org.apache.http.Header; +import org.apache.http.HttpException; +import org.apache.http.HttpRequest; +import org.apache.http.HttpResponse; +import org.apache.http.HttpVersion; +import org.apache.http.impl.DefaultBHttpClientConnection; +import org.apache.http.message.BasicHttpRequest; +import org.archive.util.binsearch.impl.HTTPSeekableLineReader; + +public class ApacheHttp43SLR extends HTTPSeekableLineReader { + + private String urlString; + + private int connectTimeout = 0; + private int readTimeout = 0; + + private Socket socket = null; + private DefaultBHttpClientConnection activeConn = null; + private HttpResponse response = null; + + private final static int BUFF_SIZE = 8192; + + public ApacheHttp43SLR(String url) + { + urlString = url; + } + + public ApacheHttp43SLR(String url, int connectTimeout, int readTimeout) + { + this.urlString = url; + this.connectTimeout = connectTimeout; + this.readTimeout = readTimeout; + } + + @Override + public String getUrl() { + return urlString; + } + + @Override + public long getSize() throws IOException { + if (response == null) { + return 0; + } + + return response.getEntity().getContentLength(); + } + + @Override + public String getHeaderValue(String headerName) { + if (response == null) { + return null; + } + + Header header = response.getFirstHeader(headerName); + if 
(header == null) { + return null; + } + + return header.getValue(); + } + + protected static int getPort(URL url) + { + int port = url.getPort(); + + if (port > 0) { + return port; + } + + return url.getDefaultPort(); + } + + protected InputStream doSeekLoad(long offset, int maxLength, URL url) + throws IOException { + + SocketAddress endpoint = null; + + try { + socket = new Socket(); + endpoint = new InetSocketAddress(url.getHost(), getPort(url)); + socket.connect(endpoint, connectTimeout); + + activeConn = new DefaultBHttpClientConnection(BUFF_SIZE); + activeConn.bind(socket); + activeConn.setSocketTimeout(readTimeout); + + HttpRequest request = new BasicHttpRequest("GET", url.getFile(), HttpVersion.HTTP_1_1); + + String rangeHeader = makeRangeHeader(offset, maxLength); + + if (rangeHeader != null) { + request.setHeader("Range", rangeHeader); + } + + if (this.isNoKeepAlive()) { + request.setHeader("Connection", "close"); + } + + if (this.getCookie() != null) { + request.setHeader("Cookie", this.getCookie()); + } + + request.setHeader("Accept", "*/*"); + request.setHeader("Host", url.getHost()); + + activeConn.sendRequestHeader(request); + activeConn.flush(); + + response = activeConn.receiveResponseHeader(); + + int code = response.getStatusLine().getStatusCode(); + + connectedUrl = url.toString(); + + if (code > 300 && code < 400) { + Header header = response.getFirstHeader("Location"); + + doClose(); + + if (header != null) { + URL redirectURL = new URL(header.getValue()); + return doSeekLoad(offset, maxLength, redirectURL); + } + } + + if (code != 200 && code != 206) { + throw new BadHttpStatusException(code, connectedUrl + " " + rangeHeader); + } + + activeConn.receiveResponseEntity(response); + + return response.getEntity().getContent(); + + } catch (HttpException e) { + doClose(); + throw new IOException(e); + + } catch (IOException io) { + + if (saveErrHeader != null) { + errHeader = getHeaderValue(saveErrHeader); + } + + connectedUrl = url.toString(); + 
+ doClose(); + throw io; + } + } + + @Override + protected void doClose() throws IOException { + if (activeConn != null) { + activeConn.close(); + activeConn = null; + socket = null; + } else if (socket != null) { + socket.close(); + socket = null; + } + response = null; + } + + @Override + protected InputStream doSeekLoad(long offset, int maxLength) + throws IOException { + + return doSeekLoad(offset, maxLength, new URL(urlString)); + } +} diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLRFactory.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLRFactory.java new file mode 100644 index 00000000..5e3bb3ed --- /dev/null +++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLRFactory.java @@ -0,0 +1,100 @@ +package org.archive.util.binsearch.impl.http; + +import java.io.IOException; + +import org.archive.util.binsearch.impl.HTTPSeekableLineReader; +import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory; + +public class ApacheHttp43SLRFactory extends HTTPSeekableLineReaderFactory { + + private int readTimeout = 0; + private int connectTimeout = 0; + + public ApacheHttp43SLRFactory() + { + + } + + @Override + public HTTPSeekableLineReader get(String url) throws IOException { + return new ApacheHttp43SLR(url, connectTimeout, readTimeout); + } + + @Override + public void close() throws IOException { + // TODO Auto-generated method stub + } + + @Override + public void setProxyHostPort(String hostPort) { + // TODO Auto-generated method stub + + } + + @Override + public void setMaxTotalConnections(int maxTotalConnections) { + // TODO Auto-generated method stub + + } + + @Override + public int getMaxTotalConnections() { + // TODO Auto-generated method stub + return 0; + } + + @Override + public void setMaxHostConnections(int maxHostConnections) { + // TODO Auto-generated method stub + + } + + @Override + public int getMaxHostConnections() { + // TODO Auto-generated method stub + return 0; + } + 
+ @Override + public int getConnectionTimeoutMS() { + return connectTimeout; + } + + @Override + public void setConnectionTimeoutMS(int connectionTimeoutMS) { + connectTimeout = connectionTimeoutMS; + + } + + @Override + public int getSocketTimeoutMS() { + return readTimeout; + } + + @Override + public void setSocketTimeoutMS(int socketTimeoutMS) { + readTimeout = socketTimeoutMS; + } + + @Override + public void setStaleChecking(boolean enabled) { + + } + + @Override + public boolean isStaleChecking() { + // TODO Auto-generated method stub + return false; + } + + @Override + public long getModTime() { + // TODO Auto-generated method stub + return 0; + } + + @Override + public void setNumRetries(int numRetries) { + // TODO Auto-generated method stub + } +} From 54bdd7da37c195cc1885a9e6e424a6f91555f1df Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 2 Nov 2013 10:47:35 -0700 Subject: [PATCH 19/27] apach43SLR: better reading of entire buffer --- .../binsearch/impl/http/ApacheHttp43SLR.java | 40 +++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java index 85a460da..ef206bb1 100644 --- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java +++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java @@ -1,5 +1,6 @@ package org.archive.util.binsearch.impl.http; +import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.net.InetSocketAddress; @@ -14,7 +15,9 @@ import org.apache.http.HttpVersion; import org.apache.http.impl.DefaultBHttpClientConnection; import org.apache.http.message.BasicHttpRequest; +import org.apache.http.util.EntityUtils; import org.archive.util.binsearch.impl.HTTPSeekableLineReader; +import org.archive.util.zip.GZIPMembersInputStream; public class ApacheHttp43SLR extends HTTPSeekableLineReader 
{ @@ -83,11 +86,10 @@ protected static int getPort(URL url) protected InputStream doSeekLoad(long offset, int maxLength, URL url) throws IOException { - SocketAddress endpoint = null; - try { + SocketAddress endpoint = new InetSocketAddress(url.getHost(), getPort(url)); + socket = new Socket(); - endpoint = new InetSocketAddress(url.getHost(), getPort(url)); socket.connect(endpoint, connectTimeout); activeConn = new DefaultBHttpClientConnection(BUFF_SIZE); @@ -104,6 +106,8 @@ protected InputStream doSeekLoad(long offset, int maxLength, URL url) if (this.isNoKeepAlive()) { request.setHeader("Connection", "close"); + } else { + request.setHeader("Connection", "keep-alive"); } if (this.getCookie() != null) { @@ -157,6 +161,36 @@ protected InputStream doSeekLoad(long offset, int maxLength, URL url) throw io; } } + + @Override + public void seekWithMaxRead(long offset, boolean gzip, int maxLength) throws IOException + { + if (closed) { + throw new IOException("Seek after close()"); + } + + br = null; + + try { + doSeekLoad(offset, maxLength); + + if (bufferFully && (maxLength > 0)) { + byte[] buffer = EntityUtils.toByteArray(response.getEntity()); + + doClose(); + + is = new ByteArrayInputStream(buffer); + } + + if (gzip) { + is = new GZIPMembersInputStream(is, blockSize); + } + + } catch (IOException io) { + doClose(); + throw io; + } + } @Override protected void doClose() throws IOException { From 1e8c1afaffc2378145f5171d6f4c4c2ac7f30c3b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 3 Nov 2013 18:38:29 -0800 Subject: [PATCH 20/27] canonicalizer: add ExtractRule and RewriteRule --- .../java/org/archive/url/ExtractRule.java | 45 +++++++++++++++ .../java/org/archive/url/RewriteRule.java | 55 +++++++++++++++++++ .../org/archive/url/WaybackURLKeyMaker.java | 48 ++-------------- 3 files changed, 104 insertions(+), 44 deletions(-) create mode 100644 src/main/java/org/archive/url/ExtractRule.java create mode 100644 src/main/java/org/archive/url/RewriteRule.java diff 
--git a/src/main/java/org/archive/url/ExtractRule.java b/src/main/java/org/archive/url/ExtractRule.java new file mode 100644 index 00000000..6d975b61 --- /dev/null +++ b/src/main/java/org/archive/url/ExtractRule.java @@ -0,0 +1,45 @@ +package org.archive.url; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class ExtractRule +{ + protected String startsWith; + protected String regex; + + protected Pattern regexPattern; + + public String getStartsWith() { + return startsWith; + } + public void setStartsWith(String startsWith) { + this.startsWith = startsWith; + } + public String getRegex() { + return regex; + } + public void setRegex(String regex) { + regexPattern = Pattern.compile(regex); + this.regex = regex; + } + + public Matcher extract(String url) + { + if ((startsWith != null) && !url.startsWith(startsWith)) { + return null; + } + + if (regexPattern == null) { + return null; + } + + Matcher match = regexPattern.matcher(url); + + if (!match.find()) { + return null; + } + + return match; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/url/RewriteRule.java b/src/main/java/org/archive/url/RewriteRule.java new file mode 100644 index 00000000..47292686 --- /dev/null +++ b/src/main/java/org/archive/url/RewriteRule.java @@ -0,0 +1,55 @@ +package org.archive.url; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class RewriteRule +{ + protected String startsWith; + protected String regex; + protected String replace; + + protected Pattern regexPattern; + + public String getStartsWith() { + return startsWith; + } + public void setStartsWith(String startsWith) { + this.startsWith = startsWith; + } + public String getRegex() { + return regex; + } + public void setRegex(String regex) { + regexPattern = Pattern.compile(regex); + this.regex = regex; + } + public String getReplace() { + return replace; + } + public void setReplace(String replace) { + this.replace = replace; + } + + public 
boolean rewrite(StringBuilder sb) + { + String urlkey = sb.toString(); + + if ((startsWith != null) && !urlkey.startsWith(startsWith)) { + return false; + } + + if (regexPattern == null || replace == null) { + return false; + } + + Matcher match = regexPattern.matcher(urlkey); + + if (match.matches()) { + sb.replace(0, sb.length(), match.replaceAll(replace)); + return true; + } + + return false; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/url/WaybackURLKeyMaker.java b/src/main/java/org/archive/url/WaybackURLKeyMaker.java index 23c67d06..99fb92e9 100644 --- a/src/main/java/org/archive/url/WaybackURLKeyMaker.java +++ b/src/main/java/org/archive/url/WaybackURLKeyMaker.java @@ -2,8 +2,6 @@ import java.net.URISyntaxException; import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; public class WaybackURLKeyMaker implements URLKeyMaker { // URLCanonicalizer canonicalizer = new NonMassagingIAURLCanonicalizer(); @@ -21,34 +19,6 @@ public void setCanonicalizer(URLCanonicalizer canonicalizer) { protected List customRules; - public static class RewriteRule - { - String startsWith; - String regex; - String replace; - Pattern regexPattern; - - public String getStartsWith() { - return startsWith; - } - public void setStartsWith(String startsWith) { - this.startsWith = startsWith; - } - public String getRegex() { - return regex; - } - public void setRegex(String regex) { - regexPattern = Pattern.compile(regex); - this.regex = regex; - } - public String getReplace() { - return replace; - } - public void setReplace(String replace) { - this.replace = replace; - } - } - public WaybackURLKeyMaker() { @@ -117,22 +87,12 @@ public void setCustomRules(List customRules) { protected String applyCustomRules(String urlkey) { + StringBuilder sb = new StringBuilder(urlkey); + for (RewriteRule rule : customRules) { - if ((rule.startsWith != null) && !urlkey.startsWith(rule.startsWith)) { - continue; - } - - if (rule.regexPattern == 
null || rule.replace == null) { - continue; - } - - Matcher match = rule.regexPattern.matcher(urlkey); - - if (match.matches()) { - urlkey = match.replaceAll(rule.replace); - } + rule.rewrite(sb); } - return urlkey; + return sb.toString(); } } From 0ebbad2b2c71f1e4277105cb193985f0158b7739 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 16 Nov 2013 12:47:11 -0800 Subject: [PATCH 21/27] FIX: add cdxlinefactory doesn't require custom format extractrule, check for empty string --- .../archive/format/cdx/StandardCDXLineFactory.java | 11 +++++++++++ src/main/java/org/archive/url/ExtractRule.java | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/archive/format/cdx/StandardCDXLineFactory.java b/src/main/java/org/archive/format/cdx/StandardCDXLineFactory.java index d2c299e5..33da41f1 100644 --- a/src/main/java/org/archive/format/cdx/StandardCDXLineFactory.java +++ b/src/main/java/org/archive/format/cdx/StandardCDXLineFactory.java @@ -39,6 +39,17 @@ public FieldSplitFormat getParseFormat() return parseFormat; } + public CDXLine createStandardCDXLine(String input) + { + if (parseFormat == cdx11) { + return new CDX11Line(input, parseFormat); + } else if (parseFormat == cdx09) { + return new CDX09Line(input, parseFormat); + } else { + return new CDXLine(input, parseFormat); + } + } + public CDXLine createStandardCDXLine(String input, FieldSplitFormat exFormat) { if (parseFormat == cdx11) { diff --git a/src/main/java/org/archive/url/ExtractRule.java b/src/main/java/org/archive/url/ExtractRule.java index 6d975b61..bcfb3b2f 100644 --- a/src/main/java/org/archive/url/ExtractRule.java +++ b/src/main/java/org/archive/url/ExtractRule.java @@ -26,7 +26,7 @@ public void setRegex(String regex) { public Matcher extract(String url) { - if ((startsWith != null) && !url.startsWith(startsWith)) { + if ((startsWith != null) && !startsWith.isEmpty() && !url.startsWith(startsWith)) { return null; } From 5077ad8ece1d0d63948db8371e83f39045d9de2b Mon Sep 
17 00:00:00 2001 From: Vinay Goel Date: Sat, 23 Nov 2013 04:41:16 -0800 Subject: [PATCH 22/27] Extract outlinks/hopinfo from warc/metadata records --- .../WARCMetadataRecordExtractorOutput.java | 150 ++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java diff --git a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java new file mode 100644 index 00000000..0d564a6f --- /dev/null +++ b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java @@ -0,0 +1,150 @@ +package org.archive.extract; + +import java.io.IOException; +import java.io.PrintWriter; +import java.net.MalformedURLException; +import java.net.URISyntaxException; +import java.net.URL; +import java.util.List; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.archive.format.gzip.GZIPFormatException; +import org.archive.format.json.JSONUtils; +import org.archive.format.json.SimpleJSONPathSpec; +import org.archive.resource.MetaData; +import org.archive.resource.Resource; +import org.archive.util.IAUtils; +import org.archive.util.StreamCopy; +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; + +import com.google.common.io.CountingOutputStream; +import com.google.common.io.NullOutputStream; + +public class WARCMetadataRecordExtractorOutput implements ExtractorOutput { + private static final Logger LOG = + Logger.getLogger(WARCMetadataRecordExtractorOutput.class.getName()); + + private PrintWriter out; + SimpleJSONPathSpec formatSpec = new SimpleJSONPathSpec("Envelope.Format"); + SimpleJSONPathSpec warcURL = new SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.WARC-Target-URI"); + SimpleJSONPathSpec warcDate = new SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.WARC-Date"); + SimpleJSONPathSpec warcType = new 
SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.WARC-Type"); + SimpleJSONPathSpec warcMetadataRecord = new SimpleJSONPathSpec("Envelope.Payload-Metadata.WARC-Metadata-Metadata.Metadata-Records"); + + private String outputType = "outlinks"; + + public WARCMetadataRecordExtractorOutput(PrintWriter out, String outputType) { + this.out = out; + this.outputType = outputType; + } + + public WARCMetadataRecordExtractorOutput(PrintWriter out) { + this(out,"outlinks"); + } + + public void output(Resource resource) throws IOException { + NullOutputStream nullo = new NullOutputStream(); + CountingOutputStream co = new CountingOutputStream(nullo); + try { + StreamCopy.copy(resource.getInputStream(), co); + } catch(GZIPFormatException e) { + e.printStackTrace(); + return; + } + long bytes = co.getCount(); + if(bytes > 0) { + LOG.info(bytes + " unconsumed bytes in Resource InputStream."); + } + try { + MetaData m = resource.getMetaData().getTopMetaData(); + // URL DATE OURL MIME HTTP-CODE SHA1 META REDIR OFFSET LENGTH FILE + String format = getEnvelopeFormat(m); + String origUrl = "TBD"; + String date = "TBD"; + String canUrl = "TBD"; + + if(format.equals("WARC")) { + origUrl = getWARCURL(m); + date = getWARCDate(m); + String type = getWARCType(m); + if(type.equals("metadata")) { + String warcMetadataRecord = getWARCMetadataRecord(m); + + JSONArray array = new JSONArray(warcMetadataRecord); + String viaUrl = "-"; + String viaPath = "-"; + String sourceTag = "-"; + for(int i=0;i 2) + //'outlinks': 'origUrl date origOutlinkUrl linktype linktext' + out.format("%s\t%s\t%s\t%s\t\n",origUrl,date,linkParts[0],linkParts[2]); + } + } else if(outputType.equals("hopinfo")) { + String key = obj.get("Name").toString(); + String value = obj.get("Value").toString(); + if(key.equals("via")) { + viaUrl = value; + } else if (key.equals("hopsFromSeed")) { + viaPath = value; + } else if (key.equals("sourceTag")) { + sourceTag = value; + } + } + } + if(outputType.equals("hopinfo")) { + 
//'hopinfo': 'origCrawledUrl date origViaUrl hopPathFromVia sourceTag' + out.format("%s\t%s\t%s\t%s\t%s\n",origUrl,date,viaUrl,viaPath,sourceTag); + } + } + } + + } + catch (Exception e) { + throw new IOException(e); + } + out.flush(); + } + + private String getEnvelopeFormat(MetaData m) { + return unwrapFirst(formatSpec.extract(m),"-"); + } + private String getWARCURL(MetaData m) { + return unwrapFirst(warcURL.extract(m),"-"); + } + private String getWARCDate(MetaData m) { + return unwrapFirst(warcDate.extract(m),"-"); + } + private String getWARCType(MetaData m) { + return unwrapFirst(warcType.extract(m),"-"); + } + private String getWARCMetadataRecord(MetaData m) { + return unwrapFirst(warcMetadataRecord.extract(m),"-"); + } + + private String unwrapFirst(List> l, String defaultValue) { + if(l != null) { + if(l.size() > 0) { + if(l.get(0) != null) { + if(l.get(0).size() > 0) { + String v = l.get(0).get(0); + if(v != null) { + if(v.length() > 0) { + return v; + } + } + } + } + } + } + return defaultValue; + } +} From 329ff22b0ab15656454ae682acb00b44beedc371 Mon Sep 17 00:00:00 2001 From: Vinay Goel Date: Mon, 25 Nov 2013 19:10:07 +0000 Subject: [PATCH 23/27] reference pom.xml for building with CDH4 --- pom-cdh4.xml | 229 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 229 insertions(+) create mode 100644 pom-cdh4.xml diff --git a/pom-cdh4.xml b/pom-cdh4.xml new file mode 100644 index 00000000..de19d8d0 --- /dev/null +++ b/pom-cdh4.xml @@ -0,0 +1,229 @@ + + 4.0.0 + + org.archive + ia-web-commons + 1.0-SNAPSHOT + jar + + ia-web-commons + http://maven.apache.org + + + UTF-8 + ${maven.build.timestamp} + yyyyMMddhhmmss + + + + + junit + junit + 3.8.1 + test + + + + com.google.guava + guava + 14.0.1 + + + + org.json + json + 20090211 + + + org.htmlparser + htmlparser + 1.6 + + + + org.mozilla + juniversalchardet + 1.0.3 + + + + commons-httpclient + commons-httpclient + 3.1 + + + + org.apache.hadoop + hadoop-core + 2.0.0-mr1-cdh4.2.0 + + + 
commons-httpclient + commons-httpclient + + + javax.servlet + servlet-api + + + javax.servlet.jsp + jsp-api + + + org.mortbay.jetty + jetty + + + org.mortbay.jetty + jetty-util + + + tomcat + jasper-runtime + + + tomcat + jasper-compiler + + + + + org.apache.hadoop + hadoop-common + 2.0.0-cdh4.2.0 + + + org.apache.hadoop + hadoop-mapreduce-client-common + 2.0.0-cdh4.2.0 + + + org.apache.hadoop + hadoop-mapreduce-client-core + 2.0.0-cdh4.2.0 + + + + org.apache.pig + pig + 0.11.1 + provided + + + + commons-lang + commons-lang + 2.5 + + + + commons-io + commons-io + 2.4 + + + + org.gnu.inet + libidn + 1.15 + + + it.unimi.dsi + mg4j + 1.0.1 + compile + + + org.apache.httpcomponents + httpcore + 4.3 + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 2.3.2 + + 1.6 + 1.6 + + + + maven-assembly-plugin + 2.4 + + + jar-with-dependencies + + ia-web-commons + + + + package + + single + + + + + + + + src/main/resources + true + + + + + + + internetarchive + Internet Archive Maven Repository + http://builds.archive.org:8080/maven2 + default + + + true + daily + warn + + + true + daily + warn + + + + + cloudera + Cloudera Hadoop + https://repository.cloudera.com/artifactory/cloudera-repos/ + default + + + true + daily + warn + + + true + daily + warn + + + + + + + + repository + + ${repository.url} + + + + From fc24be82f632abd7ca56337be77b7ee683368338 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 5 Dec 2013 16:13:18 -0800 Subject: [PATCH 24/27] moving org.archive.net.PublicSuffixes to ia-web-commons --- .../java/org/archive/net/PublicSuffixes.java | 363 + src/main/resources/effective_tld_names.dat | 7045 +++++++++++++++++ .../org/archive/net/PublicSuffixesTest.java | 193 + 3 files changed, 7601 insertions(+) create mode 100644 src/main/java/org/archive/net/PublicSuffixes.java create mode 100644 src/main/resources/effective_tld_names.dat create mode 100644 src/test/java/org/archive/net/PublicSuffixesTest.java diff --git 
a/src/main/java/org/archive/net/PublicSuffixes.java b/src/main/java/org/archive/net/PublicSuffixes.java new file mode 100644 index 00000000..eab8081a --- /dev/null +++ b/src/main/java/org/archive/net/PublicSuffixes.java @@ -0,0 +1,363 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.net; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileInputStream; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.io.IOUtils; +import org.archive.util.TextUtils; + +/** + * Utility class for making use of the information about 'public suffixes' at + * http://publicsuffix.org. + * + * The public suffix list (once known as 'effective TLDs') was motivated by the + * need to decide on which broader domains a subdomain was allowed to set + * cookies. For example, a server at 'www.example.com' can set cookies for + * 'www.example.com' or 'example.com' but not 'com'. 
'www.example.co.uk' can set + * cookies for 'www.example.co.uk' or 'example.co.uk' but not 'co.uk' or 'uk'. + * The number of rules for all top-level-domains and 2nd- or 3rd- level domains + * has become quite long; essentially the broadest domain a subdomain may assign + * to is the one that was sold/registered to a specific name registrant. + * + * This concept should be useful in other contexts, too. Grouping URIs (or + * queues of URIs to crawl) together with others sharing the same registered + * suffix may be useful for applying the same rules to all, such as assigning + * them to the same queue or crawler in a multi-machine setup. + * + * As of Heritrix3, we prefer the term 'Assignment Level Domain' (ALD) + * for such domains, by analogy to 'Top Level Domain' (TLD) or '2nd Level + * Domain' (2LD), etc. + * + * @author Gojomo + * + * this version of PublicSuffixes uses suffix-tree data structure for generating less + * redundant regular expression. It may be even possible to write a light-weight, + * thread-safe matcher based on this class. + * @author Kenji Nagahashi + */ +public class PublicSuffixes { + protected static Pattern topmostAssignedSurtPrefixPattern; + protected static String topmostAssignedSurtPrefixRegex; + + /** + * prefix tree node. each Node represents sequence of letters (prefix) + * and alternative sequences following it (list of Node's). Nodes in + * {@code branches} are sorted for skip list like lookup and for generating + * effective regular expression (see {@link #compareTo(Node)} and {@link #compareTo(char)}). + * + * as it is intended for internal use only, there are no access methods. procedures for updating + * prefix tree with new input are defined within this class ({@link #addBranch(CharSequence)}). + * + * terminal node could be represented in two different forms: 1) Node with zero branches, + * or 2) Node with zero-length {@code cs}. 
So, root node must be initialized with empty (not null) + * {@code branches} unless empty string matches the overall pattern. + * {@code cs} must not be null except for root node. + */ + public static class Node implements Comparable { + protected CharSequence cs; + protected List branches; + public Node() { + this("", null); + } + protected Node(CharSequence cs) { + this(cs, null); + } + protected Node(CharSequence cs, List branches) { + this.cs = cs; + this.branches = branches; + } + public void addBranch(CharSequence s) { + if (branches == null) { + branches = new ArrayList(); + branches.add(new Node("", null)); + } + for (int i = 0; i < branches.size(); i++) { + Node alt = branches.get(i); + if (alt.add(s)) return; + if (alt.compareTo(s.charAt(0)) > 0) { + Node alt1 = new Node(s, null); + branches.add(i, alt1); + return; + } + } + Node alt2 = new Node(s, null); + branches.add(alt2); + } + public boolean add(CharSequence s) { + int l = Math.min(s.length(), cs.length()); + int i = 0; + while (i < l && s.charAt(i) == cs.charAt(i)) + i++; + // zero-length match holds only when both cs and s are empty. + if (i == 0) return cs.length() == 0 && s.length() == 0; + if (i < cs.length()) { + CharSequence cs0 = cs.subSequence(0, i); + CharSequence cs1 = cs.subSequence(i, cs.length()); + CharSequence cs2 = s.subSequence(i, s.length()); + cs = cs0; + Node alt1 = new Node(cs1, branches); + (branches = new ArrayList()).add(alt1); + addBranch(cs2); + } else { + assert i == cs.length(); + addBranch(s.subSequence(i, s.length())); + } + return true; + } + public int compareTo(Node other) { + if (other.cs == null || other.cs.length() == 0) + return (cs == null || cs.length() == 0) ? 0 : -1; + return compareTo(other.cs.charAt(0)); + } + public int compareTo(char oc) { + if (cs == null || cs.length() == 0) return 1; + // '!' and '*' must come after ordinary letters, in this order, for regexp + // to work as intended. 
+ char c = cs.charAt(0); + if (c == oc) return 0; + if (c == '!') return oc == '*' ? -1 : 1; + if (c == '*') return 1; + if (oc == '*' || oc == '!') return -1; + return Character.valueOf(c).compareTo(oc); + // for generating the same regexp as previous version. + //return Character.valueOf(oc).compareTo(c); + } + } + + /** + * Utility method for dumping a regex String, based on a published public + * suffix list, which matches any SURT-form hostname up through the broadest + * 'private' (assigned/sold) domain-segment. That is, for any of the + * SURT-form hostnames... + * + * com,example, com,example,www, com,example,california,www + * + * ...the regex will match 'com,example,'. + * + * @param args + * @throws IOException + */ + public static void main(String args[]) throws IOException { + InputStream is; + if (args.length == 0 || "=".equals(args[0])) { + // use bundled list + is = PublicSuffixes.class.getClassLoader().getResourceAsStream( + "effective_tld_names.dat"); + } else { + is = new FileInputStream(args[0]); + } + BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8")); + String regex = getTopmostAssignedSurtPrefixRegex(reader); + IOUtils.closeQuietly(is); + + boolean needsClose = false; + BufferedWriter writer; + if (args.length >= 2) { + // write to specified file + writer = new BufferedWriter(new FileWriter(args[1])); + needsClose = true; + } else { + // write to stdout + writer = new BufferedWriter(new OutputStreamWriter(System.out)); + } + writer.append(regex); + writer.flush(); + if (needsClose) { + writer.close(); + } + } + /** + * Reads a file of the format promulgated by publicsuffix.org, ignoring + * comments and '!' exceptions/notations, converting domain segments to + * SURT-ordering. Leaves glob-style '*' wildcarding in place. Returns root + * node of SURT-ordered prefix tree. + * + * @param reader + * @return root of prefix tree node. 
+ * @throws IOException + */ + protected static Node readPublishedFileToSurtTrie(BufferedReader reader) throws IOException { + // initializing with empty Alt list prevents empty pattern from being + // created for the first addBranch() + Node alt = new Node(null, new ArrayList()); + String line; + while ((line = reader.readLine()) != null) { + // discard surrounding whitespace, empty lines and comments ('!' exception rules are kept) + line = line.trim(); + if (line.length() == 0 || line.startsWith("//")) continue; + // discard utf8 notation after entry + line = line.split("\\s+")[0]; + // TODO: maybe we don't need to create lower-cased String + line = line.toLowerCase(); + // SURT-order domain segments + String[] segs = line.split("\\."); + StringBuilder sb = new StringBuilder(); + for (int i = segs.length - 1; i >= 0; i--) { + if (segs[i].length() == 0) continue; + sb.append(segs[i]).append(','); + } + alt.addBranch(sb.toString()); + } + return alt; + } + /** + * utility function for dumping prefix tree structure. intended for debug use. + * @param alt root of prefix tree. + * @param lv indent level. 0 for root (no indent). + * @param out writer to send output to. + */ + public static void dump(Node alt, int lv, PrintWriter out) { + for (int i = 0; i < lv; i++) + out.print(" "); + out.println(alt.cs != null ? ('"'+alt.cs.toString()+'"') : "(null)"); + if (alt.branches != null) { + for (Node br : alt.branches) { + dump(br, lv + 1, out); + } + } + } + /** + * builds regular expression from prefix-tree {@code alt} into buffer {@code sb}. + * @param alt prefix tree root. + * @param sb StringBuilder to store regular expression. + */ + protected static void buildRegex(Node alt, StringBuilder sb) { + String close = null; + if (alt.cs != null) { + // actually '!' is always the first character, because it is + // always used along with '*'. 
+ for (int i = 0; i < alt.cs.length(); i++) { + char c = alt.cs.charAt(i); + if (c == '!') { + if (close != null) + throw new RuntimeException("more than one '!'"); + sb.append("(?="); + close = ")"; + } else if (c == '*') { + sb.append("[-\\w]+"); + } else { + sb.append(c); + } + } + } + if (alt.branches != null) { + // alt.branches.size() should always be > 1 + if (alt.branches.size() > 1) { + sb.append("(?:"); + } + String sep = ""; + for (Node alt1 : alt.branches) { + sb.append(sep); sep = "|"; + buildRegex(alt1, sb); + } + if (alt.branches.size() > 1) { + sb.append(")"); + } + } + if (close != null) + sb.append(close); + } + + /** + * Converts SURT-ordered list of public prefixes into a Java regex which + * matches the public-portion "plus one" segment, giving the domain on which + * cookies can be set or other policy grouping should occur. Also adds to + * regex a fallback matcher that for any new/unknown TLDs assumes the + * second-level domain is assignable. (Eg: 'zzz,example,'). 
+ * + * @param trie root of the SURT-ordered prefix tree + * @return the regex string built from the tree + */ + private static String surtPrefixRegexFromTrie(Node trie) { + StringBuilder regex = new StringBuilder(); + regex.append("(?ix)^\n"); + trie.addBranch("*,"); // for new/unknown TLDs + buildRegex(trie, regex); + regex.append("\n([-\\w]+,)"); + return regex.toString(); + } + + public static synchronized Pattern getTopmostAssignedSurtPrefixPattern() { + if (topmostAssignedSurtPrefixPattern == null) { + topmostAssignedSurtPrefixPattern = Pattern + .compile(getTopmostAssignedSurtPrefixRegex()); + } + return topmostAssignedSurtPrefixPattern; + } + + public static synchronized String getTopmostAssignedSurtPrefixRegex() { + if (topmostAssignedSurtPrefixRegex == null) { + // use bundled list + try { + BufferedReader reader = new BufferedReader(new InputStreamReader( + PublicSuffixes.class.getClassLoader().getResourceAsStream( + "effective_tld_names.dat"), "UTF-8")); + topmostAssignedSurtPrefixRegex = getTopmostAssignedSurtPrefixRegex(reader); + IOUtils.closeQuietly(reader); + } catch (UnsupportedEncodingException ex) { + // should never happen + throw new RuntimeException(ex); + } + } + return topmostAssignedSurtPrefixRegex; + } + + public static String getTopmostAssignedSurtPrefixRegex(BufferedReader reader) { + try { + Node trie = readPublishedFileToSurtTrie(reader); + return surtPrefixRegexFromTrie(trie); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** + * Truncate SURT to its topmost assigned domain segment; that is, + * the public suffix plus one segment, but as a SURT-ordered prefix. + * + * if the pattern doesn't match, the passed-in SURT is returned. 
+ * + * @param surt SURT to truncate + * @return truncated-to-topmost-assigned SURT prefix + */ + public static String reduceSurtToAssignmentLevel(String surt) { + Matcher matcher = TextUtils.getMatcher( + getTopmostAssignedSurtPrefixRegex(), surt); + if (matcher.find()) { + surt = matcher.group(); + } + TextUtils.recycleMatcher(matcher); + return surt; + } +} diff --git a/src/main/resources/effective_tld_names.dat b/src/main/resources/effective_tld_names.dat new file mode 100644 index 00000000..7c4a0860 --- /dev/null +++ b/src/main/resources/effective_tld_names.dat @@ -0,0 +1,7045 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// ===BEGIN ICANN DOMAINS=== + +// ac : http://en.wikipedia.org/wiki/.ac +ac +com.ac +edu.ac +gov.ac +net.ac +mil.ac +org.ac + +// ad : http://en.wikipedia.org/wiki/.ad +ad +nom.ad + +// ae : http://en.wikipedia.org/wiki/.ae +// see also: "Domain Name Eligibility Policy" at http://www.aeda.ae/eng/aepolicy.php +ae +co.ae +net.ae +org.ae +sch.ae +ac.ae +gov.ae +mil.ae + +// aero : see http://www.information.aero/index.php?id=66 +aero +accident-investigation.aero +accident-prevention.aero +aerobatic.aero +aeroclub.aero +aerodrome.aero +agents.aero +aircraft.aero +airline.aero +airport.aero +air-surveillance.aero +airtraffic.aero +air-traffic-control.aero +ambulance.aero +amusement.aero +association.aero +author.aero +ballooning.aero +broker.aero +caa.aero +cargo.aero +catering.aero +certification.aero +championship.aero +charter.aero +civilaviation.aero +club.aero +conference.aero +consultant.aero +consulting.aero +control.aero +council.aero +crew.aero +design.aero +dgca.aero +educator.aero +emergency.aero +engine.aero +engineer.aero +entertainment.aero +equipment.aero +exchange.aero +express.aero +federation.aero +flight.aero +freight.aero +fuel.aero +gliding.aero +government.aero 
+groundhandling.aero +group.aero +hanggliding.aero +homebuilt.aero +insurance.aero +journal.aero +journalist.aero +leasing.aero +logistics.aero +magazine.aero +maintenance.aero +marketplace.aero +media.aero +microlight.aero +modelling.aero +navigation.aero +parachuting.aero +paragliding.aero +passenger-association.aero +pilot.aero +press.aero +production.aero +recreation.aero +repbody.aero +res.aero +research.aero +rotorcraft.aero +safety.aero +scientist.aero +services.aero +show.aero +skydiving.aero +software.aero +student.aero +taxi.aero +trader.aero +trading.aero +trainer.aero +union.aero +workinggroup.aero +works.aero + +// af : http://www.nic.af/help.jsp +af +gov.af +com.af +org.af +net.af +edu.af + +// ag : http://www.nic.ag/prices.htm +ag +com.ag +org.ag +net.ag +co.ag +nom.ag + +// ai : http://nic.com.ai/ +ai +off.ai +com.ai +net.ai +org.ai + +// al : http://www.ert.gov.al/ert_alb/faq_det.html?Id=31 +al +com.al +edu.al +gov.al +mil.al +net.al +org.al + +// am : http://en.wikipedia.org/wiki/.am +am + +// an : http://www.una.an/an_domreg/default.asp +an +com.an +net.an +org.an +edu.an + +// ao : http://en.wikipedia.org/wiki/.ao +// http://www.dns.ao/REGISTR.DOC +ao +ed.ao +gv.ao +og.ao +co.ao +pb.ao +it.ao + +// aq : http://en.wikipedia.org/wiki/.aq +aq + +// ar : http://en.wikipedia.org/wiki/.ar +*.ar +!congresodelalengua3.ar +!educ.ar +!gobiernoelectronico.ar +!mecon.ar +!nacion.ar +!nic.ar +!promocion.ar +!retina.ar +!uba.ar + +// arpa : http://en.wikipedia.org/wiki/.arpa +// Confirmed by registry 2008-06-18 +e164.arpa +in-addr.arpa +ip6.arpa +iris.arpa +uri.arpa +urn.arpa + +// as : http://en.wikipedia.org/wiki/.as +as +gov.as + +// asia : http://en.wikipedia.org/wiki/.asia +asia + +// at : http://en.wikipedia.org/wiki/.at +// Confirmed by registry 2008-06-17 +at +ac.at +co.at +gv.at +or.at + +// au : http://en.wikipedia.org/wiki/.au +// http://www.auda.org.au/ +// 2LDs +com.au +net.au +org.au +edu.au +gov.au +asn.au +id.au +csiro.au +// Historic 2LDs 
(closed to new registration, but sites still exist) +info.au +conf.au +oz.au +// CGDNs - http://www.cgdn.org.au/ +act.au +nsw.au +nt.au +qld.au +sa.au +tas.au +vic.au +wa.au +// 3LDs +act.edu.au +nsw.edu.au +nt.edu.au +qld.edu.au +sa.edu.au +tas.edu.au +vic.edu.au +wa.edu.au +act.gov.au +// Removed at request of Shae.Donelan@services.nsw.gov.au, 2010-03-04 +// nsw.gov.au +nt.gov.au +qld.gov.au +sa.gov.au +tas.gov.au +vic.gov.au +wa.gov.au + +// aw : http://en.wikipedia.org/wiki/.aw +aw +com.aw + +// ax : http://en.wikipedia.org/wiki/.ax +ax + +// az : http://en.wikipedia.org/wiki/.az +az +com.az +net.az +int.az +gov.az +org.az +edu.az +info.az +pp.az +mil.az +name.az +pro.az +biz.az + +// ba : http://en.wikipedia.org/wiki/.ba +ba +org.ba +net.ba +edu.ba +gov.ba +mil.ba +unsa.ba +unbi.ba +co.ba +com.ba +rs.ba + +// bb : http://en.wikipedia.org/wiki/.bb +bb +biz.bb +com.bb +edu.bb +gov.bb +info.bb +net.bb +org.bb +store.bb + +// bd : http://en.wikipedia.org/wiki/.bd +*.bd + +// be : http://en.wikipedia.org/wiki/.be +// Confirmed by registry 2008-06-08 +be +ac.be + +// bf : http://en.wikipedia.org/wiki/.bf +bf +gov.bf + +// bg : http://en.wikipedia.org/wiki/.bg +// https://www.register.bg/user/static/rules/en/index.html +bg +a.bg +b.bg +c.bg +d.bg +e.bg +f.bg +g.bg +h.bg +i.bg +j.bg +k.bg +l.bg +m.bg +n.bg +o.bg +p.bg +q.bg +r.bg +s.bg +t.bg +u.bg +v.bg +w.bg +x.bg +y.bg +z.bg +0.bg +1.bg +2.bg +3.bg +4.bg +5.bg +6.bg +7.bg +8.bg +9.bg + +// bh : http://en.wikipedia.org/wiki/.bh +bh +com.bh +edu.bh +net.bh +org.bh +gov.bh + +// bi : http://en.wikipedia.org/wiki/.bi +// http://whois.nic.bi/ +bi +co.bi +com.bi +edu.bi +or.bi +org.bi + +// biz : http://en.wikipedia.org/wiki/.biz +biz + +// bj : http://en.wikipedia.org/wiki/.bj +bj +asso.bj +barreau.bj +gouv.bj + +// bm : http://www.bermudanic.bm/dnr-text.txt +bm +com.bm +edu.bm +gov.bm +net.bm +org.bm + +// bn : http://en.wikipedia.org/wiki/.bn +*.bn + +// bo : http://www.nic.bo/ +bo +com.bo +edu.bo +gov.bo +gob.bo 
+int.bo +org.bo +net.bo +mil.bo +tv.bo + +// br : http://registro.br/dominio/dpn.html +// Updated by registry 2011-03-01 +br +adm.br +adv.br +agr.br +am.br +arq.br +art.br +ato.br +b.br +bio.br +blog.br +bmd.br +cim.br +cng.br +cnt.br +com.br +coop.br +ecn.br +eco.br +edu.br +emp.br +eng.br +esp.br +etc.br +eti.br +far.br +flog.br +fm.br +fnd.br +fot.br +fst.br +g12.br +ggf.br +gov.br +imb.br +ind.br +inf.br +jor.br +jus.br +leg.br +lel.br +mat.br +med.br +mil.br +mus.br +net.br +nom.br +not.br +ntr.br +odo.br +org.br +ppg.br +pro.br +psc.br +psi.br +qsl.br +radio.br +rec.br +slg.br +srv.br +taxi.br +teo.br +tmp.br +trd.br +tur.br +tv.br +vet.br +vlog.br +wiki.br +zlg.br + +// bs : http://www.nic.bs/rules.html +bs +com.bs +net.bs +org.bs +edu.bs +gov.bs + +// bt : http://en.wikipedia.org/wiki/.bt +bt +com.bt +edu.bt +gov.bt +net.bt +org.bt + +// bv : No registrations at this time. +// Submitted by registry 2006-06-16 + +// bw : http://en.wikipedia.org/wiki/.bw +// http://www.gobin.info/domainname/bw.doc +// list of other 2nd level tlds ? +bw +co.bw +org.bw + +// by : http://en.wikipedia.org/wiki/.by +// http://tld.by/rules_2006_en.html +// list of other 2nd level tlds ? +by +gov.by +mil.by +// Official information does not indicate that com.by is a reserved +// second-level domain, but it's being used as one (see www.google.com.by and +// www.yahoo.com.by, for example), so we list it here for safety's sake. 
+com.by + +// http://hoster.by/ +of.by + +// bz : http://en.wikipedia.org/wiki/.bz +// http://www.belizenic.bz/ +bz +com.bz +net.bz +org.bz +edu.bz +gov.bz + +// ca : http://en.wikipedia.org/wiki/.ca +ca +// ca geographical names +ab.ca +bc.ca +mb.ca +nb.ca +nf.ca +nl.ca +ns.ca +nt.ca +nu.ca +on.ca +pe.ca +qc.ca +sk.ca +yk.ca +// gc.ca: http://en.wikipedia.org/wiki/.gc.ca +// see also: http://registry.gc.ca/en/SubdomainFAQ +gc.ca + +// cat : http://en.wikipedia.org/wiki/.cat +cat + +// cc : http://en.wikipedia.org/wiki/.cc +cc + +// cd : http://en.wikipedia.org/wiki/.cd +// see also: https://www.nic.cd/domain/insertDomain_2.jsp?act=1 +cd +gov.cd + +// cf : http://en.wikipedia.org/wiki/.cf +cf + +// cg : http://en.wikipedia.org/wiki/.cg +cg + +// ch : http://en.wikipedia.org/wiki/.ch +ch + +// ci : http://en.wikipedia.org/wiki/.ci +// http://www.nic.ci/index.php?page=charte +ci +org.ci +or.ci +com.ci +co.ci +edu.ci +ed.ci +ac.ci +net.ci +go.ci +asso.ci +aéroport.ci +int.ci +presse.ci +md.ci +gouv.ci + +// ck : http://en.wikipedia.org/wiki/.ck +*.ck +!www.ck + +// cl : http://en.wikipedia.org/wiki/.cl +cl +gov.cl +gob.cl +co.cl +mil.cl + +// cm : http://en.wikipedia.org/wiki/.cm +cm +gov.cm + +// cn : http://en.wikipedia.org/wiki/.cn +// Submitted by registry 2008-06-11 +cn +ac.cn +com.cn +edu.cn +gov.cn +net.cn +org.cn +mil.cn +公司.cn +网络.cn +網絡.cn +// cn geographic names +ah.cn +bj.cn +cq.cn +fj.cn +gd.cn +gs.cn +gz.cn +gx.cn +ha.cn +hb.cn +he.cn +hi.cn +hl.cn +hn.cn +jl.cn +js.cn +jx.cn +ln.cn +nm.cn +nx.cn +qh.cn +sc.cn +sd.cn +sh.cn +sn.cn +sx.cn +tj.cn +xj.cn +xz.cn +yn.cn +zj.cn +hk.cn +mo.cn +tw.cn + +// co : http://en.wikipedia.org/wiki/.co +// Submitted by registry 2008-06-11 +co +arts.co +com.co +edu.co +firm.co +gov.co +info.co +int.co +mil.co +net.co +nom.co +org.co +rec.co +web.co + +// com : http://en.wikipedia.org/wiki/.com +com + +// coop : http://en.wikipedia.org/wiki/.coop +coop + +// cr : 
http://www.nic.cr/niccr_publico/showRegistroDominiosScreen.do +cr +ac.cr +co.cr +ed.cr +fi.cr +go.cr +or.cr +sa.cr + +// cu : http://en.wikipedia.org/wiki/.cu +cu +com.cu +edu.cu +org.cu +net.cu +gov.cu +inf.cu + +// cv : http://en.wikipedia.org/wiki/.cv +cv + +// cw : http://www.una.cw/cw_registry/ +// Confirmed by registry 2013-03-26 +cw +com.cw +edu.cw +net.cw +org.cw + +// cx : http://en.wikipedia.org/wiki/.cx +// list of other 2nd level tlds ? +cx +gov.cx + +// cy : http://en.wikipedia.org/wiki/.cy +*.cy + +// cz : http://en.wikipedia.org/wiki/.cz +cz + +// de : http://en.wikipedia.org/wiki/.de +// Confirmed by registry (with technical +// reservations) 2008-07-01 +de + +// dj : http://en.wikipedia.org/wiki/.dj +dj + +// dk : http://en.wikipedia.org/wiki/.dk +// Confirmed by registry 2008-06-17 +dk + +// dm : http://en.wikipedia.org/wiki/.dm +dm +com.dm +net.dm +org.dm +edu.dm +gov.dm + +// do : http://en.wikipedia.org/wiki/.do +do +art.do +com.do +edu.do +gob.do +gov.do +mil.do +net.do +org.do +sld.do +web.do + +// dz : http://en.wikipedia.org/wiki/.dz +dz +com.dz +org.dz +net.dz +gov.dz +edu.dz +asso.dz +pol.dz +art.dz + +// ec : http://www.nic.ec/reg/paso1.asp +// Submitted by registry 2008-07-04 +ec +com.ec +info.ec +net.ec +fin.ec +k12.ec +med.ec +pro.ec +org.ec +edu.ec +gov.ec +gob.ec +mil.ec + +// edu : http://en.wikipedia.org/wiki/.edu +edu + +// ee : http://www.eenet.ee/EENet/dom_reeglid.html#lisa_B +ee +edu.ee +gov.ee +riik.ee +lib.ee +med.ee +com.ee +pri.ee +aip.ee +org.ee +fie.ee + +// eg : http://en.wikipedia.org/wiki/.eg +eg +com.eg +edu.eg +eun.eg +gov.eg +mil.eg +name.eg +net.eg +org.eg +sci.eg + +// er : http://en.wikipedia.org/wiki/.er +*.er + +// es : https://www.nic.es/site_ingles/ingles/dominios/index.html +es +com.es +nom.es +org.es +gob.es +edu.es + +// et : http://en.wikipedia.org/wiki/.et +*.et + +// eu : http://en.wikipedia.org/wiki/.eu +eu + +// fi : http://en.wikipedia.org/wiki/.fi +fi +// aland.fi : http://en.wikipedia.org/wiki/.ax 
+// This domain is being phased out in favor of .ax. As there are still many +// domains under aland.fi, we still keep it on the list until aland.fi is +// completely removed. +// TODO: Check for updates (expected to be phased out around Q1/2009) +aland.fi + +// fj : http://en.wikipedia.org/wiki/.fj +*.fj + +// fk : http://en.wikipedia.org/wiki/.fk +*.fk + +// fm : http://en.wikipedia.org/wiki/.fm +fm + +// fo : http://en.wikipedia.org/wiki/.fo +fo + +// fr : http://www.afnic.fr/ +// domaines descriptifs : http://www.afnic.fr/obtenir/chartes/nommage-fr/annexe-descriptifs +fr +com.fr +asso.fr +nom.fr +prd.fr +presse.fr +tm.fr +// domaines sectoriels : http://www.afnic.fr/obtenir/chartes/nommage-fr/annexe-sectoriels +aeroport.fr +assedic.fr +avocat.fr +avoues.fr +cci.fr +chambagri.fr +chirurgiens-dentistes.fr +experts-comptables.fr +geometre-expert.fr +gouv.fr +greta.fr +huissier-justice.fr +medecin.fr +notaires.fr +pharmacien.fr +port.fr +veterinaire.fr + +// ga : http://en.wikipedia.org/wiki/.ga +ga + +// gb : This registry is effectively dormant +// Submitted by registry 2008-06-12 + +// gd : http://en.wikipedia.org/wiki/.gd +gd + +// ge : http://www.nic.net.ge/policy_en.pdf +ge +com.ge +edu.ge +gov.ge +org.ge +mil.ge +net.ge +pvt.ge + +// gf : http://en.wikipedia.org/wiki/.gf +gf + +// gg : http://www.channelisles.net/applic/avextn.shtml +gg +co.gg +org.gg +net.gg +sch.gg +gov.gg + +// gh : http://en.wikipedia.org/wiki/.gh +// see also: http://www.nic.gh/reg_now.php +// Although domains directly at second level are not possible at the moment, +// they have been possible for some time and may come back. 
+gh +com.gh +edu.gh +gov.gh +org.gh +mil.gh + +// gi : http://www.nic.gi/rules.html +gi +com.gi +ltd.gi +gov.gi +mod.gi +edu.gi +org.gi + +// gl : http://en.wikipedia.org/wiki/.gl +// http://nic.gl +gl + +// gm : http://www.nic.gm/htmlpages%5Cgm-policy.htm +gm + +// gn : http://psg.com/dns/gn/gn.txt +// Submitted by registry 2008-06-17 +ac.gn +com.gn +edu.gn +gov.gn +org.gn +net.gn + +// gov : http://en.wikipedia.org/wiki/.gov +gov + +// gp : http://www.nic.gp/index.php?lang=en +gp +com.gp +net.gp +mobi.gp +edu.gp +org.gp +asso.gp + +// gq : http://en.wikipedia.org/wiki/.gq +gq + +// gr : https://grweb.ics.forth.gr/english/1617-B-2005.html +// Submitted by registry 2008-06-09 +gr +com.gr +edu.gr +net.gr +org.gr +gov.gr + +// gs : http://en.wikipedia.org/wiki/.gs +gs + +// gt : http://www.gt/politicas_de_registro.html +gt +com.gt +edu.gt +gob.gt +ind.gt +mil.gt +net.gt +org.gt + +// gu : http://gadao.gov.gu/registration.txt +*.gu + +// gw : http://en.wikipedia.org/wiki/.gw +gw + +// gy : http://en.wikipedia.org/wiki/.gy +// http://registry.gy/ +gy +co.gy +com.gy +net.gy + +// hk : https://www.hkdnr.hk +// Submitted by registry 2008-06-11 +hk +com.hk +edu.hk +gov.hk +idv.hk +net.hk +org.hk +公司.hk +教育.hk +敎育.hk +政府.hk +個人.hk +个人.hk +箇人.hk +網络.hk +网络.hk +组織.hk +網絡.hk +网絡.hk +组织.hk +組織.hk +組织.hk + +// hm : http://en.wikipedia.org/wiki/.hm +hm + +// hn : http://www.nic.hn/politicas/ps02,,05.html +hn +com.hn +edu.hn +org.hn +net.hn +mil.hn +gob.hn + +// hr : http://www.dns.hr/documents/pdf/HRTLD-regulations.pdf +hr +iz.hr +from.hr +name.hr +com.hr + +// ht : http://www.nic.ht/info/charte.cfm +ht +com.ht +shop.ht +firm.ht +info.ht +adult.ht +net.ht +pro.ht +org.ht +med.ht +art.ht +coop.ht +pol.ht +asso.ht +edu.ht +rel.ht +gouv.ht +perso.ht + +// hu : http://www.domain.hu/domain/English/sld.html +// Confirmed by registry 2008-06-12 +hu +co.hu +info.hu +org.hu +priv.hu +sport.hu +tm.hu +2000.hu +agrar.hu +bolt.hu +casino.hu +city.hu +erotica.hu +erotika.hu +film.hu +forum.hu 
+games.hu +hotel.hu +ingatlan.hu +jogasz.hu +konyvelo.hu +lakas.hu +media.hu +news.hu +reklam.hu +sex.hu +shop.hu +suli.hu +szex.hu +tozsde.hu +utazas.hu +video.hu + +// id : https://register.pandi.or.id/ +id +ac.id +biz.id +co.id +go.id +mil.id +my.id +net.id +or.id +sch.id +web.id + +// ie : http://en.wikipedia.org/wiki/.ie +ie +gov.ie + +// il : http://en.wikipedia.org/wiki/.il +*.il + +// im : https://www.nic.im/pdfs/imfaqs.pdf +im +co.im +ltd.co.im +plc.co.im +net.im +gov.im +org.im +nic.im +ac.im + +// in : http://en.wikipedia.org/wiki/.in +// see also: http://www.inregistry.in/policies/ +// Please note, that nic.in is not an offical eTLD, but used by most +// government institutions. +in +co.in +firm.in +net.in +org.in +gen.in +ind.in +nic.in +ac.in +edu.in +res.in +gov.in +mil.in + +// info : http://en.wikipedia.org/wiki/.info +info + +// int : http://en.wikipedia.org/wiki/.int +// Confirmed by registry 2008-06-18 +int +eu.int + +// io : http://www.nic.io/rules.html +// list of other 2nd level tlds ? 
+io +com.io + +// iq : http://www.cmc.iq/english/iq/iqregister1.htm +iq +gov.iq +edu.iq +mil.iq +com.iq +org.iq +net.iq + +// ir : http://www.nic.ir/Terms_and_Conditions_ir,_Appendix_1_Domain_Rules +// Also see http://www.nic.ir/Internationalized_Domain_Names +// Two .ir entries added at request of , 2010-04-16 +ir +ac.ir +co.ir +gov.ir +id.ir +net.ir +org.ir +sch.ir +// xn--mgba3a4f16a.ir (.ir, Persian YEH) +ایران.ir +// xn--mgba3a4fra.ir (.ir, Arabic YEH) +ايران.ir + +// is : http://www.isnic.is/domain/rules.php +// Confirmed by registry 2008-12-06 +is +net.is +com.is +edu.is +gov.is +org.is +int.is + +// it : http://en.wikipedia.org/wiki/.it +it +gov.it +edu.it +// list of reserved geo-names : +// http://www.nic.it/documenti/regolamenti-e-linee-guida/regolamento-assegnazione-versione-6.0.pdf +// (There is also a list of reserved geo-names corresponding to Italian +// municipalities : http://www.nic.it/documenti/appendice-c.pdf , but it is +// not included here.) +agrigento.it +ag.it +alessandria.it +al.it +ancona.it +an.it +aosta.it +aoste.it +ao.it +arezzo.it +ar.it +ascoli-piceno.it +ascolipiceno.it +ap.it +asti.it +at.it +avellino.it +av.it +bari.it +ba.it +andria-barletta-trani.it +andriabarlettatrani.it +trani-barletta-andria.it +tranibarlettaandria.it +barletta-trani-andria.it +barlettatraniandria.it +andria-trani-barletta.it +andriatranibarletta.it +trani-andria-barletta.it +traniandriabarletta.it +bt.it +belluno.it +bl.it +benevento.it +bn.it +bergamo.it +bg.it +biella.it +bi.it +bologna.it +bo.it +bolzano.it +bozen.it +balsan.it +alto-adige.it +altoadige.it +suedtirol.it +bz.it +brescia.it +bs.it +brindisi.it +br.it +cagliari.it +ca.it +caltanissetta.it +cl.it +campobasso.it +cb.it +carboniaiglesias.it +carbonia-iglesias.it +iglesias-carbonia.it +iglesiascarbonia.it +ci.it +caserta.it +ce.it +catania.it +ct.it +catanzaro.it +cz.it +chieti.it +ch.it +como.it +co.it +cosenza.it +cs.it +cremona.it +cr.it +crotone.it +kr.it +cuneo.it +cn.it 
+dell-ogliastra.it +dellogliastra.it +ogliastra.it +og.it +enna.it +en.it +ferrara.it +fe.it +fermo.it +fm.it +firenze.it +florence.it +fi.it +foggia.it +fg.it +forli-cesena.it +forlicesena.it +cesena-forli.it +cesenaforli.it +fc.it +frosinone.it +fr.it +genova.it +genoa.it +ge.it +gorizia.it +go.it +grosseto.it +gr.it +imperia.it +im.it +isernia.it +is.it +laquila.it +aquila.it +aq.it +la-spezia.it +laspezia.it +sp.it +latina.it +lt.it +lecce.it +le.it +lecco.it +lc.it +livorno.it +li.it +lodi.it +lo.it +lucca.it +lu.it +macerata.it +mc.it +mantova.it +mn.it +massa-carrara.it +massacarrara.it +carrara-massa.it +carraramassa.it +ms.it +matera.it +mt.it +medio-campidano.it +mediocampidano.it +campidano-medio.it +campidanomedio.it +vs.it +messina.it +me.it +milano.it +milan.it +mi.it +modena.it +mo.it +monza.it +monza-brianza.it +monzabrianza.it +monzaebrianza.it +monzaedellabrianza.it +monza-e-della-brianza.it +mb.it +napoli.it +naples.it +na.it +novara.it +no.it +nuoro.it +nu.it +oristano.it +or.it +padova.it +padua.it +pd.it +palermo.it +pa.it +parma.it +pr.it +pavia.it +pv.it +perugia.it +pg.it +pescara.it +pe.it +pesaro-urbino.it +pesarourbino.it +urbino-pesaro.it +urbinopesaro.it +pu.it +piacenza.it +pc.it +pisa.it +pi.it +pistoia.it +pt.it +pordenone.it +pn.it +potenza.it +pz.it +prato.it +po.it +ragusa.it +rg.it +ravenna.it +ra.it +reggio-calabria.it +reggiocalabria.it +rc.it +reggio-emilia.it +reggioemilia.it +re.it +rieti.it +ri.it +rimini.it +rn.it +roma.it +rome.it +rm.it +rovigo.it +ro.it +salerno.it +sa.it +sassari.it +ss.it +savona.it +sv.it +siena.it +si.it +siracusa.it +sr.it +sondrio.it +so.it +taranto.it +ta.it +tempio-olbia.it +tempioolbia.it +olbia-tempio.it +olbiatempio.it +ot.it +teramo.it +te.it +terni.it +tr.it +torino.it +turin.it +to.it +trapani.it +tp.it +trento.it +trentino.it +tn.it +treviso.it +tv.it +trieste.it +ts.it +udine.it +ud.it +varese.it +va.it +venezia.it +venice.it +ve.it +verbania.it +vb.it +vercelli.it +vc.it +verona.it 
+vr.it +vibo-valentia.it +vibovalentia.it +vv.it +vicenza.it +vi.it +viterbo.it +vt.it + +// je : http://www.channelisles.net/applic/avextn.shtml +je +co.je +org.je +net.je +sch.je +gov.je + +// jm : http://www.com.jm/register.html +*.jm + +// jo : http://www.dns.jo/Registration_policy.aspx +jo +com.jo +org.jo +net.jo +edu.jo +sch.jo +gov.jo +mil.jo +name.jo + +// jobs : http://en.wikipedia.org/wiki/.jobs +jobs + +// jp : http://en.wikipedia.org/wiki/.jp +// http://jprs.co.jp/en/jpdomain.html +// Updated by registry 2012-05-28 +jp +// jp organizational type names +ac.jp +ad.jp +co.jp +ed.jp +go.jp +gr.jp +lg.jp +ne.jp +or.jp +// jp preficture type names +aichi.jp +akita.jp +aomori.jp +chiba.jp +ehime.jp +fukui.jp +fukuoka.jp +fukushima.jp +gifu.jp +gunma.jp +hiroshima.jp +hokkaido.jp +hyogo.jp +ibaraki.jp +ishikawa.jp +iwate.jp +kagawa.jp +kagoshima.jp +kanagawa.jp +kochi.jp +kumamoto.jp +kyoto.jp +mie.jp +miyagi.jp +miyazaki.jp +nagano.jp +nagasaki.jp +nara.jp +niigata.jp +oita.jp +okayama.jp +okinawa.jp +osaka.jp +saga.jp +saitama.jp +shiga.jp +shimane.jp +shizuoka.jp +tochigi.jp +tokushima.jp +tokyo.jp +tottori.jp +toyama.jp +wakayama.jp +yamagata.jp +yamaguchi.jp +yamanashi.jp +// jp geographic type names +// http://jprs.jp/doc/rule/saisoku-1.html +*.kawasaki.jp +*.kitakyushu.jp +*.kobe.jp +*.nagoya.jp +*.sapporo.jp +*.sendai.jp +*.yokohama.jp +!city.kawasaki.jp +!city.kitakyushu.jp +!city.kobe.jp +!city.nagoya.jp +!city.sapporo.jp +!city.sendai.jp +!city.yokohama.jp +// 4th level registration +aisai.aichi.jp +ama.aichi.jp +anjo.aichi.jp +asuke.aichi.jp +chiryu.aichi.jp +chita.aichi.jp +fuso.aichi.jp +gamagori.aichi.jp +handa.aichi.jp +hazu.aichi.jp +hekinan.aichi.jp +higashiura.aichi.jp +ichinomiya.aichi.jp +inazawa.aichi.jp +inuyama.aichi.jp +isshiki.aichi.jp +iwakura.aichi.jp +kanie.aichi.jp +kariya.aichi.jp +kasugai.aichi.jp +kira.aichi.jp +kiyosu.aichi.jp +komaki.aichi.jp +konan.aichi.jp +kota.aichi.jp +mihama.aichi.jp +miyoshi.aichi.jp +nagakute.aichi.jp 
+nishio.aichi.jp +nisshin.aichi.jp +obu.aichi.jp +oguchi.aichi.jp +oharu.aichi.jp +okazaki.aichi.jp +owariasahi.aichi.jp +seto.aichi.jp +shikatsu.aichi.jp +shinshiro.aichi.jp +shitara.aichi.jp +tahara.aichi.jp +takahama.aichi.jp +tobishima.aichi.jp +toei.aichi.jp +togo.aichi.jp +tokai.aichi.jp +tokoname.aichi.jp +toyoake.aichi.jp +toyohashi.aichi.jp +toyokawa.aichi.jp +toyone.aichi.jp +toyota.aichi.jp +tsushima.aichi.jp +yatomi.aichi.jp +akita.akita.jp +daisen.akita.jp +fujisato.akita.jp +gojome.akita.jp +hachirogata.akita.jp +happou.akita.jp +higashinaruse.akita.jp +honjo.akita.jp +honjyo.akita.jp +ikawa.akita.jp +kamikoani.akita.jp +kamioka.akita.jp +katagami.akita.jp +kazuno.akita.jp +kitaakita.akita.jp +kosaka.akita.jp +kyowa.akita.jp +misato.akita.jp +mitane.akita.jp +moriyoshi.akita.jp +nikaho.akita.jp +noshiro.akita.jp +odate.akita.jp +oga.akita.jp +ogata.akita.jp +semboku.akita.jp +yokote.akita.jp +yurihonjo.akita.jp +aomori.aomori.jp +gonohe.aomori.jp +hachinohe.aomori.jp +hashikami.aomori.jp +hiranai.aomori.jp +hirosaki.aomori.jp +itayanagi.aomori.jp +kuroishi.aomori.jp +misawa.aomori.jp +mutsu.aomori.jp +nakadomari.aomori.jp +noheji.aomori.jp +oirase.aomori.jp +owani.aomori.jp +rokunohe.aomori.jp +sannohe.aomori.jp +shichinohe.aomori.jp +shingo.aomori.jp +takko.aomori.jp +towada.aomori.jp +tsugaru.aomori.jp +tsuruta.aomori.jp +abiko.chiba.jp +asahi.chiba.jp +chonan.chiba.jp +chosei.chiba.jp +choshi.chiba.jp +chuo.chiba.jp +funabashi.chiba.jp +futtsu.chiba.jp +hanamigawa.chiba.jp +ichihara.chiba.jp +ichikawa.chiba.jp +ichinomiya.chiba.jp +inzai.chiba.jp +isumi.chiba.jp +kamagaya.chiba.jp +kamogawa.chiba.jp +kashiwa.chiba.jp +katori.chiba.jp +katsuura.chiba.jp +kimitsu.chiba.jp +kisarazu.chiba.jp +kozaki.chiba.jp +kujukuri.chiba.jp +kyonan.chiba.jp +matsudo.chiba.jp +midori.chiba.jp +mihama.chiba.jp +minamiboso.chiba.jp +mobara.chiba.jp +mutsuzawa.chiba.jp +nagara.chiba.jp +nagareyama.chiba.jp +narashino.chiba.jp +narita.chiba.jp +noda.chiba.jp 
+oamishirasato.chiba.jp +omigawa.chiba.jp +onjuku.chiba.jp +otaki.chiba.jp +sakae.chiba.jp +sakura.chiba.jp +shimofusa.chiba.jp +shirako.chiba.jp +shiroi.chiba.jp +shisui.chiba.jp +sodegaura.chiba.jp +sosa.chiba.jp +tako.chiba.jp +tateyama.chiba.jp +togane.chiba.jp +tohnosho.chiba.jp +tomisato.chiba.jp +urayasu.chiba.jp +yachimata.chiba.jp +yachiyo.chiba.jp +yokaichiba.chiba.jp +yokoshibahikari.chiba.jp +yotsukaido.chiba.jp +ainan.ehime.jp +honai.ehime.jp +ikata.ehime.jp +imabari.ehime.jp +iyo.ehime.jp +kamijima.ehime.jp +kihoku.ehime.jp +kumakogen.ehime.jp +masaki.ehime.jp +matsuno.ehime.jp +matsuyama.ehime.jp +namikata.ehime.jp +niihama.ehime.jp +ozu.ehime.jp +saijo.ehime.jp +seiyo.ehime.jp +shikokuchuo.ehime.jp +tobe.ehime.jp +toon.ehime.jp +uchiko.ehime.jp +uwajima.ehime.jp +yawatahama.ehime.jp +echizen.fukui.jp +eiheiji.fukui.jp +fukui.fukui.jp +ikeda.fukui.jp +katsuyama.fukui.jp +mihama.fukui.jp +minamiechizen.fukui.jp +obama.fukui.jp +ohi.fukui.jp +ono.fukui.jp +sabae.fukui.jp +sakai.fukui.jp +takahama.fukui.jp +tsuruga.fukui.jp +wakasa.fukui.jp +ashiya.fukuoka.jp +buzen.fukuoka.jp +chikugo.fukuoka.jp +chikuho.fukuoka.jp +chikujo.fukuoka.jp +chikushino.fukuoka.jp +chikuzen.fukuoka.jp +chuo.fukuoka.jp +dazaifu.fukuoka.jp +fukuchi.fukuoka.jp +hakata.fukuoka.jp +higashi.fukuoka.jp +hirokawa.fukuoka.jp +hisayama.fukuoka.jp +iizuka.fukuoka.jp +inatsuki.fukuoka.jp +kaho.fukuoka.jp +kasuga.fukuoka.jp +kasuya.fukuoka.jp +kawara.fukuoka.jp +keisen.fukuoka.jp +koga.fukuoka.jp +kurate.fukuoka.jp +kurogi.fukuoka.jp +kurume.fukuoka.jp +minami.fukuoka.jp +miyako.fukuoka.jp +miyama.fukuoka.jp +miyawaka.fukuoka.jp +mizumaki.fukuoka.jp +munakata.fukuoka.jp +nakagawa.fukuoka.jp +nakama.fukuoka.jp +nishi.fukuoka.jp +nogata.fukuoka.jp +ogori.fukuoka.jp +okagaki.fukuoka.jp +okawa.fukuoka.jp +oki.fukuoka.jp +omuta.fukuoka.jp +onga.fukuoka.jp +onojo.fukuoka.jp +oto.fukuoka.jp +saigawa.fukuoka.jp +sasaguri.fukuoka.jp +shingu.fukuoka.jp +shinyoshitomi.fukuoka.jp +shonai.fukuoka.jp 
+soeda.fukuoka.jp +sue.fukuoka.jp +tachiarai.fukuoka.jp +tagawa.fukuoka.jp +takata.fukuoka.jp +toho.fukuoka.jp +toyotsu.fukuoka.jp +tsuiki.fukuoka.jp +ukiha.fukuoka.jp +umi.fukuoka.jp +usui.fukuoka.jp +yamada.fukuoka.jp +yame.fukuoka.jp +yanagawa.fukuoka.jp +yukuhashi.fukuoka.jp +aizubange.fukushima.jp +aizumisato.fukushima.jp +aizuwakamatsu.fukushima.jp +asakawa.fukushima.jp +bandai.fukushima.jp +date.fukushima.jp +fukushima.fukushima.jp +furudono.fukushima.jp +futaba.fukushima.jp +hanawa.fukushima.jp +higashi.fukushima.jp +hirata.fukushima.jp +hirono.fukushima.jp +iitate.fukushima.jp +inawashiro.fukushima.jp +ishikawa.fukushima.jp +iwaki.fukushima.jp +izumizaki.fukushima.jp +kagamiishi.fukushima.jp +kaneyama.fukushima.jp +kawamata.fukushima.jp +kitakata.fukushima.jp +kitashiobara.fukushima.jp +koori.fukushima.jp +koriyama.fukushima.jp +kunimi.fukushima.jp +miharu.fukushima.jp +mishima.fukushima.jp +namie.fukushima.jp +nango.fukushima.jp +nishiaizu.fukushima.jp +nishigo.fukushima.jp +okuma.fukushima.jp +omotego.fukushima.jp +ono.fukushima.jp +otama.fukushima.jp +samegawa.fukushima.jp +shimogo.fukushima.jp +shirakawa.fukushima.jp +showa.fukushima.jp +soma.fukushima.jp +sukagawa.fukushima.jp +taishin.fukushima.jp +tamakawa.fukushima.jp +tanagura.fukushima.jp +tenei.fukushima.jp +yabuki.fukushima.jp +yamato.fukushima.jp +yamatsuri.fukushima.jp +yanaizu.fukushima.jp +yugawa.fukushima.jp +anpachi.gifu.jp +ena.gifu.jp +gifu.gifu.jp +ginan.gifu.jp +godo.gifu.jp +gujo.gifu.jp +hashima.gifu.jp +hichiso.gifu.jp +hida.gifu.jp +higashishirakawa.gifu.jp +ibigawa.gifu.jp +ikeda.gifu.jp +kakamigahara.gifu.jp +kani.gifu.jp +kasahara.gifu.jp +kasamatsu.gifu.jp +kawaue.gifu.jp +kitagata.gifu.jp +mino.gifu.jp +minokamo.gifu.jp +mitake.gifu.jp +mizunami.gifu.jp +motosu.gifu.jp +nakatsugawa.gifu.jp +ogaki.gifu.jp +sakahogi.gifu.jp +seki.gifu.jp +sekigahara.gifu.jp +shirakawa.gifu.jp +tajimi.gifu.jp +takayama.gifu.jp +tarui.gifu.jp +toki.gifu.jp +tomika.gifu.jp +wanouchi.gifu.jp 
+yamagata.gifu.jp +yaotsu.gifu.jp +yoro.gifu.jp +annaka.gunma.jp +chiyoda.gunma.jp +fujioka.gunma.jp +higashiagatsuma.gunma.jp +isesaki.gunma.jp +itakura.gunma.jp +kanna.gunma.jp +kanra.gunma.jp +katashina.gunma.jp +kawaba.gunma.jp +kiryu.gunma.jp +kusatsu.gunma.jp +maebashi.gunma.jp +meiwa.gunma.jp +midori.gunma.jp +minakami.gunma.jp +naganohara.gunma.jp +nakanojo.gunma.jp +nanmoku.gunma.jp +numata.gunma.jp +oizumi.gunma.jp +ora.gunma.jp +ota.gunma.jp +shibukawa.gunma.jp +shimonita.gunma.jp +shinto.gunma.jp +showa.gunma.jp +takasaki.gunma.jp +takayama.gunma.jp +tamamura.gunma.jp +tatebayashi.gunma.jp +tomioka.gunma.jp +tsukiyono.gunma.jp +tsumagoi.gunma.jp +ueno.gunma.jp +yoshioka.gunma.jp +asaminami.hiroshima.jp +daiwa.hiroshima.jp +etajima.hiroshima.jp +fuchu.hiroshima.jp +fukuyama.hiroshima.jp +hatsukaichi.hiroshima.jp +higashihiroshima.hiroshima.jp +hongo.hiroshima.jp +jinsekikogen.hiroshima.jp +kaita.hiroshima.jp +kui.hiroshima.jp +kumano.hiroshima.jp +kure.hiroshima.jp +mihara.hiroshima.jp +miyoshi.hiroshima.jp +naka.hiroshima.jp +onomichi.hiroshima.jp +osakikamijima.hiroshima.jp +otake.hiroshima.jp +saka.hiroshima.jp +sera.hiroshima.jp +seranishi.hiroshima.jp +shinichi.hiroshima.jp +shobara.hiroshima.jp +takehara.hiroshima.jp +abashiri.hokkaido.jp +abira.hokkaido.jp +aibetsu.hokkaido.jp +akabira.hokkaido.jp +akkeshi.hokkaido.jp +asahikawa.hokkaido.jp +ashibetsu.hokkaido.jp +ashoro.hokkaido.jp +assabu.hokkaido.jp +atsuma.hokkaido.jp +bibai.hokkaido.jp +biei.hokkaido.jp +bifuka.hokkaido.jp +bihoro.hokkaido.jp +biratori.hokkaido.jp +chippubetsu.hokkaido.jp +chitose.hokkaido.jp +date.hokkaido.jp +ebetsu.hokkaido.jp +embetsu.hokkaido.jp +eniwa.hokkaido.jp +erimo.hokkaido.jp +esan.hokkaido.jp +esashi.hokkaido.jp +fukagawa.hokkaido.jp +fukushima.hokkaido.jp +furano.hokkaido.jp +furubira.hokkaido.jp +haboro.hokkaido.jp +hakodate.hokkaido.jp +hamatonbetsu.hokkaido.jp +hidaka.hokkaido.jp +higashikagura.hokkaido.jp +higashikawa.hokkaido.jp +hiroo.hokkaido.jp 
+hokuryu.hokkaido.jp +hokuto.hokkaido.jp +honbetsu.hokkaido.jp +horokanai.hokkaido.jp +horonobe.hokkaido.jp +ikeda.hokkaido.jp +imakane.hokkaido.jp +ishikari.hokkaido.jp +iwamizawa.hokkaido.jp +iwanai.hokkaido.jp +kamifurano.hokkaido.jp +kamikawa.hokkaido.jp +kamishihoro.hokkaido.jp +kamisunagawa.hokkaido.jp +kamoenai.hokkaido.jp +kayabe.hokkaido.jp +kembuchi.hokkaido.jp +kikonai.hokkaido.jp +kimobetsu.hokkaido.jp +kitahiroshima.hokkaido.jp +kitami.hokkaido.jp +kiyosato.hokkaido.jp +koshimizu.hokkaido.jp +kunneppu.hokkaido.jp +kuriyama.hokkaido.jp +kuromatsunai.hokkaido.jp +kushiro.hokkaido.jp +kutchan.hokkaido.jp +kyowa.hokkaido.jp +mashike.hokkaido.jp +matsumae.hokkaido.jp +mikasa.hokkaido.jp +minamifurano.hokkaido.jp +mombetsu.hokkaido.jp +moseushi.hokkaido.jp +mukawa.hokkaido.jp +muroran.hokkaido.jp +naie.hokkaido.jp +nakagawa.hokkaido.jp +nakasatsunai.hokkaido.jp +nakatombetsu.hokkaido.jp +nanae.hokkaido.jp +nanporo.hokkaido.jp +nayoro.hokkaido.jp +nemuro.hokkaido.jp +niikappu.hokkaido.jp +niki.hokkaido.jp +nishiokoppe.hokkaido.jp +noboribetsu.hokkaido.jp +numata.hokkaido.jp +obihiro.hokkaido.jp +obira.hokkaido.jp +oketo.hokkaido.jp +okoppe.hokkaido.jp +otaru.hokkaido.jp +otobe.hokkaido.jp +otofuke.hokkaido.jp +otoineppu.hokkaido.jp +oumu.hokkaido.jp +ozora.hokkaido.jp +pippu.hokkaido.jp +rankoshi.hokkaido.jp +rebun.hokkaido.jp +rikubetsu.hokkaido.jp +rishiri.hokkaido.jp +rishirifuji.hokkaido.jp +saroma.hokkaido.jp +sarufutsu.hokkaido.jp +shakotan.hokkaido.jp +shari.hokkaido.jp +shibecha.hokkaido.jp +shibetsu.hokkaido.jp +shikabe.hokkaido.jp +shikaoi.hokkaido.jp +shimamaki.hokkaido.jp +shimizu.hokkaido.jp +shimokawa.hokkaido.jp +shinshinotsu.hokkaido.jp +shintoku.hokkaido.jp +shiranuka.hokkaido.jp +shiraoi.hokkaido.jp +shiriuchi.hokkaido.jp +sobetsu.hokkaido.jp +sunagawa.hokkaido.jp +taiki.hokkaido.jp +takasu.hokkaido.jp +takikawa.hokkaido.jp +takinoue.hokkaido.jp +teshikaga.hokkaido.jp +tobetsu.hokkaido.jp +tohma.hokkaido.jp +tomakomai.hokkaido.jp 
+tomari.hokkaido.jp +toya.hokkaido.jp +toyako.hokkaido.jp +toyotomi.hokkaido.jp +toyoura.hokkaido.jp +tsubetsu.hokkaido.jp +tsukigata.hokkaido.jp +urakawa.hokkaido.jp +urausu.hokkaido.jp +uryu.hokkaido.jp +utashinai.hokkaido.jp +wakkanai.hokkaido.jp +wassamu.hokkaido.jp +yakumo.hokkaido.jp +yoichi.hokkaido.jp +aioi.hyogo.jp +akashi.hyogo.jp +ako.hyogo.jp +amagasaki.hyogo.jp +aogaki.hyogo.jp +asago.hyogo.jp +ashiya.hyogo.jp +awaji.hyogo.jp +fukusaki.hyogo.jp +goshiki.hyogo.jp +harima.hyogo.jp +himeji.hyogo.jp +ichikawa.hyogo.jp +inagawa.hyogo.jp +itami.hyogo.jp +kakogawa.hyogo.jp +kamigori.hyogo.jp +kamikawa.hyogo.jp +kasai.hyogo.jp +kasuga.hyogo.jp +kawanishi.hyogo.jp +miki.hyogo.jp +minamiawaji.hyogo.jp +nishinomiya.hyogo.jp +nishiwaki.hyogo.jp +ono.hyogo.jp +sanda.hyogo.jp +sannan.hyogo.jp +sasayama.hyogo.jp +sayo.hyogo.jp +shingu.hyogo.jp +shinonsen.hyogo.jp +shiso.hyogo.jp +sumoto.hyogo.jp +taishi.hyogo.jp +taka.hyogo.jp +takarazuka.hyogo.jp +takasago.hyogo.jp +takino.hyogo.jp +tamba.hyogo.jp +tatsuno.hyogo.jp +toyooka.hyogo.jp +yabu.hyogo.jp +yashiro.hyogo.jp +yoka.hyogo.jp +yokawa.hyogo.jp +ami.ibaraki.jp +asahi.ibaraki.jp +bando.ibaraki.jp +chikusei.ibaraki.jp +daigo.ibaraki.jp +fujishiro.ibaraki.jp +hitachi.ibaraki.jp +hitachinaka.ibaraki.jp +hitachiomiya.ibaraki.jp +hitachiota.ibaraki.jp +ibaraki.ibaraki.jp +ina.ibaraki.jp +inashiki.ibaraki.jp +itako.ibaraki.jp +iwama.ibaraki.jp +joso.ibaraki.jp +kamisu.ibaraki.jp +kasama.ibaraki.jp +kashima.ibaraki.jp +kasumigaura.ibaraki.jp +koga.ibaraki.jp +miho.ibaraki.jp +mito.ibaraki.jp +moriya.ibaraki.jp +naka.ibaraki.jp +namegata.ibaraki.jp +oarai.ibaraki.jp +ogawa.ibaraki.jp +omitama.ibaraki.jp +ryugasaki.ibaraki.jp +sakai.ibaraki.jp +sakuragawa.ibaraki.jp +shimodate.ibaraki.jp +shimotsuma.ibaraki.jp +shirosato.ibaraki.jp +sowa.ibaraki.jp +suifu.ibaraki.jp +takahagi.ibaraki.jp +tamatsukuri.ibaraki.jp +tokai.ibaraki.jp +tomobe.ibaraki.jp +tone.ibaraki.jp +toride.ibaraki.jp +tsuchiura.ibaraki.jp +tsukuba.ibaraki.jp 
+uchihara.ibaraki.jp +ushiku.ibaraki.jp +yachiyo.ibaraki.jp +yamagata.ibaraki.jp +yawara.ibaraki.jp +yuki.ibaraki.jp +anamizu.ishikawa.jp +hakui.ishikawa.jp +hakusan.ishikawa.jp +kaga.ishikawa.jp +kahoku.ishikawa.jp +kanazawa.ishikawa.jp +kawakita.ishikawa.jp +komatsu.ishikawa.jp +nakanoto.ishikawa.jp +nanao.ishikawa.jp +nomi.ishikawa.jp +nonoichi.ishikawa.jp +noto.ishikawa.jp +shika.ishikawa.jp +suzu.ishikawa.jp +tsubata.ishikawa.jp +tsurugi.ishikawa.jp +uchinada.ishikawa.jp +wajima.ishikawa.jp +fudai.iwate.jp +fujisawa.iwate.jp +hanamaki.iwate.jp +hiraizumi.iwate.jp +hirono.iwate.jp +ichinohe.iwate.jp +ichinoseki.iwate.jp +iwaizumi.iwate.jp +iwate.iwate.jp +joboji.iwate.jp +kamaishi.iwate.jp +kanegasaki.iwate.jp +karumai.iwate.jp +kawai.iwate.jp +kitakami.iwate.jp +kuji.iwate.jp +kunohe.iwate.jp +kuzumaki.iwate.jp +miyako.iwate.jp +mizusawa.iwate.jp +morioka.iwate.jp +ninohe.iwate.jp +noda.iwate.jp +ofunato.iwate.jp +oshu.iwate.jp +otsuchi.iwate.jp +rikuzentakata.iwate.jp +shiwa.iwate.jp +shizukuishi.iwate.jp +sumita.iwate.jp +takizawa.iwate.jp +tanohata.iwate.jp +tono.iwate.jp +yahaba.iwate.jp +yamada.iwate.jp +ayagawa.kagawa.jp +higashikagawa.kagawa.jp +kanonji.kagawa.jp +kotohira.kagawa.jp +manno.kagawa.jp +marugame.kagawa.jp +mitoyo.kagawa.jp +naoshima.kagawa.jp +sanuki.kagawa.jp +tadotsu.kagawa.jp +takamatsu.kagawa.jp +tonosho.kagawa.jp +uchinomi.kagawa.jp +utazu.kagawa.jp +zentsuji.kagawa.jp +akune.kagoshima.jp +amami.kagoshima.jp +hioki.kagoshima.jp +isa.kagoshima.jp +isen.kagoshima.jp +izumi.kagoshima.jp +kagoshima.kagoshima.jp +kanoya.kagoshima.jp +kawanabe.kagoshima.jp +kinko.kagoshima.jp +kouyama.kagoshima.jp +makurazaki.kagoshima.jp +matsumoto.kagoshima.jp +minamitane.kagoshima.jp +nakatane.kagoshima.jp +nishinoomote.kagoshima.jp +satsumasendai.kagoshima.jp +soo.kagoshima.jp +tarumizu.kagoshima.jp +yusui.kagoshima.jp +aikawa.kanagawa.jp +atsugi.kanagawa.jp +ayase.kanagawa.jp +chigasaki.kanagawa.jp +ebina.kanagawa.jp +fujisawa.kanagawa.jp 
+hadano.kanagawa.jp +hakone.kanagawa.jp +hiratsuka.kanagawa.jp +isehara.kanagawa.jp +kaisei.kanagawa.jp +kamakura.kanagawa.jp +kiyokawa.kanagawa.jp +matsuda.kanagawa.jp +minamiashigara.kanagawa.jp +miura.kanagawa.jp +nakai.kanagawa.jp +ninomiya.kanagawa.jp +odawara.kanagawa.jp +oi.kanagawa.jp +oiso.kanagawa.jp +sagamihara.kanagawa.jp +samukawa.kanagawa.jp +tsukui.kanagawa.jp +yamakita.kanagawa.jp +yamato.kanagawa.jp +yokosuka.kanagawa.jp +yugawara.kanagawa.jp +zama.kanagawa.jp +zushi.kanagawa.jp +aki.kochi.jp +geisei.kochi.jp +hidaka.kochi.jp +higashitsuno.kochi.jp +ino.kochi.jp +kagami.kochi.jp +kami.kochi.jp +kitagawa.kochi.jp +kochi.kochi.jp +mihara.kochi.jp +motoyama.kochi.jp +muroto.kochi.jp +nahari.kochi.jp +nakamura.kochi.jp +nankoku.kochi.jp +nishitosa.kochi.jp +niyodogawa.kochi.jp +ochi.kochi.jp +okawa.kochi.jp +otoyo.kochi.jp +otsuki.kochi.jp +sakawa.kochi.jp +sukumo.kochi.jp +susaki.kochi.jp +tosa.kochi.jp +tosashimizu.kochi.jp +toyo.kochi.jp +tsuno.kochi.jp +umaji.kochi.jp +yasuda.kochi.jp +yusuhara.kochi.jp +amakusa.kumamoto.jp +arao.kumamoto.jp +aso.kumamoto.jp +choyo.kumamoto.jp +gyokuto.kumamoto.jp +hitoyoshi.kumamoto.jp +kamiamakusa.kumamoto.jp +kashima.kumamoto.jp +kikuchi.kumamoto.jp +kosa.kumamoto.jp +kumamoto.kumamoto.jp +mashiki.kumamoto.jp +mifune.kumamoto.jp +minamata.kumamoto.jp +minamioguni.kumamoto.jp +nagasu.kumamoto.jp +nishihara.kumamoto.jp +oguni.kumamoto.jp +ozu.kumamoto.jp +sumoto.kumamoto.jp +takamori.kumamoto.jp +uki.kumamoto.jp +uto.kumamoto.jp +yamaga.kumamoto.jp +yamato.kumamoto.jp +yatsushiro.kumamoto.jp +ayabe.kyoto.jp +fukuchiyama.kyoto.jp +higashiyama.kyoto.jp +ide.kyoto.jp +ine.kyoto.jp +joyo.kyoto.jp +kameoka.kyoto.jp +kamo.kyoto.jp +kita.kyoto.jp +kizu.kyoto.jp +kumiyama.kyoto.jp +kyotamba.kyoto.jp +kyotanabe.kyoto.jp +kyotango.kyoto.jp +maizuru.kyoto.jp +minami.kyoto.jp +minamiyamashiro.kyoto.jp +miyazu.kyoto.jp +muko.kyoto.jp +nagaokakyo.kyoto.jp +nakagyo.kyoto.jp +nantan.kyoto.jp +oyamazaki.kyoto.jp +sakyo.kyoto.jp 
+seika.kyoto.jp +tanabe.kyoto.jp +uji.kyoto.jp +ujitawara.kyoto.jp +wazuka.kyoto.jp +yamashina.kyoto.jp +yawata.kyoto.jp +asahi.mie.jp +inabe.mie.jp +ise.mie.jp +kameyama.mie.jp +kawagoe.mie.jp +kiho.mie.jp +kisosaki.mie.jp +kiwa.mie.jp +komono.mie.jp +kumano.mie.jp +kuwana.mie.jp +matsusaka.mie.jp +meiwa.mie.jp +mihama.mie.jp +minamiise.mie.jp +misugi.mie.jp +miyama.mie.jp +nabari.mie.jp +shima.mie.jp +suzuka.mie.jp +tado.mie.jp +taiki.mie.jp +taki.mie.jp +tamaki.mie.jp +toba.mie.jp +tsu.mie.jp +udono.mie.jp +ureshino.mie.jp +watarai.mie.jp +yokkaichi.mie.jp +furukawa.miyagi.jp +higashimatsushima.miyagi.jp +ishinomaki.miyagi.jp +iwanuma.miyagi.jp +kakuda.miyagi.jp +kami.miyagi.jp +kawasaki.miyagi.jp +kesennuma.miyagi.jp +marumori.miyagi.jp +matsushima.miyagi.jp +minamisanriku.miyagi.jp +misato.miyagi.jp +murata.miyagi.jp +natori.miyagi.jp +ogawara.miyagi.jp +ohira.miyagi.jp +onagawa.miyagi.jp +osaki.miyagi.jp +rifu.miyagi.jp +semine.miyagi.jp +shibata.miyagi.jp +shichikashuku.miyagi.jp +shikama.miyagi.jp +shiogama.miyagi.jp +shiroishi.miyagi.jp +tagajo.miyagi.jp +taiwa.miyagi.jp +tome.miyagi.jp +tomiya.miyagi.jp +wakuya.miyagi.jp +watari.miyagi.jp +yamamoto.miyagi.jp +zao.miyagi.jp +aya.miyazaki.jp +ebino.miyazaki.jp +gokase.miyazaki.jp +hyuga.miyazaki.jp +kadogawa.miyazaki.jp +kawaminami.miyazaki.jp +kijo.miyazaki.jp +kitagawa.miyazaki.jp +kitakata.miyazaki.jp +kitaura.miyazaki.jp +kobayashi.miyazaki.jp +kunitomi.miyazaki.jp +kushima.miyazaki.jp +mimata.miyazaki.jp +miyakonojo.miyazaki.jp +miyazaki.miyazaki.jp +morotsuka.miyazaki.jp +nichinan.miyazaki.jp +nishimera.miyazaki.jp +nobeoka.miyazaki.jp +saito.miyazaki.jp +shiiba.miyazaki.jp +shintomi.miyazaki.jp +takaharu.miyazaki.jp +takanabe.miyazaki.jp +takazaki.miyazaki.jp +tsuno.miyazaki.jp +achi.nagano.jp +agematsu.nagano.jp +anan.nagano.jp +aoki.nagano.jp +asahi.nagano.jp +azumino.nagano.jp +chikuhoku.nagano.jp +chikuma.nagano.jp +chino.nagano.jp +fujimi.nagano.jp +hakuba.nagano.jp +hara.nagano.jp 
+hiraya.nagano.jp +iida.nagano.jp +iijima.nagano.jp +iiyama.nagano.jp +iizuna.nagano.jp +ikeda.nagano.jp +ikusaka.nagano.jp +ina.nagano.jp +karuizawa.nagano.jp +kawakami.nagano.jp +kiso.nagano.jp +kisofukushima.nagano.jp +kitaaiki.nagano.jp +komagane.nagano.jp +komoro.nagano.jp +matsukawa.nagano.jp +matsumoto.nagano.jp +miasa.nagano.jp +minamiaiki.nagano.jp +minamimaki.nagano.jp +minamiminowa.nagano.jp +minowa.nagano.jp +miyada.nagano.jp +miyota.nagano.jp +mochizuki.nagano.jp +nagano.nagano.jp +nagawa.nagano.jp +nagiso.nagano.jp +nakagawa.nagano.jp +nakano.nagano.jp +nozawaonsen.nagano.jp +obuse.nagano.jp +ogawa.nagano.jp +okaya.nagano.jp +omachi.nagano.jp +omi.nagano.jp +ookuwa.nagano.jp +ooshika.nagano.jp +otaki.nagano.jp +otari.nagano.jp +sakae.nagano.jp +sakaki.nagano.jp +saku.nagano.jp +sakuho.nagano.jp +shimosuwa.nagano.jp +shinanomachi.nagano.jp +shiojiri.nagano.jp +suwa.nagano.jp +suzaka.nagano.jp +takagi.nagano.jp +takamori.nagano.jp +takayama.nagano.jp +tateshina.nagano.jp +tatsuno.nagano.jp +togakushi.nagano.jp +togura.nagano.jp +tomi.nagano.jp +ueda.nagano.jp +wada.nagano.jp +yamagata.nagano.jp +yamanouchi.nagano.jp +yasaka.nagano.jp +yasuoka.nagano.jp +chijiwa.nagasaki.jp +futsu.nagasaki.jp +goto.nagasaki.jp +hasami.nagasaki.jp +hirado.nagasaki.jp +iki.nagasaki.jp +isahaya.nagasaki.jp +kawatana.nagasaki.jp +kuchinotsu.nagasaki.jp +matsuura.nagasaki.jp +nagasaki.nagasaki.jp +obama.nagasaki.jp +omura.nagasaki.jp +oseto.nagasaki.jp +saikai.nagasaki.jp +sasebo.nagasaki.jp +seihi.nagasaki.jp +shimabara.nagasaki.jp +shinkamigoto.nagasaki.jp +togitsu.nagasaki.jp +tsushima.nagasaki.jp +unzen.nagasaki.jp +ando.nara.jp +gose.nara.jp +heguri.nara.jp +higashiyoshino.nara.jp +ikaruga.nara.jp +ikoma.nara.jp +kamikitayama.nara.jp +kanmaki.nara.jp +kashiba.nara.jp +kashihara.nara.jp +katsuragi.nara.jp +kawai.nara.jp +kawakami.nara.jp +kawanishi.nara.jp +koryo.nara.jp +kurotaki.nara.jp +mitsue.nara.jp +miyake.nara.jp +nara.nara.jp +nosegawa.nara.jp +oji.nara.jp 
+ouda.nara.jp +oyodo.nara.jp +sakurai.nara.jp +sango.nara.jp +shimoichi.nara.jp +shimokitayama.nara.jp +shinjo.nara.jp +soni.nara.jp +takatori.nara.jp +tawaramoto.nara.jp +tenkawa.nara.jp +tenri.nara.jp +uda.nara.jp +yamatokoriyama.nara.jp +yamatotakada.nara.jp +yamazoe.nara.jp +yoshino.nara.jp +aga.niigata.jp +agano.niigata.jp +gosen.niigata.jp +itoigawa.niigata.jp +izumozaki.niigata.jp +joetsu.niigata.jp +kamo.niigata.jp +kariwa.niigata.jp +kashiwazaki.niigata.jp +minamiuonuma.niigata.jp +mitsuke.niigata.jp +muika.niigata.jp +murakami.niigata.jp +myoko.niigata.jp +nagaoka.niigata.jp +niigata.niigata.jp +ojiya.niigata.jp +omi.niigata.jp +sado.niigata.jp +sanjo.niigata.jp +seiro.niigata.jp +seirou.niigata.jp +sekikawa.niigata.jp +shibata.niigata.jp +tagami.niigata.jp +tainai.niigata.jp +tochio.niigata.jp +tokamachi.niigata.jp +tsubame.niigata.jp +tsunan.niigata.jp +uonuma.niigata.jp +yahiko.niigata.jp +yoita.niigata.jp +yuzawa.niigata.jp +beppu.oita.jp +bungoono.oita.jp +bungotakada.oita.jp +hasama.oita.jp +hiji.oita.jp +himeshima.oita.jp +hita.oita.jp +kamitsue.oita.jp +kokonoe.oita.jp +kuju.oita.jp +kunisaki.oita.jp +kusu.oita.jp +oita.oita.jp +saiki.oita.jp +taketa.oita.jp +tsukumi.oita.jp +usa.oita.jp +usuki.oita.jp +yufu.oita.jp +akaiwa.okayama.jp +asakuchi.okayama.jp +bizen.okayama.jp +hayashima.okayama.jp +ibara.okayama.jp +kagamino.okayama.jp +kasaoka.okayama.jp +kibichuo.okayama.jp +kumenan.okayama.jp +kurashiki.okayama.jp +maniwa.okayama.jp +misaki.okayama.jp +nagi.okayama.jp +niimi.okayama.jp +nishiawakura.okayama.jp +okayama.okayama.jp +satosho.okayama.jp +setouchi.okayama.jp +shinjo.okayama.jp +shoo.okayama.jp +soja.okayama.jp +takahashi.okayama.jp +tamano.okayama.jp +tsuyama.okayama.jp +wake.okayama.jp +yakage.okayama.jp +aguni.okinawa.jp +ginowan.okinawa.jp +ginoza.okinawa.jp +gushikami.okinawa.jp +haebaru.okinawa.jp +higashi.okinawa.jp +hirara.okinawa.jp +iheya.okinawa.jp +ishigaki.okinawa.jp +ishikawa.okinawa.jp +itoman.okinawa.jp +izena.okinawa.jp 
+kadena.okinawa.jp +kin.okinawa.jp +kitadaito.okinawa.jp +kitanakagusuku.okinawa.jp +kumejima.okinawa.jp +kunigami.okinawa.jp +minamidaito.okinawa.jp +motobu.okinawa.jp +nago.okinawa.jp +naha.okinawa.jp +nakagusuku.okinawa.jp +nakijin.okinawa.jp +nanjo.okinawa.jp +nishihara.okinawa.jp +ogimi.okinawa.jp +okinawa.okinawa.jp +onna.okinawa.jp +shimoji.okinawa.jp +taketomi.okinawa.jp +tarama.okinawa.jp +tokashiki.okinawa.jp +tomigusuku.okinawa.jp +tonaki.okinawa.jp +urasoe.okinawa.jp +uruma.okinawa.jp +yaese.okinawa.jp +yomitan.okinawa.jp +yonabaru.okinawa.jp +yonaguni.okinawa.jp +zamami.okinawa.jp +abeno.osaka.jp +chihayaakasaka.osaka.jp +chuo.osaka.jp +daito.osaka.jp +fujiidera.osaka.jp +habikino.osaka.jp +hannan.osaka.jp +higashiosaka.osaka.jp +higashisumiyoshi.osaka.jp +higashiyodogawa.osaka.jp +hirakata.osaka.jp +ibaraki.osaka.jp +ikeda.osaka.jp +izumi.osaka.jp +izumiotsu.osaka.jp +izumisano.osaka.jp +kadoma.osaka.jp +kaizuka.osaka.jp +kanan.osaka.jp +kashiwara.osaka.jp +katano.osaka.jp +kawachinagano.osaka.jp +kishiwada.osaka.jp +kita.osaka.jp +kumatori.osaka.jp +matsubara.osaka.jp +minato.osaka.jp +minoh.osaka.jp +misaki.osaka.jp +moriguchi.osaka.jp +neyagawa.osaka.jp +nishi.osaka.jp +nose.osaka.jp +osakasayama.osaka.jp +sakai.osaka.jp +sayama.osaka.jp +sennan.osaka.jp +settsu.osaka.jp +shijonawate.osaka.jp +shimamoto.osaka.jp +suita.osaka.jp +tadaoka.osaka.jp +taishi.osaka.jp +tajiri.osaka.jp +takaishi.osaka.jp +takatsuki.osaka.jp +tondabayashi.osaka.jp +toyonaka.osaka.jp +toyono.osaka.jp +yao.osaka.jp +ariake.saga.jp +arita.saga.jp +fukudomi.saga.jp +genkai.saga.jp +hamatama.saga.jp +hizen.saga.jp +imari.saga.jp +kamimine.saga.jp +kanzaki.saga.jp +karatsu.saga.jp +kashima.saga.jp +kitagata.saga.jp +kitahata.saga.jp +kiyama.saga.jp +kouhoku.saga.jp +kyuragi.saga.jp +nishiarita.saga.jp +ogi.saga.jp +omachi.saga.jp +ouchi.saga.jp +saga.saga.jp +shiroishi.saga.jp +taku.saga.jp +tara.saga.jp +tosu.saga.jp +yoshinogari.saga.jp +arakawa.saitama.jp +asaka.saitama.jp 
+chichibu.saitama.jp +fujimi.saitama.jp +fujimino.saitama.jp +fukaya.saitama.jp +hanno.saitama.jp +hanyu.saitama.jp +hasuda.saitama.jp +hatogaya.saitama.jp +hatoyama.saitama.jp +hidaka.saitama.jp +higashichichibu.saitama.jp +higashimatsuyama.saitama.jp +honjo.saitama.jp +ina.saitama.jp +iruma.saitama.jp +iwatsuki.saitama.jp +kamiizumi.saitama.jp +kamikawa.saitama.jp +kamisato.saitama.jp +kasukabe.saitama.jp +kawagoe.saitama.jp +kawaguchi.saitama.jp +kawajima.saitama.jp +kazo.saitama.jp +kitamoto.saitama.jp +koshigaya.saitama.jp +kounosu.saitama.jp +kuki.saitama.jp +kumagaya.saitama.jp +matsubushi.saitama.jp +minano.saitama.jp +misato.saitama.jp +miyashiro.saitama.jp +miyoshi.saitama.jp +moroyama.saitama.jp +nagatoro.saitama.jp +namegawa.saitama.jp +niiza.saitama.jp +ogano.saitama.jp +ogawa.saitama.jp +ogose.saitama.jp +okegawa.saitama.jp +omiya.saitama.jp +otaki.saitama.jp +ranzan.saitama.jp +ryokami.saitama.jp +saitama.saitama.jp +sakado.saitama.jp +satte.saitama.jp +sayama.saitama.jp +shiki.saitama.jp +shiraoka.saitama.jp +soka.saitama.jp +sugito.saitama.jp +toda.saitama.jp +tokigawa.saitama.jp +tokorozawa.saitama.jp +tsurugashima.saitama.jp +urawa.saitama.jp +warabi.saitama.jp +yashio.saitama.jp +yokoze.saitama.jp +yono.saitama.jp +yorii.saitama.jp +yoshida.saitama.jp +yoshikawa.saitama.jp +yoshimi.saitama.jp +aisho.shiga.jp +gamo.shiga.jp +higashiomi.shiga.jp +hikone.shiga.jp +koka.shiga.jp +konan.shiga.jp +kosei.shiga.jp +koto.shiga.jp +kusatsu.shiga.jp +maibara.shiga.jp +moriyama.shiga.jp +nagahama.shiga.jp +nishiazai.shiga.jp +notogawa.shiga.jp +omihachiman.shiga.jp +otsu.shiga.jp +ritto.shiga.jp +ryuoh.shiga.jp +takashima.shiga.jp +takatsuki.shiga.jp +torahime.shiga.jp +toyosato.shiga.jp +yasu.shiga.jp +akagi.shimane.jp +ama.shimane.jp +gotsu.shimane.jp +hamada.shimane.jp +higashiizumo.shimane.jp +hikawa.shimane.jp +hikimi.shimane.jp +izumo.shimane.jp +kakinoki.shimane.jp +masuda.shimane.jp +matsue.shimane.jp +misato.shimane.jp +nishinoshima.shimane.jp 
+ohda.shimane.jp +okinoshima.shimane.jp +okuizumo.shimane.jp +shimane.shimane.jp +tamayu.shimane.jp +tsuwano.shimane.jp +unnan.shimane.jp +yakumo.shimane.jp +yasugi.shimane.jp +yatsuka.shimane.jp +arai.shizuoka.jp +atami.shizuoka.jp +fuji.shizuoka.jp +fujieda.shizuoka.jp +fujikawa.shizuoka.jp +fujinomiya.shizuoka.jp +fukuroi.shizuoka.jp +gotemba.shizuoka.jp +haibara.shizuoka.jp +hamamatsu.shizuoka.jp +higashiizu.shizuoka.jp +ito.shizuoka.jp +iwata.shizuoka.jp +izu.shizuoka.jp +izunokuni.shizuoka.jp +kakegawa.shizuoka.jp +kannami.shizuoka.jp +kawanehon.shizuoka.jp +kawazu.shizuoka.jp +kikugawa.shizuoka.jp +kosai.shizuoka.jp +makinohara.shizuoka.jp +matsuzaki.shizuoka.jp +minamiizu.shizuoka.jp +mishima.shizuoka.jp +morimachi.shizuoka.jp +nishiizu.shizuoka.jp +numazu.shizuoka.jp +omaezaki.shizuoka.jp +shimada.shizuoka.jp +shimizu.shizuoka.jp +shimoda.shizuoka.jp +shizuoka.shizuoka.jp +susono.shizuoka.jp +yaizu.shizuoka.jp +yoshida.shizuoka.jp +ashikaga.tochigi.jp +bato.tochigi.jp +haga.tochigi.jp +ichikai.tochigi.jp +iwafune.tochigi.jp +kaminokawa.tochigi.jp +kanuma.tochigi.jp +karasuyama.tochigi.jp +kuroiso.tochigi.jp +mashiko.tochigi.jp +mibu.tochigi.jp +moka.tochigi.jp +motegi.tochigi.jp +nasu.tochigi.jp +nasushiobara.tochigi.jp +nikko.tochigi.jp +nishikata.tochigi.jp +nogi.tochigi.jp +ohira.tochigi.jp +ohtawara.tochigi.jp +oyama.tochigi.jp +sakura.tochigi.jp +sano.tochigi.jp +shimotsuke.tochigi.jp +shioya.tochigi.jp +takanezawa.tochigi.jp +tochigi.tochigi.jp +tsuga.tochigi.jp +ujiie.tochigi.jp +utsunomiya.tochigi.jp +yaita.tochigi.jp +aizumi.tokushima.jp +anan.tokushima.jp +ichiba.tokushima.jp +itano.tokushima.jp +kainan.tokushima.jp +komatsushima.tokushima.jp +matsushige.tokushima.jp +mima.tokushima.jp +minami.tokushima.jp +miyoshi.tokushima.jp +mugi.tokushima.jp +nakagawa.tokushima.jp +naruto.tokushima.jp +sanagochi.tokushima.jp +shishikui.tokushima.jp +tokushima.tokushima.jp +wajiki.tokushima.jp +adachi.tokyo.jp +akiruno.tokyo.jp +akishima.tokyo.jp 
+aogashima.tokyo.jp +arakawa.tokyo.jp +bunkyo.tokyo.jp +chiyoda.tokyo.jp +chofu.tokyo.jp +chuo.tokyo.jp +edogawa.tokyo.jp +fuchu.tokyo.jp +fussa.tokyo.jp +hachijo.tokyo.jp +hachioji.tokyo.jp +hamura.tokyo.jp +higashikurume.tokyo.jp +higashimurayama.tokyo.jp +higashiyamato.tokyo.jp +hino.tokyo.jp +hinode.tokyo.jp +hinohara.tokyo.jp +inagi.tokyo.jp +itabashi.tokyo.jp +katsushika.tokyo.jp +kita.tokyo.jp +kiyose.tokyo.jp +kodaira.tokyo.jp +koganei.tokyo.jp +kokubunji.tokyo.jp +komae.tokyo.jp +koto.tokyo.jp +kouzushima.tokyo.jp +kunitachi.tokyo.jp +machida.tokyo.jp +meguro.tokyo.jp +minato.tokyo.jp +mitaka.tokyo.jp +mizuho.tokyo.jp +musashimurayama.tokyo.jp +musashino.tokyo.jp +nakano.tokyo.jp +nerima.tokyo.jp +ogasawara.tokyo.jp +okutama.tokyo.jp +ome.tokyo.jp +oshima.tokyo.jp +ota.tokyo.jp +setagaya.tokyo.jp +shibuya.tokyo.jp +shinagawa.tokyo.jp +shinjuku.tokyo.jp +suginami.tokyo.jp +sumida.tokyo.jp +tachikawa.tokyo.jp +taito.tokyo.jp +tama.tokyo.jp +toshima.tokyo.jp +chizu.tottori.jp +hino.tottori.jp +kawahara.tottori.jp +koge.tottori.jp +kotoura.tottori.jp +misasa.tottori.jp +nanbu.tottori.jp +nichinan.tottori.jp +sakaiminato.tottori.jp +tottori.tottori.jp +wakasa.tottori.jp +yazu.tottori.jp +yonago.tottori.jp +asahi.toyama.jp +fuchu.toyama.jp +fukumitsu.toyama.jp +funahashi.toyama.jp +himi.toyama.jp +imizu.toyama.jp +inami.toyama.jp +johana.toyama.jp +kamiichi.toyama.jp +kurobe.toyama.jp +nakaniikawa.toyama.jp +namerikawa.toyama.jp +nanto.toyama.jp +nyuzen.toyama.jp +oyabe.toyama.jp +taira.toyama.jp +takaoka.toyama.jp +tateyama.toyama.jp +toga.toyama.jp +tonami.toyama.jp +toyama.toyama.jp +unazuki.toyama.jp +uozu.toyama.jp +yamada.toyama.jp +arida.wakayama.jp +aridagawa.wakayama.jp +gobo.wakayama.jp +hashimoto.wakayama.jp +hidaka.wakayama.jp +hirogawa.wakayama.jp +inami.wakayama.jp +iwade.wakayama.jp +kainan.wakayama.jp +kamitonda.wakayama.jp +katsuragi.wakayama.jp +kimino.wakayama.jp +kinokawa.wakayama.jp +kitayama.wakayama.jp +koya.wakayama.jp +koza.wakayama.jp 
+kozagawa.wakayama.jp +kudoyama.wakayama.jp +kushimoto.wakayama.jp +mihama.wakayama.jp +misato.wakayama.jp +nachikatsuura.wakayama.jp +shingu.wakayama.jp +shirahama.wakayama.jp +taiji.wakayama.jp +tanabe.wakayama.jp +wakayama.wakayama.jp +yuasa.wakayama.jp +yura.wakayama.jp +asahi.yamagata.jp +funagata.yamagata.jp +higashine.yamagata.jp +iide.yamagata.jp +kahoku.yamagata.jp +kaminoyama.yamagata.jp +kaneyama.yamagata.jp +kawanishi.yamagata.jp +mamurogawa.yamagata.jp +mikawa.yamagata.jp +murayama.yamagata.jp +nagai.yamagata.jp +nakayama.yamagata.jp +nanyo.yamagata.jp +nishikawa.yamagata.jp +obanazawa.yamagata.jp +oe.yamagata.jp +oguni.yamagata.jp +ohkura.yamagata.jp +oishida.yamagata.jp +sagae.yamagata.jp +sakata.yamagata.jp +sakegawa.yamagata.jp +shinjo.yamagata.jp +shirataka.yamagata.jp +shonai.yamagata.jp +takahata.yamagata.jp +tendo.yamagata.jp +tozawa.yamagata.jp +tsuruoka.yamagata.jp +yamagata.yamagata.jp +yamanobe.yamagata.jp +yonezawa.yamagata.jp +yuza.yamagata.jp +abu.yamaguchi.jp +hagi.yamaguchi.jp +hikari.yamaguchi.jp +hofu.yamaguchi.jp +iwakuni.yamaguchi.jp +kudamatsu.yamaguchi.jp +mitou.yamaguchi.jp +nagato.yamaguchi.jp +oshima.yamaguchi.jp +shimonoseki.yamaguchi.jp +shunan.yamaguchi.jp +tabuse.yamaguchi.jp +tokuyama.yamaguchi.jp +toyota.yamaguchi.jp +ube.yamaguchi.jp +yuu.yamaguchi.jp +chuo.yamanashi.jp +doshi.yamanashi.jp +fuefuki.yamanashi.jp +fujikawa.yamanashi.jp +fujikawaguchiko.yamanashi.jp +fujiyoshida.yamanashi.jp +hayakawa.yamanashi.jp +hokuto.yamanashi.jp +ichikawamisato.yamanashi.jp +kai.yamanashi.jp +kofu.yamanashi.jp +koshu.yamanashi.jp +kosuge.yamanashi.jp +minami-alps.yamanashi.jp +minobu.yamanashi.jp +nakamichi.yamanashi.jp +nanbu.yamanashi.jp +narusawa.yamanashi.jp +nirasaki.yamanashi.jp +nishikatsura.yamanashi.jp +oshino.yamanashi.jp +otsuki.yamanashi.jp +showa.yamanashi.jp +tabayama.yamanashi.jp +tsuru.yamanashi.jp +uenohara.yamanashi.jp +yamanakako.yamanashi.jp +yamanashi.yamanashi.jp + +// ke : 
http://www.kenic.or.ke/index.php?option=com_content&task=view&id=117&Itemid=145 +*.ke + +// kg : http://www.domain.kg/dmn_n.html +kg +org.kg +net.kg +com.kg +edu.kg +gov.kg +mil.kg + +// kh : http://www.mptc.gov.kh/dns_registration.htm +*.kh + +// ki : http://www.ki/dns/index.html +ki +edu.ki +biz.ki +net.ki +org.ki +gov.ki +info.ki +com.ki + +// km : http://en.wikipedia.org/wiki/.km +// http://www.domaine.km/documents/charte.doc +km +org.km +nom.km +gov.km +prd.km +tm.km +edu.km +mil.km +ass.km +com.km +// These are only mentioned as proposed suggestions at domaine.km, but +// http://en.wikipedia.org/wiki/.km says they're available for registration: +coop.km +asso.km +presse.km +medecin.km +notaires.km +pharmaciens.km +veterinaire.km +gouv.km + +// kn : http://en.wikipedia.org/wiki/.kn +// http://www.dot.kn/domainRules.html +kn +net.kn +org.kn +edu.kn +gov.kn + +// kp : http://www.kcce.kp/en_index.php +com.kp +edu.kp +gov.kp +org.kp +rep.kp +tra.kp + +// kr : http://en.wikipedia.org/wiki/.kr +// see also: http://domain.nida.or.kr/eng/registration.jsp +kr +ac.kr +co.kr +es.kr +go.kr +hs.kr +kg.kr +mil.kr +ms.kr +ne.kr +or.kr +pe.kr +re.kr +sc.kr +// kr geographical names +busan.kr +chungbuk.kr +chungnam.kr +daegu.kr +daejeon.kr +gangwon.kr +gwangju.kr +gyeongbuk.kr +gyeonggi.kr +gyeongnam.kr +incheon.kr +jeju.kr +jeonbuk.kr +jeonnam.kr +seoul.kr +ulsan.kr + +// kw : http://en.wikipedia.org/wiki/.kw +*.kw + +// ky : http://www.icta.ky/da_ky_reg_dom.php +// Confirmed by registry 2008-06-17 +ky +edu.ky +gov.ky +com.ky +org.ky +net.ky + +// kz : http://en.wikipedia.org/wiki/.kz +// see also: http://www.nic.kz/rules/index.jsp +kz +org.kz +edu.kz +net.kz +gov.kz +mil.kz +com.kz + +// la : http://en.wikipedia.org/wiki/.la +// Submitted by registry 2008-06-10 +la +int.la +net.la +info.la +edu.la +gov.la +per.la +com.la +org.la + +// lb : http://en.wikipedia.org/wiki/.lb +// Submitted by registry 2008-06-17 +com.lb +edu.lb +gov.lb +net.lb +org.lb + +// lc : 
http://en.wikipedia.org/wiki/.lc +// see also: http://www.nic.lc/rules.htm +lc +com.lc +net.lc +co.lc +org.lc +edu.lc +gov.lc + +// li : http://en.wikipedia.org/wiki/.li +li + +// lk : http://www.nic.lk/seclevpr.html +lk +gov.lk +sch.lk +net.lk +int.lk +com.lk +org.lk +edu.lk +ngo.lk +soc.lk +web.lk +ltd.lk +assn.lk +grp.lk +hotel.lk + +// lr : http://psg.com/dns/lr/lr.txt +// Submitted by registry 2008-06-17 +com.lr +edu.lr +gov.lr +org.lr +net.lr + +// ls : http://en.wikipedia.org/wiki/.ls +ls +co.ls +org.ls + +// lt : http://en.wikipedia.org/wiki/.lt +lt +// gov.lt : http://www.gov.lt/index_en.php +gov.lt + +// lu : http://www.dns.lu/en/ +lu + +// lv : http://www.nic.lv/DNS/En/generic.php +lv +com.lv +edu.lv +gov.lv +org.lv +mil.lv +id.lv +net.lv +asn.lv +conf.lv + +// ly : http://www.nic.ly/regulations.php +ly +com.ly +net.ly +gov.ly +plc.ly +edu.ly +sch.ly +med.ly +org.ly +id.ly + +// ma : http://en.wikipedia.org/wiki/.ma +// http://www.anrt.ma/fr/admin/download/upload/file_fr782.pdf +ma +co.ma +net.ma +gov.ma +org.ma +ac.ma +press.ma + +// mc : http://www.nic.mc/ +mc +tm.mc +asso.mc + +// md : http://en.wikipedia.org/wiki/.md +md + +// me : http://en.wikipedia.org/wiki/.me +me +co.me +net.me +org.me +edu.me +ac.me +gov.me +its.me +priv.me + +// mg : http://www.nic.mg/tarif.htm +mg +org.mg +nom.mg +gov.mg +prd.mg +tm.mg +edu.mg +mil.mg +com.mg + +// mh : http://en.wikipedia.org/wiki/.mh +mh + +// mil : http://en.wikipedia.org/wiki/.mil +mil + +// mk : http://en.wikipedia.org/wiki/.mk +// see also: http://dns.marnet.net.mk/postapka.php +mk +com.mk +org.mk +net.mk +edu.mk +gov.mk +inf.mk +name.mk + +// ml : http://www.gobin.info/domainname/ml-template.doc +// see also: http://en.wikipedia.org/wiki/.ml +ml +com.ml +edu.ml +gouv.ml +gov.ml +net.ml +org.ml +presse.ml + +// mm : http://en.wikipedia.org/wiki/.mm +*.mm + +// mn : http://en.wikipedia.org/wiki/.mn +mn +gov.mn +edu.mn +org.mn + +// mo : http://www.monic.net.mo/ +mo +com.mo +net.mo +org.mo +edu.mo +gov.mo 
+ +// mobi : http://en.wikipedia.org/wiki/.mobi +mobi + +// mp : http://www.dot.mp/ +// Confirmed by registry 2008-06-17 +mp + +// mq : http://en.wikipedia.org/wiki/.mq +mq + +// mr : http://en.wikipedia.org/wiki/.mr +mr +gov.mr + +// ms : http://en.wikipedia.org/wiki/.ms +ms + +// mt : https://www.nic.org.mt/dotmt/ +*.mt + +// mu : http://en.wikipedia.org/wiki/.mu +mu +com.mu +net.mu +org.mu +gov.mu +ac.mu +co.mu +or.mu + +// museum : http://about.museum/naming/ +// http://index.museum/ +museum +academy.museum +agriculture.museum +air.museum +airguard.museum +alabama.museum +alaska.museum +amber.museum +ambulance.museum +american.museum +americana.museum +americanantiques.museum +americanart.museum +amsterdam.museum +and.museum +annefrank.museum +anthro.museum +anthropology.museum +antiques.museum +aquarium.museum +arboretum.museum +archaeological.museum +archaeology.museum +architecture.museum +art.museum +artanddesign.museum +artcenter.museum +artdeco.museum +arteducation.museum +artgallery.museum +arts.museum +artsandcrafts.museum +asmatart.museum +assassination.museum +assisi.museum +association.museum +astronomy.museum +atlanta.museum +austin.museum +australia.museum +automotive.museum +aviation.museum +axis.museum +badajoz.museum +baghdad.museum +bahn.museum +bale.museum +baltimore.museum +barcelona.museum +baseball.museum +basel.museum +baths.museum +bauern.museum +beauxarts.museum +beeldengeluid.museum +bellevue.museum +bergbau.museum +berkeley.museum +berlin.museum +bern.museum +bible.museum +bilbao.museum +bill.museum +birdart.museum +birthplace.museum +bonn.museum +boston.museum +botanical.museum +botanicalgarden.museum +botanicgarden.museum +botany.museum +brandywinevalley.museum +brasil.museum +bristol.museum +british.museum +britishcolumbia.museum +broadcast.museum +brunel.museum +brussel.museum +brussels.museum +bruxelles.museum +building.museum +burghof.museum +bus.museum +bushey.museum +cadaques.museum +california.museum +cambridge.museum 
+can.museum +canada.museum +capebreton.museum +carrier.museum +cartoonart.museum +casadelamoneda.museum +castle.museum +castres.museum +celtic.museum +center.museum +chattanooga.museum +cheltenham.museum +chesapeakebay.museum +chicago.museum +children.museum +childrens.museum +childrensgarden.museum +chiropractic.museum +chocolate.museum +christiansburg.museum +cincinnati.museum +cinema.museum +circus.museum +civilisation.museum +civilization.museum +civilwar.museum +clinton.museum +clock.museum +coal.museum +coastaldefence.museum +cody.museum +coldwar.museum +collection.museum +colonialwilliamsburg.museum +coloradoplateau.museum +columbia.museum +columbus.museum +communication.museum +communications.museum +community.museum +computer.museum +computerhistory.museum +comunicações.museum +contemporary.museum +contemporaryart.museum +convent.museum +copenhagen.museum +corporation.museum +correios-e-telecomunicações.museum +corvette.museum +costume.museum +countryestate.museum +county.museum +crafts.museum +cranbrook.museum +creation.museum +cultural.museum +culturalcenter.museum +culture.museum +cyber.museum +cymru.museum +dali.museum +dallas.museum +database.museum +ddr.museum +decorativearts.museum +delaware.museum +delmenhorst.museum +denmark.museum +depot.museum +design.museum +detroit.museum +dinosaur.museum +discovery.museum +dolls.museum +donostia.museum +durham.museum +eastafrica.museum +eastcoast.museum +education.museum +educational.museum +egyptian.museum +eisenbahn.museum +elburg.museum +elvendrell.museum +embroidery.museum +encyclopedic.museum +england.museum +entomology.museum +environment.museum +environmentalconservation.museum +epilepsy.museum +essex.museum +estate.museum +ethnology.museum +exeter.museum +exhibition.museum +family.museum +farm.museum +farmequipment.museum +farmers.museum +farmstead.museum +field.museum +figueres.museum +filatelia.museum +film.museum +fineart.museum +finearts.museum +finland.museum +flanders.museum +florida.museum 
+force.museum +fortmissoula.museum +fortworth.museum +foundation.museum +francaise.museum +frankfurt.museum +franziskaner.museum +freemasonry.museum +freiburg.museum +fribourg.museum +frog.museum +fundacio.museum +furniture.museum +gallery.museum +garden.museum +gateway.museum +geelvinck.museum +gemological.museum +geology.museum +georgia.museum +giessen.museum +glas.museum +glass.museum +gorge.museum +grandrapids.museum +graz.museum +guernsey.museum +halloffame.museum +hamburg.museum +handson.museum +harvestcelebration.museum +hawaii.museum +health.museum +heimatunduhren.museum +hellas.museum +helsinki.museum +hembygdsforbund.museum +heritage.museum +histoire.museum +historical.museum +historicalsociety.museum +historichouses.museum +historisch.museum +historisches.museum +history.museum +historyofscience.museum +horology.museum +house.museum +humanities.museum +illustration.museum +imageandsound.museum +indian.museum +indiana.museum +indianapolis.museum +indianmarket.museum +intelligence.museum +interactive.museum +iraq.museum +iron.museum +isleofman.museum +jamison.museum +jefferson.museum +jerusalem.museum +jewelry.museum +jewish.museum +jewishart.museum +jfk.museum +journalism.museum +judaica.museum +judygarland.museum +juedisches.museum +juif.museum +karate.museum +karikatur.museum +kids.museum +koebenhavn.museum +koeln.museum +kunst.museum +kunstsammlung.museum +kunstunddesign.museum +labor.museum +labour.museum +lajolla.museum +lancashire.museum +landes.museum +lans.museum +läns.museum +larsson.museum +lewismiller.museum +lincoln.museum +linz.museum +living.museum +livinghistory.museum +localhistory.museum +london.museum +losangeles.museum +louvre.museum +loyalist.museum +lucerne.museum +luxembourg.museum +luzern.museum +mad.museum +madrid.museum +mallorca.museum +manchester.museum +mansion.museum +mansions.museum +manx.museum +marburg.museum +maritime.museum +maritimo.museum +maryland.museum +marylhurst.museum +media.museum +medical.museum 
+medizinhistorisches.museum +meeres.museum +memorial.museum +mesaverde.museum +michigan.museum +midatlantic.museum +military.museum +mill.museum +miners.museum +mining.museum +minnesota.museum +missile.museum +missoula.museum +modern.museum +moma.museum +money.museum +monmouth.museum +monticello.museum +montreal.museum +moscow.museum +motorcycle.museum +muenchen.museum +muenster.museum +mulhouse.museum +muncie.museum +museet.museum +museumcenter.museum +museumvereniging.museum +music.museum +national.museum +nationalfirearms.museum +nationalheritage.museum +nativeamerican.museum +naturalhistory.museum +naturalhistorymuseum.museum +naturalsciences.museum +nature.museum +naturhistorisches.museum +natuurwetenschappen.museum +naumburg.museum +naval.museum +nebraska.museum +neues.museum +newhampshire.museum +newjersey.museum +newmexico.museum +newport.museum +newspaper.museum +newyork.museum +niepce.museum +norfolk.museum +north.museum +nrw.museum +nuernberg.museum +nuremberg.museum +nyc.museum +nyny.museum +oceanographic.museum +oceanographique.museum +omaha.museum +online.museum +ontario.museum +openair.museum +oregon.museum +oregontrail.museum +otago.museum +oxford.museum +pacific.museum +paderborn.museum +palace.museum +paleo.museum +palmsprings.museum +panama.museum +paris.museum +pasadena.museum +pharmacy.museum +philadelphia.museum +philadelphiaarea.museum +philately.museum +phoenix.museum +photography.museum +pilots.museum +pittsburgh.museum +planetarium.museum +plantation.museum +plants.museum +plaza.museum +portal.museum +portland.museum +portlligat.museum +posts-and-telecommunications.museum +preservation.museum +presidio.museum +press.museum +project.museum +public.museum +pubol.museum +quebec.museum +railroad.museum +railway.museum +research.museum +resistance.museum +riodejaneiro.museum +rochester.museum +rockart.museum +roma.museum +russia.museum +saintlouis.museum +salem.museum +salvadordali.museum +salzburg.museum +sandiego.museum +sanfrancisco.museum 
+santabarbara.museum +santacruz.museum +santafe.museum +saskatchewan.museum +satx.museum +savannahga.museum +schlesisches.museum +schoenbrunn.museum +schokoladen.museum +school.museum +schweiz.museum +science.museum +scienceandhistory.museum +scienceandindustry.museum +sciencecenter.museum +sciencecenters.museum +science-fiction.museum +sciencehistory.museum +sciences.museum +sciencesnaturelles.museum +scotland.museum +seaport.museum +settlement.museum +settlers.museum +shell.museum +sherbrooke.museum +sibenik.museum +silk.museum +ski.museum +skole.museum +society.museum +sologne.museum +soundandvision.museum +southcarolina.museum +southwest.museum +space.museum +spy.museum +square.museum +stadt.museum +stalbans.museum +starnberg.museum +state.museum +stateofdelaware.museum +station.museum +steam.museum +steiermark.museum +stjohn.museum +stockholm.museum +stpetersburg.museum +stuttgart.museum +suisse.museum +surgeonshall.museum +surrey.museum +svizzera.museum +sweden.museum +sydney.museum +tank.museum +tcm.museum +technology.museum +telekommunikation.museum +television.museum +texas.museum +textile.museum +theater.museum +time.museum +timekeeping.museum +topology.museum +torino.museum +touch.museum +town.museum +transport.museum +tree.museum +trolley.museum +trust.museum +trustee.museum +uhren.museum +ulm.museum +undersea.museum +university.museum +usa.museum +usantiques.museum +usarts.museum +uscountryestate.museum +usculture.museum +usdecorativearts.museum +usgarden.museum +ushistory.museum +ushuaia.museum +uslivinghistory.museum +utah.museum +uvic.museum +valley.museum +vantaa.museum +versailles.museum +viking.museum +village.museum +virginia.museum +virtual.museum +virtuel.museum +vlaanderen.museum +volkenkunde.museum +wales.museum +wallonie.museum +war.museum +washingtondc.museum +watchandclock.museum +watch-and-clock.museum +western.museum +westfalen.museum +whaling.museum +wildlife.museum +williamsburg.museum +windmill.museum +workshop.museum +york.museum 
+yorkshire.museum +yosemite.museum +youth.museum +zoological.museum +zoology.museum +ירושלים.museum +иком.museum + +// mv : http://en.wikipedia.org/wiki/.mv +// "mv" included because, contra Wikipedia, google.mv exists. +mv +aero.mv +biz.mv +com.mv +coop.mv +edu.mv +gov.mv +info.mv +int.mv +mil.mv +museum.mv +name.mv +net.mv +org.mv +pro.mv + +// mw : http://www.registrar.mw/ +mw +ac.mw +biz.mw +co.mw +com.mw +coop.mw +edu.mw +gov.mw +int.mw +museum.mw +net.mw +org.mw + +// mx : http://www.nic.mx/ +// Submitted by registry 2008-06-19 +mx +com.mx +org.mx +gob.mx +edu.mx +net.mx + +// my : http://www.mynic.net.my/ +my +com.my +net.my +org.my +gov.my +edu.my +mil.my +name.my + +// mz : http://www.gobin.info/domainname/mz-template.doc +*.mz +!teledata.mz + +// na : http://www.na-nic.com.na/ +// http://www.info.na/domain/ +na +info.na +pro.na +name.na +school.na +or.na +dr.na +us.na +mx.na +ca.na +in.na +cc.na +tv.na +ws.na +mobi.na +co.na +com.na +org.na + +// name : has 2nd-level tlds, but there's no list of them +name + +// nc : http://www.cctld.nc/ +nc +asso.nc + +// ne : http://en.wikipedia.org/wiki/.ne +ne + +// net : http://en.wikipedia.org/wiki/.net +net + +// nf : http://en.wikipedia.org/wiki/.nf +nf +com.nf +net.nf +per.nf +rec.nf +web.nf +arts.nf +firm.nf +info.nf +other.nf +store.nf + +// ng : http://psg.com/dns/ng/ +// Submitted by registry 2008-06-17 +ac.ng +com.ng +edu.ng +gov.ng +net.ng +org.ng + +// ni : http://www.nic.ni/dominios.htm +*.ni + +// nl : http://www.domain-registry.nl/ace.php/c,728,122,,,,Home.html +// Confirmed by registry (with technical +// reservations) 2008-06-08 +nl + +// BV.nl will be a registry for dutch BV's (besloten vennootschap) +bv.nl + +// no : http://www.norid.no/regelverk/index.en.html +// The Norwegian registry has declined to notify us of updates. The web pages +// referenced below are the official source of the data. 
There is also an +// announce mailing list: +// https://postlister.uninett.no/sympa/info/norid-diskusjon +no +// Norid generic domains : http://www.norid.no/regelverk/vedlegg-c.en.html +fhs.no +vgs.no +fylkesbibl.no +folkebibl.no +museum.no +idrett.no +priv.no +// Non-Norid generic domains : http://www.norid.no/regelverk/vedlegg-d.en.html +mil.no +stat.no +dep.no +kommune.no +herad.no +// no geographical names : http://www.norid.no/regelverk/vedlegg-b.en.html +// counties +aa.no +ah.no +bu.no +fm.no +hl.no +hm.no +jan-mayen.no +mr.no +nl.no +nt.no +of.no +ol.no +oslo.no +rl.no +sf.no +st.no +svalbard.no +tm.no +tr.no +va.no +vf.no +// primary and lower secondary schools per county +gs.aa.no +gs.ah.no +gs.bu.no +gs.fm.no +gs.hl.no +gs.hm.no +gs.jan-mayen.no +gs.mr.no +gs.nl.no +gs.nt.no +gs.of.no +gs.ol.no +gs.oslo.no +gs.rl.no +gs.sf.no +gs.st.no +gs.svalbard.no +gs.tm.no +gs.tr.no +gs.va.no +gs.vf.no +// cities +akrehamn.no +åkrehamn.no +algard.no +ålgård.no +arna.no +brumunddal.no +bryne.no +bronnoysund.no +brønnøysund.no +drobak.no +drøbak.no +egersund.no +fetsund.no +floro.no +florø.no +fredrikstad.no +hokksund.no +honefoss.no +hønefoss.no +jessheim.no +jorpeland.no +jørpeland.no +kirkenes.no +kopervik.no +krokstadelva.no +langevag.no +langevåg.no +leirvik.no +mjondalen.no +mjøndalen.no +mo-i-rana.no +mosjoen.no +mosjøen.no +nesoddtangen.no +orkanger.no +osoyro.no +osøyro.no +raholt.no +råholt.no +sandnessjoen.no +sandnessjøen.no +skedsmokorset.no +slattum.no +spjelkavik.no +stathelle.no +stavern.no +stjordalshalsen.no +stjørdalshalsen.no +tananger.no +tranby.no +vossevangen.no +// communities +afjord.no +åfjord.no +agdenes.no +al.no +ål.no +alesund.no +ålesund.no +alstahaug.no +alta.no +áltá.no +alaheadju.no +álaheadju.no +alvdal.no +amli.no +åmli.no +amot.no +åmot.no +andebu.no +andoy.no +andøy.no +andasuolo.no +ardal.no +årdal.no +aremark.no +arendal.no +ås.no +aseral.no +åseral.no +asker.no +askim.no +askvoll.no +askoy.no +askøy.no +asnes.no +åsnes.no 
+audnedaln.no +aukra.no +aure.no +aurland.no +aurskog-holand.no +aurskog-høland.no +austevoll.no +austrheim.no +averoy.no +averøy.no +balestrand.no +ballangen.no +balat.no +bálát.no +balsfjord.no +bahccavuotna.no +báhccavuotna.no +bamble.no +bardu.no +beardu.no +beiarn.no +bajddar.no +bájddar.no +baidar.no +báidár.no +berg.no +bergen.no +berlevag.no +berlevåg.no +bearalvahki.no +bearalváhki.no +bindal.no +birkenes.no +bjarkoy.no +bjarkøy.no +bjerkreim.no +bjugn.no +bodo.no +bodø.no +badaddja.no +bådåddjå.no +budejju.no +bokn.no +bremanger.no +bronnoy.no +brønnøy.no +bygland.no +bykle.no +barum.no +bærum.no +bo.telemark.no +bø.telemark.no +bo.nordland.no +bø.nordland.no +bievat.no +bievát.no +bomlo.no +bømlo.no +batsfjord.no +båtsfjord.no +bahcavuotna.no +báhcavuotna.no +dovre.no +drammen.no +drangedal.no +dyroy.no +dyrøy.no +donna.no +dønna.no +eid.no +eidfjord.no +eidsberg.no +eidskog.no +eidsvoll.no +eigersund.no +elverum.no +enebakk.no +engerdal.no +etne.no +etnedal.no +evenes.no +evenassi.no +evenášši.no +evje-og-hornnes.no +farsund.no +fauske.no +fuossko.no +fuoisku.no +fedje.no +fet.no +finnoy.no +finnøy.no +fitjar.no +fjaler.no +fjell.no +flakstad.no +flatanger.no +flekkefjord.no +flesberg.no +flora.no +fla.no +flå.no +folldal.no +forsand.no +fosnes.no +frei.no +frogn.no +froland.no +frosta.no +frana.no +fræna.no +froya.no +frøya.no +fusa.no +fyresdal.no +forde.no +førde.no +gamvik.no +gangaviika.no +gáŋgaviika.no +gaular.no +gausdal.no +gildeskal.no +gildeskål.no +giske.no +gjemnes.no +gjerdrum.no +gjerstad.no +gjesdal.no +gjovik.no +gjøvik.no +gloppen.no +gol.no +gran.no +grane.no +granvin.no +gratangen.no +grimstad.no +grong.no +kraanghke.no +kråanghke.no +grue.no +gulen.no +hadsel.no +halden.no +halsa.no +hamar.no +hamaroy.no +habmer.no +hábmer.no +hapmir.no +hápmir.no +hammerfest.no +hammarfeasta.no +hámmárfeasta.no +haram.no +hareid.no +harstad.no +hasvik.no +aknoluokta.no +ákŋoluokta.no +hattfjelldal.no +aarborte.no +haugesund.no +hemne.no +hemnes.no 
+hemsedal.no +heroy.more-og-romsdal.no +herøy.møre-og-romsdal.no +heroy.nordland.no +herøy.nordland.no +hitra.no +hjartdal.no +hjelmeland.no +hobol.no +hobøl.no +hof.no +hol.no +hole.no +holmestrand.no +holtalen.no +holtålen.no +hornindal.no +horten.no +hurdal.no +hurum.no +hvaler.no +hyllestad.no +hagebostad.no +hægebostad.no +hoyanger.no +høyanger.no +hoylandet.no +høylandet.no +ha.no +hå.no +ibestad.no +inderoy.no +inderøy.no +iveland.no +jevnaker.no +jondal.no +jolster.no +jølster.no +karasjok.no +karasjohka.no +kárášjohka.no +karlsoy.no +galsa.no +gálsá.no +karmoy.no +karmøy.no +kautokeino.no +guovdageaidnu.no +klepp.no +klabu.no +klæbu.no +kongsberg.no +kongsvinger.no +kragero.no +kragerø.no +kristiansand.no +kristiansund.no +krodsherad.no +krødsherad.no +kvalsund.no +rahkkeravju.no +ráhkkerávju.no +kvam.no +kvinesdal.no +kvinnherad.no +kviteseid.no +kvitsoy.no +kvitsøy.no +kvafjord.no +kvæfjord.no +giehtavuoatna.no +kvanangen.no +kvænangen.no +navuotna.no +návuotna.no +kafjord.no +kåfjord.no +gaivuotna.no +gáivuotna.no +larvik.no +lavangen.no +lavagis.no +loabat.no +loabát.no +lebesby.no +davvesiida.no +leikanger.no +leirfjord.no +leka.no +leksvik.no +lenvik.no +leangaviika.no +leaŋgaviika.no +lesja.no +levanger.no +lier.no +lierne.no +lillehammer.no +lillesand.no +lindesnes.no +lindas.no +lindås.no +lom.no +loppa.no +lahppi.no +láhppi.no +lund.no +lunner.no +luroy.no +lurøy.no +luster.no +lyngdal.no +lyngen.no +ivgu.no +lardal.no +lerdal.no +lærdal.no +lodingen.no +lødingen.no +lorenskog.no +lørenskog.no +loten.no +løten.no +malvik.no +masoy.no +måsøy.no +muosat.no +muosát.no +mandal.no +marker.no +marnardal.no +masfjorden.no +meland.no +meldal.no +melhus.no +meloy.no +meløy.no +meraker.no +meråker.no +moareke.no +moåreke.no +midsund.no +midtre-gauldal.no +modalen.no +modum.no +molde.no +moskenes.no +moss.no +mosvik.no +malselv.no +målselv.no +malatvuopmi.no +málatvuopmi.no +namdalseid.no +aejrie.no +namsos.no +namsskogan.no +naamesjevuemie.no 
+nååmesjevuemie.no +laakesvuemie.no +nannestad.no +narvik.no +narviika.no +naustdal.no +nedre-eiker.no +nes.akershus.no +nes.buskerud.no +nesna.no +nesodden.no +nesseby.no +unjarga.no +unjárga.no +nesset.no +nissedal.no +nittedal.no +nord-aurdal.no +nord-fron.no +nord-odal.no +norddal.no +nordkapp.no +davvenjarga.no +davvenjárga.no +nordre-land.no +nordreisa.no +raisa.no +ráisa.no +nore-og-uvdal.no +notodden.no +naroy.no +nærøy.no +notteroy.no +nøtterøy.no +odda.no +oksnes.no +øksnes.no +oppdal.no +oppegard.no +oppegård.no +orkdal.no +orland.no +ørland.no +orskog.no +ørskog.no +orsta.no +ørsta.no +os.hedmark.no +os.hordaland.no +osen.no +osteroy.no +osterøy.no +ostre-toten.no +østre-toten.no +overhalla.no +ovre-eiker.no +øvre-eiker.no +oyer.no +øyer.no +oygarden.no +øygarden.no +oystre-slidre.no +øystre-slidre.no +porsanger.no +porsangu.no +porsáŋgu.no +porsgrunn.no +radoy.no +radøy.no +rakkestad.no +rana.no +ruovat.no +randaberg.no +rauma.no +rendalen.no +rennebu.no +rennesoy.no +rennesøy.no +rindal.no +ringebu.no +ringerike.no +ringsaker.no +rissa.no +risor.no +risør.no +roan.no +rollag.no +rygge.no +ralingen.no +rælingen.no +rodoy.no +rødøy.no +romskog.no +rømskog.no +roros.no +røros.no +rost.no +røst.no +royken.no +røyken.no +royrvik.no +røyrvik.no +rade.no +råde.no +salangen.no +siellak.no +saltdal.no +salat.no +sálát.no +sálat.no +samnanger.no +sande.more-og-romsdal.no +sande.møre-og-romsdal.no +sande.vestfold.no +sandefjord.no +sandnes.no +sandoy.no +sandøy.no +sarpsborg.no +sauda.no +sauherad.no +sel.no +selbu.no +selje.no +seljord.no +sigdal.no +siljan.no +sirdal.no +skaun.no +skedsmo.no +ski.no +skien.no +skiptvet.no +skjervoy.no +skjervøy.no +skierva.no +skiervá.no +skjak.no +skjåk.no +skodje.no +skanland.no +skånland.no +skanit.no +skánit.no +smola.no +smøla.no +snillfjord.no +snasa.no +snåsa.no +snoasa.no +snaase.no +snåase.no +sogndal.no +sokndal.no +sola.no +solund.no +songdalen.no +sortland.no +spydeberg.no +stange.no +stavanger.no +steigen.no 
+steinkjer.no +stjordal.no +stjørdal.no +stokke.no +stor-elvdal.no +stord.no +stordal.no +storfjord.no +omasvuotna.no +strand.no +stranda.no +stryn.no +sula.no +suldal.no +sund.no +sunndal.no +surnadal.no +sveio.no +svelvik.no +sykkylven.no +sogne.no +søgne.no +somna.no +sømna.no +sondre-land.no +søndre-land.no +sor-aurdal.no +sør-aurdal.no +sor-fron.no +sør-fron.no +sor-odal.no +sør-odal.no +sor-varanger.no +sør-varanger.no +matta-varjjat.no +mátta-várjjat.no +sorfold.no +sørfold.no +sorreisa.no +sørreisa.no +sorum.no +sørum.no +tana.no +deatnu.no +time.no +tingvoll.no +tinn.no +tjeldsund.no +dielddanuorri.no +tjome.no +tjøme.no +tokke.no +tolga.no +torsken.no +tranoy.no +tranøy.no +tromso.no +tromsø.no +tromsa.no +romsa.no +trondheim.no +troandin.no +trysil.no +trana.no +træna.no +trogstad.no +trøgstad.no +tvedestrand.no +tydal.no +tynset.no +tysfjord.no +divtasvuodna.no +divttasvuotna.no +tysnes.no +tysvar.no +tysvær.no +tonsberg.no +tønsberg.no +ullensaker.no +ullensvang.no +ulvik.no +utsira.no +vadso.no +vadsø.no +cahcesuolo.no +čáhcesuolo.no +vaksdal.no +valle.no +vang.no +vanylven.no +vardo.no +vardø.no +varggat.no +várggát.no +vefsn.no +vaapste.no +vega.no +vegarshei.no +vegårshei.no +vennesla.no +verdal.no +verran.no +vestby.no +vestnes.no +vestre-slidre.no +vestre-toten.no +vestvagoy.no +vestvågøy.no +vevelstad.no +vik.no +vikna.no +vindafjord.no +volda.no +voss.no +varoy.no +værøy.no +vagan.no +vågan.no +voagat.no +vagsoy.no +vågsøy.no +vaga.no +vågå.no +valer.ostfold.no +våler.østfold.no +valer.hedmark.no +våler.hedmark.no + +// np : http://www.mos.com.np/register.html +*.np + +// nr : http://cenpac.net.nr/dns/index.html +// Confirmed by registry 2008-06-17 +nr +biz.nr +info.nr +gov.nr +edu.nr +org.nr +net.nr +com.nr + +// nu : http://en.wikipedia.org/wiki/.nu +nu + +// nz : http://en.wikipedia.org/wiki/.nz +*.nz + +// om : http://en.wikipedia.org/wiki/.om +*.om +!mediaphone.om +!nawrastelecom.om +!nawras.om +!omanmobile.om +!omanpost.om +!omantel.om 
+!rakpetroleum.om +!siemens.om +!songfest.om +!statecouncil.om + +// org : http://en.wikipedia.org/wiki/.org +org + +// pa : http://www.nic.pa/ +// Some additional second level "domains" resolve directly as hostnames, such as +// pannet.pa, so we add a rule for "pa". +pa +ac.pa +gob.pa +com.pa +org.pa +sld.pa +edu.pa +net.pa +ing.pa +abo.pa +med.pa +nom.pa + +// pe : https://www.nic.pe/InformeFinalComision.pdf +pe +edu.pe +gob.pe +nom.pe +mil.pe +org.pe +com.pe +net.pe + +// pf : http://www.gobin.info/domainname/formulaire-pf.pdf +pf +com.pf +org.pf +edu.pf + +// pg : http://en.wikipedia.org/wiki/.pg +*.pg + +// ph : http://www.domains.ph/FAQ2.asp +// Submitted by registry 2008-06-13 +ph +com.ph +net.ph +org.ph +gov.ph +edu.ph +ngo.ph +mil.ph +i.ph + +// pk : http://pk5.pknic.net.pk/pk5/msgNamepk.PK +pk +com.pk +net.pk +edu.pk +org.pk +fam.pk +biz.pk +web.pk +gov.pk +gob.pk +gok.pk +gon.pk +gop.pk +gos.pk +info.pk + +// pl : http://www.dns.pl/english/ +pl +// NASK functional domains (nask.pl / dns.pl) : http://www.dns.pl/english/dns-funk.html +aid.pl +agro.pl +atm.pl +auto.pl +biz.pl +com.pl +edu.pl +gmina.pl +gsm.pl +info.pl +mail.pl +miasta.pl +media.pl +mil.pl +net.pl +nieruchomosci.pl +nom.pl +org.pl +pc.pl +powiat.pl +priv.pl +realestate.pl +rel.pl +sex.pl +shop.pl +sklep.pl +sos.pl +szkola.pl +targi.pl +tm.pl +tourism.pl +travel.pl +turystyka.pl +// ICM functional domains (icm.edu.pl) +6bone.pl +art.pl +mbone.pl +// Government domains (administred by ippt.gov.pl) +gov.pl +uw.gov.pl +um.gov.pl +ug.gov.pl +upow.gov.pl +starostwo.gov.pl +so.gov.pl +sr.gov.pl +po.gov.pl +pa.gov.pl +// other functional domains +ngo.pl +irc.pl +usenet.pl +// NASK geographical domains : http://www.dns.pl/english/dns-regiony.html +augustow.pl +babia-gora.pl +bedzin.pl +beskidy.pl +bialowieza.pl +bialystok.pl +bielawa.pl +bieszczady.pl +boleslawiec.pl +bydgoszcz.pl +bytom.pl +cieszyn.pl +czeladz.pl +czest.pl +dlugoleka.pl +elblag.pl +elk.pl +glogow.pl +gniezno.pl +gorlice.pl 
+grajewo.pl +ilawa.pl +jaworzno.pl +jelenia-gora.pl +jgora.pl +kalisz.pl +kazimierz-dolny.pl +karpacz.pl +kartuzy.pl +kaszuby.pl +katowice.pl +kepno.pl +ketrzyn.pl +klodzko.pl +kobierzyce.pl +kolobrzeg.pl +konin.pl +konskowola.pl +kutno.pl +lapy.pl +lebork.pl +legnica.pl +lezajsk.pl +limanowa.pl +lomza.pl +lowicz.pl +lubin.pl +lukow.pl +malbork.pl +malopolska.pl +mazowsze.pl +mazury.pl +mielec.pl +mielno.pl +mragowo.pl +naklo.pl +nowaruda.pl +nysa.pl +olawa.pl +olecko.pl +olkusz.pl +olsztyn.pl +opoczno.pl +opole.pl +ostroda.pl +ostroleka.pl +ostrowiec.pl +ostrowwlkp.pl +pila.pl +pisz.pl +podhale.pl +podlasie.pl +polkowice.pl +pomorze.pl +pomorskie.pl +prochowice.pl +pruszkow.pl +przeworsk.pl +pulawy.pl +radom.pl +rawa-maz.pl +rybnik.pl +rzeszow.pl +sanok.pl +sejny.pl +siedlce.pl +slask.pl +slupsk.pl +sosnowiec.pl +stalowa-wola.pl +skoczow.pl +starachowice.pl +stargard.pl +suwalki.pl +swidnica.pl +swiebodzin.pl +swinoujscie.pl +szczecin.pl +szczytno.pl +tarnobrzeg.pl +tgory.pl +turek.pl +tychy.pl +ustka.pl +walbrzych.pl +warmia.pl +warszawa.pl +waw.pl +wegrow.pl +wielun.pl +wlocl.pl +wloclawek.pl +wodzislaw.pl +wolomin.pl +wroclaw.pl +zachpomor.pl +zagan.pl +zarow.pl +zgora.pl +zgorzelec.pl +// TASK geographical domains (www.task.gda.pl/uslugi/dns) +gda.pl +gdansk.pl +gdynia.pl +med.pl +sopot.pl +// other geographical domains +gliwice.pl +krakow.pl +poznan.pl +wroc.pl +zakopane.pl + +// pm : http://www.afnic.fr/medias/documents/AFNIC-naming-policy2012.pdf +pm + +// pn : http://www.government.pn/PnRegistry/policies.htm +pn +gov.pn +co.pn +org.pn +edu.pn +net.pn + +// post : http://en.wikipedia.org/wiki/.post +post + +// pr : http://www.nic.pr/index.asp?f=1 +pr +com.pr +net.pr +org.pr +gov.pr +edu.pr +isla.pr +pro.pr +biz.pr +info.pr +name.pr +// these aren't mentioned on nic.pr, but on http://en.wikipedia.org/wiki/.pr +est.pr +prof.pr +ac.pr + +// pro : http://www.nic.pro/support_faq.htm +pro +aca.pro +bar.pro +cpa.pro +jur.pro +law.pro +med.pro +eng.pro + +// ps : 
http://en.wikipedia.org/wiki/.ps +// http://www.nic.ps/registration/policy.html#reg +ps +edu.ps +gov.ps +sec.ps +plo.ps +com.ps +org.ps +net.ps + +// pt : http://online.dns.pt/dns/start_dns +pt +net.pt +gov.pt +org.pt +edu.pt +int.pt +publ.pt +com.pt +nome.pt + +// pw : http://en.wikipedia.org/wiki/.pw +pw +co.pw +ne.pw +or.pw +ed.pw +go.pw +belau.pw + +// py : http://www.nic.py/pautas.html#seccion_9 +// Confirmed by registry 2012-10-03 +py +com.py +coop.py +edu.py +gov.py +mil.py +net.py +org.py + +// qa : http://domains.qa/en/ +qa +com.qa +edu.qa +gov.qa +mil.qa +name.qa +net.qa +org.qa +sch.qa + +// re : http://www.afnic.re/obtenir/chartes/nommage-re/annexe-descriptifs +re +com.re +asso.re +nom.re + +// ro : http://www.rotld.ro/ +ro +com.ro +org.ro +tm.ro +nt.ro +nom.ro +info.ro +rec.ro +arts.ro +firm.ro +store.ro +www.ro + +// rs : http://en.wikipedia.org/wiki/.rs +rs +co.rs +org.rs +edu.rs +ac.rs +gov.rs +in.rs + +// ru : http://www.cctld.ru/ru/docs/aktiv_8.php +// Industry domains +ru +ac.ru +com.ru +edu.ru +int.ru +net.ru +org.ru +pp.ru +// Geographical domains +adygeya.ru +altai.ru +amur.ru +arkhangelsk.ru +astrakhan.ru +bashkiria.ru +belgorod.ru +bir.ru +bryansk.ru +buryatia.ru +cbg.ru +chel.ru +chelyabinsk.ru +chita.ru +chukotka.ru +chuvashia.ru +dagestan.ru +dudinka.ru +e-burg.ru +grozny.ru +irkutsk.ru +ivanovo.ru +izhevsk.ru +jar.ru +joshkar-ola.ru +kalmykia.ru +kaluga.ru +kamchatka.ru +karelia.ru +kazan.ru +kchr.ru +kemerovo.ru +khabarovsk.ru +khakassia.ru +khv.ru +kirov.ru +koenig.ru +komi.ru +kostroma.ru +krasnoyarsk.ru +kuban.ru +kurgan.ru +kursk.ru +lipetsk.ru +magadan.ru +mari.ru +mari-el.ru +marine.ru +mordovia.ru +mosreg.ru +msk.ru +murmansk.ru +nalchik.ru +nnov.ru +nov.ru +novosibirsk.ru +nsk.ru +omsk.ru +orenburg.ru +oryol.ru +palana.ru +penza.ru +perm.ru +pskov.ru +ptz.ru +rnd.ru +ryazan.ru +sakhalin.ru +samara.ru +saratov.ru +simbirsk.ru +smolensk.ru +spb.ru +stavropol.ru +stv.ru +surgut.ru +tambov.ru +tatarstan.ru +tom.ru +tomsk.ru 
+tsaritsyn.ru +tsk.ru +tula.ru +tuva.ru +tver.ru +tyumen.ru +udm.ru +udmurtia.ru +ulan-ude.ru +vladikavkaz.ru +vladimir.ru +vladivostok.ru +volgograd.ru +vologda.ru +voronezh.ru +vrn.ru +vyatka.ru +yakutia.ru +yamal.ru +yaroslavl.ru +yekaterinburg.ru +yuzhno-sakhalinsk.ru +// More geographical domains +amursk.ru +baikal.ru +cmw.ru +fareast.ru +jamal.ru +kms.ru +k-uralsk.ru +kustanai.ru +kuzbass.ru +magnitka.ru +mytis.ru +nakhodka.ru +nkz.ru +norilsk.ru +oskol.ru +pyatigorsk.ru +rubtsovsk.ru +snz.ru +syzran.ru +vdonsk.ru +zgrad.ru +// State domains +gov.ru +mil.ru +// Technical domains +test.ru + +// rw : http://www.nic.rw/cgi-bin/policy.pl +rw +gov.rw +net.rw +edu.rw +ac.rw +com.rw +co.rw +int.rw +mil.rw +gouv.rw + +// sa : http://www.nic.net.sa/ +sa +com.sa +net.sa +org.sa +gov.sa +med.sa +pub.sa +edu.sa +sch.sa + +// sb : http://www.sbnic.net.sb/ +// Submitted by registry 2008-06-08 +sb +com.sb +edu.sb +gov.sb +net.sb +org.sb + +// sc : http://www.nic.sc/ +sc +com.sc +gov.sc +net.sc +org.sc +edu.sc + +// sd : http://www.isoc.sd/sudanic.isoc.sd/billing_pricing.htm +// Submitted by registry 2008-06-17 +sd +com.sd +net.sd +org.sd +edu.sd +med.sd +tv.sd +gov.sd +info.sd + +// se : http://en.wikipedia.org/wiki/.se +// Submitted by registry 2008-06-24 +se +a.se +ac.se +b.se +bd.se +brand.se +c.se +d.se +e.se +f.se +fh.se +fhsk.se +fhv.se +g.se +h.se +i.se +k.se +komforb.se +kommunalforbund.se +komvux.se +l.se +lanbib.se +m.se +n.se +naturbruksgymn.se +o.se +org.se +p.se +parti.se +pp.se +press.se +r.se +s.se +sshn.se +t.se +tm.se +u.se +w.se +x.se +y.se +z.se + +// sg : http://www.nic.net.sg/page/registration-policies-procedures-and-guidelines +sg +com.sg +net.sg +org.sg +gov.sg +edu.sg +per.sg + +// sh : http://www.nic.sh/registrar.html +sh +com.sh +net.sh +gov.sh +org.sh +mil.sh + +// si : http://en.wikipedia.org/wiki/.si +si + +// sj : No registrations at this time. 
+// Submitted by registry 2008-06-16 + +// sk : http://en.wikipedia.org/wiki/.sk +// list of 2nd level domains ? +sk + +// sl : http://www.nic.sl +// Submitted by registry 2008-06-12 +sl +com.sl +net.sl +edu.sl +gov.sl +org.sl + +// sm : http://en.wikipedia.org/wiki/.sm +sm + +// sn : http://en.wikipedia.org/wiki/.sn +sn +art.sn +com.sn +edu.sn +gouv.sn +org.sn +perso.sn +univ.sn + +// so : http://www.soregistry.com/ +so +com.so +net.so +org.so + +// sr : http://en.wikipedia.org/wiki/.sr +sr + +// st : http://www.nic.st/html/policyrules/ +st +co.st +com.st +consulado.st +edu.st +embaixada.st +gov.st +mil.st +net.st +org.st +principe.st +saotome.st +store.st + +// su : http://en.wikipedia.org/wiki/.su +su + +// sv : http://www.svnet.org.sv/svpolicy.html +*.sv + +// sx : http://en.wikipedia.org/wiki/.sx +// Confirmed by registry 2012-05-31 +sx +gov.sx + +// sy : http://en.wikipedia.org/wiki/.sy +// see also: http://www.gobin.info/domainname/sy.doc +sy +edu.sy +gov.sy +net.sy +mil.sy +com.sy +org.sy + +// sz : http://en.wikipedia.org/wiki/.sz +// http://www.sispa.org.sz/ +sz +co.sz +ac.sz +org.sz + +// tc : http://en.wikipedia.org/wiki/.tc +tc + +// td : http://en.wikipedia.org/wiki/.td +td + +// tel: http://en.wikipedia.org/wiki/.tel +// http://www.telnic.org/ +tel + +// tf : http://en.wikipedia.org/wiki/.tf +tf + +// tg : http://en.wikipedia.org/wiki/.tg +// http://www.nic.tg/ +tg + +// th : http://en.wikipedia.org/wiki/.th +// Submitted by registry 2008-06-17 +th +ac.th +co.th +go.th +in.th +mi.th +net.th +or.th + +// tj : http://www.nic.tj/policy.html +tj +ac.tj +biz.tj +co.tj +com.tj +edu.tj +go.tj +gov.tj +int.tj +mil.tj +name.tj +net.tj +nic.tj +org.tj +test.tj +web.tj + +// tk : http://en.wikipedia.org/wiki/.tk +tk + +// tl : http://en.wikipedia.org/wiki/.tl +tl +gov.tl + +// tm : http://www.nic.tm/local.html +tm +com.tm +co.tm +org.tm +net.tm +nom.tm +gov.tm +mil.tm +edu.tm + +// tn : http://en.wikipedia.org/wiki/.tn +// http://whois.ati.tn/ +tn +com.tn 
+ens.tn +fin.tn +gov.tn +ind.tn +intl.tn +nat.tn +net.tn +org.tn +info.tn +perso.tn +tourism.tn +edunet.tn +rnrt.tn +rns.tn +rnu.tn +mincom.tn +agrinet.tn +defense.tn +turen.tn + +// to : http://en.wikipedia.org/wiki/.to +// Submitted by registry 2008-06-17 +to +com.to +gov.to +net.to +org.to +edu.to +mil.to + +// tr : http://en.wikipedia.org/wiki/.tr +*.tr +!nic.tr +// Used by government in the TRNC +// http://en.wikipedia.org/wiki/.nc.tr +gov.nc.tr + +// travel : http://en.wikipedia.org/wiki/.travel +travel + +// tt : http://www.nic.tt/ +tt +co.tt +com.tt +org.tt +net.tt +biz.tt +info.tt +pro.tt +int.tt +coop.tt +jobs.tt +mobi.tt +travel.tt +museum.tt +aero.tt +name.tt +gov.tt +edu.tt + +// tv : http://en.wikipedia.org/wiki/.tv +// Not listing any 2LDs as reserved since none seem to exist in practice, +// Wikipedia notwithstanding. +tv + +// tw : http://en.wikipedia.org/wiki/.tw +tw +edu.tw +gov.tw +mil.tw +com.tw +net.tw +org.tw +idv.tw +game.tw +ebiz.tw +club.tw +網路.tw +組織.tw +商業.tw + +// tz : http://www.tznic.or.tz/index.php/domains +// Confirmed by registry 2013-01-22 +ac.tz +co.tz +go.tz +hotel.tz +info.tz +me.tz +mil.tz +mobi.tz +ne.tz +or.tz +sc.tz +tv.tz + +// ua : https://hostmaster.ua/policy/?ua +// Submitted by registry 2012-04-27 +ua +// ua 2LD +com.ua +edu.ua +gov.ua +in.ua +net.ua +org.ua +// ua geographic names +// https://hostmaster.ua/2ld/ +cherkassy.ua +cherkasy.ua +chernigov.ua +chernihiv.ua +chernivtsi.ua +chernovtsy.ua +ck.ua +cn.ua +cr.ua +crimea.ua +cv.ua +dn.ua +dnepropetrovsk.ua +dnipropetrovsk.ua +dominic.ua +donetsk.ua +dp.ua +if.ua +ivano-frankivsk.ua +kh.ua +kharkiv.ua +kharkov.ua +kherson.ua +khmelnitskiy.ua +khmelnytskyi.ua +kiev.ua +kirovograd.ua +km.ua +kr.ua +krym.ua +ks.ua +kv.ua +kyiv.ua +lg.ua +lt.ua +lugansk.ua +lutsk.ua +lv.ua +lviv.ua +mk.ua +mykolaiv.ua +nikolaev.ua +od.ua +odesa.ua +odessa.ua +pl.ua +poltava.ua +rivne.ua +rovno.ua +rv.ua +sb.ua +sebastopol.ua +sevastopol.ua +sm.ua +sumy.ua +te.ua +ternopil.ua +uz.ua 
+uzhgorod.ua +vinnica.ua +vinnytsia.ua +vn.ua +volyn.ua +yalta.ua +zaporizhzhe.ua +zaporizhzhia.ua +zhitomir.ua +zhytomyr.ua +zp.ua +zt.ua + +// Private registries in .ua +co.ua +pp.ua + +// ug : https://www.registry.co.ug/ +ug +co.ug +or.ug +ac.ug +sc.ug +go.ug +ne.ug +com.ug +org.ug + +// uk : http://en.wikipedia.org/wiki/.uk +// Submitted by registry 2012-10-02 +// and tweaked by us pending further consultation. +*.uk +*.sch.uk +!bl.uk +!british-library.uk +!jet.uk +!mod.uk +!national-library-scotland.uk +!nel.uk +!nic.uk +!nls.uk +!parliament.uk + +// us : http://en.wikipedia.org/wiki/.us +us +dni.us +fed.us +isa.us +kids.us +nsn.us +// us geographic names +ak.us +al.us +ar.us +as.us +az.us +ca.us +co.us +ct.us +dc.us +de.us +fl.us +ga.us +gu.us +hi.us +ia.us +id.us +il.us +in.us +ks.us +ky.us +la.us +ma.us +md.us +me.us +mi.us +mn.us +mo.us +ms.us +mt.us +nc.us +nd.us +ne.us +nh.us +nj.us +nm.us +nv.us +ny.us +oh.us +ok.us +or.us +pa.us +pr.us +ri.us +sc.us +sd.us +tn.us +tx.us +ut.us +vi.us +vt.us +va.us +wa.us +wi.us +wv.us +wy.us +// The registrar notes several more specific domains available in each state, +// such as state.*.us, dst.*.us, etc., but resolution of these is somewhat +// haphazard; in some states these domains resolve as addresses, while in others +// only subdomains are available, or even nothing at all. We include the +// most common ones where it's clear that different sites are different +// entities. 
+k12.ak.us +k12.al.us +k12.ar.us +k12.as.us +k12.az.us +k12.ca.us +k12.co.us +k12.ct.us +k12.dc.us +k12.de.us +k12.fl.us +k12.ga.us +k12.gu.us +// k12.hi.us Hawaii has a state-wide DOE login: bug 614565 +k12.ia.us +k12.id.us +k12.il.us +k12.in.us +k12.ks.us +k12.ky.us +k12.la.us +k12.ma.us +k12.md.us +k12.me.us +k12.mi.us +k12.mn.us +k12.mo.us +k12.ms.us +k12.mt.us +k12.nc.us +k12.nd.us +k12.ne.us +k12.nh.us +k12.nj.us +k12.nm.us +k12.nv.us +k12.ny.us +k12.oh.us +k12.ok.us +k12.or.us +k12.pa.us +k12.pr.us +k12.ri.us +k12.sc.us +k12.sd.us +k12.tn.us +k12.tx.us +k12.ut.us +k12.vi.us +k12.vt.us +k12.va.us +k12.wa.us +k12.wi.us +k12.wv.us +k12.wy.us + +cc.ak.us +cc.al.us +cc.ar.us +cc.as.us +cc.az.us +cc.ca.us +cc.co.us +cc.ct.us +cc.dc.us +cc.de.us +cc.fl.us +cc.ga.us +cc.gu.us +cc.hi.us +cc.ia.us +cc.id.us +cc.il.us +cc.in.us +cc.ks.us +cc.ky.us +cc.la.us +cc.ma.us +cc.md.us +cc.me.us +cc.mi.us +cc.mn.us +cc.mo.us +cc.ms.us +cc.mt.us +cc.nc.us +cc.nd.us +cc.ne.us +cc.nh.us +cc.nj.us +cc.nm.us +cc.nv.us +cc.ny.us +cc.oh.us +cc.ok.us +cc.or.us +cc.pa.us +cc.pr.us +cc.ri.us +cc.sc.us +cc.sd.us +cc.tn.us +cc.tx.us +cc.ut.us +cc.vi.us +cc.vt.us +cc.va.us +cc.wa.us +cc.wi.us +cc.wv.us +cc.wy.us + +lib.ak.us +lib.al.us +lib.ar.us +lib.as.us +lib.az.us +lib.ca.us +lib.co.us +lib.ct.us +lib.dc.us +lib.de.us +lib.fl.us +lib.ga.us +lib.gu.us +lib.hi.us +lib.ia.us +lib.id.us +lib.il.us +lib.in.us +lib.ks.us +lib.ky.us +lib.la.us +lib.ma.us +lib.md.us +lib.me.us +lib.mi.us +lib.mn.us +lib.mo.us +lib.ms.us +lib.mt.us +lib.nc.us +lib.nd.us +lib.ne.us +lib.nh.us +lib.nj.us +lib.nm.us +lib.nv.us +lib.ny.us +lib.oh.us +lib.ok.us +lib.or.us +lib.pa.us +lib.pr.us +lib.ri.us +lib.sc.us +lib.sd.us +lib.tn.us +lib.tx.us +lib.ut.us +lib.vi.us +lib.vt.us +lib.va.us +lib.wa.us +lib.wi.us +lib.wv.us +lib.wy.us + +// k12.ma.us contains school districts in Massachusetts. The 4LDs are +// managed indepedently except for private (PVT), charter (CHTR) and +// parochial (PAROCH) schools. 
Those are delegated dorectly to the +// 5LD operators. +pvt.k12.ma.us +chtr.k12.ma.us +paroch.k12.ma.us + +// uy : http://www.nic.org.uy/ +uy +com.uy +edu.uy +gub.uy +mil.uy +net.uy +org.uy + +// uz : http://www.reg.uz/ +uz +co.uz +com.uz +net.uz +org.uz + +// va : http://en.wikipedia.org/wiki/.va +va + +// vc : http://en.wikipedia.org/wiki/.vc +// Submitted by registry 2008-06-13 +vc +com.vc +net.vc +org.vc +gov.vc +mil.vc +edu.vc + +// ve : https://registro.nic.ve/ +// Confirmed by registry 2012-10-04 +ve +co.ve +com.ve +e12.ve +edu.ve +gov.ve +info.ve +mil.ve +net.ve +org.ve +web.ve + +// vg : http://en.wikipedia.org/wiki/.vg +vg + +// vi : http://www.nic.vi/newdomainform.htm +// http://www.nic.vi/Domain_Rules/body_domain_rules.html indicates some other +// TLDs are "reserved", such as edu.vi and gov.vi, but doesn't actually say they +// are available for registration (which they do not seem to be). +vi +co.vi +com.vi +k12.vi +net.vi +org.vi + +// vn : https://www.dot.vn/vnnic/vnnic/domainregistration.jsp +vn +com.vn +net.vn +org.vn +edu.vn +gov.vn +int.vn +ac.vn +biz.vn +info.vn +name.vn +pro.vn +health.vn + +// vu : http://en.wikipedia.org/wiki/.vu +// list of 2nd level tlds ? 
+vu + +// wf : http://www.afnic.fr/medias/documents/AFNIC-naming-policy2012.pdf +wf + +// ws : http://en.wikipedia.org/wiki/.ws +// http://samoanic.ws/index.dhtml +ws +com.ws +net.ws +org.ws +gov.ws +edu.ws + +// yt : http://www.afnic.fr/medias/documents/AFNIC-naming-policy2012.pdf +yt + +// IDN ccTLDs +// Please sort by ISO 3166 ccTLD, then punicode string +// when submitting patches and follow this format: +// ("" ) : +// [optional sponsoring org] +// + +// xn--mgbaam7a8h ("Emerat" Arabic) : AE +// http://nic.ae/english/arabicdomain/rules.jsp +امارات + +// xn--54b7fta0cc ("Bangla" Bangla) : BD +বাংলা + +// xn--fiqs8s ("China" Chinese-Han-Simplified <.Zhonggou>) : CN +// CNNIC +// http://cnnic.cn/html/Dir/2005/10/11/3218.htm +中国 + +// xn--fiqz9s ("China" Chinese-Han-Traditional <.Zhonggou>) : CN +// CNNIC +// http://cnnic.cn/html/Dir/2005/10/11/3218.htm +中國 + +// xn--lgbbat1ad8j ("Algeria / Al Jazair" Arabic) : DZ +الجزائر + +// xn--wgbh1c ("Egypt" Arabic .masr) : EG +// http://www.dotmasr.eg/ +مصر + +// xn--node ("ge" Georgian (Mkhedruli)) : GE +გე + +// xn--j6w193g ("Hong Kong" Chinese-Han) : HK +// https://www2.hkirc.hk/register/rules.jsp +香港 + +// xn--h2brj9c ("Bharat" Devanagari) : IN +// India +भारत + +// xn--mgbbh1a71e ("Bharat" Arabic) : IN +// India +بھارت + +// xn--fpcrj9c3d ("Bharat" Telugu) : IN +// India +భారత్ + +// xn--gecrj9c ("Bharat" Gujarati) : IN +// India +ભારત + +// xn--s9brj9c ("Bharat" Gurmukhi) : IN +// India +ਭਾਰਤ + +// xn--45brj9c ("Bharat" Bengali) : IN +// India +ভারত + +// xn--xkc2dl3a5ee0h ("India" Tamil) : IN +// India +இந்தியா + +// xn--mgba3a4f16a ("Iran" Persian) : IR +ایران + +// xn--mgba3a4fra ("Iran" Arabic) : IR +ايران + +// xn--mgbayh7gpa ("al-Ordon" Arabic) : JO +// National Information Technology Center (NITC) +// Royal Scientific Society, Al-Jubeiha +الاردن + +// xn--3e0b707e ("Republic of Korea" Hangul) : KR +한국 + +// xn--fzc2c9e2c ("Lanka" Sinhalese-Sinhala) : LK +// http://nic.lk +ලංකා + +// xn--xkc2al3hye2a ("Ilangai" 
Tamil) : LK +// http://nic.lk +இலங்கை + +// xn--mgbc0a9azcg ("Morocco / al-Maghrib" Arabic) : MA +المغرب + +// xn--mgb9awbf ("Oman" Arabic) : OM +عمان + +// xn--ygbi2ammx ("Falasteen" Arabic) : PS +// The Palestinian National Internet Naming Authority (PNINA) +// http://www.pnina.ps +فلسطين + +// xn--90a3ac ("srb" Cyrillic) : RS +срб + +// xn--p1ai ("rf" Russian-Cyrillic) : RU +// http://www.cctld.ru/en/docs/rulesrf.php +рф + +// xn--wgbl6a ("Qatar" Arabic) : QA +// http://www.ict.gov.qa/ +قطر + +// xn--mgberp4a5d4ar ("AlSaudiah" Arabic) : SA +// http://www.nic.net.sa/ +السعودية + +// xn--mgberp4a5d4a87g ("AlSaudiah" Arabic) variant : SA +السعودیة + +// xn--mgbqly7c0a67fbc ("AlSaudiah" Arabic) variant : SA +السعودیۃ + +// xn--mgbqly7cvafr ("AlSaudiah" Arabic) variant : SA +السعوديه + +// xn--ogbpf8fl ("Syria" Arabic) : SY +سورية + +// xn--mgbtf8fl ("Syria" Arabic) variant : SY +سوريا + +// xn--yfro4i67o Singapore ("Singapore" Chinese-Han) : SG +新加坡 + +// xn--clchc0ea0b2g2a9gcd ("Singapore" Tamil) : SG +சிங்கப்பூர் + +// xn--o3cw4h ("Thai" Thai) : TH +// http://www.thnic.co.th +ไทย + +// xn--pgbs0dh ("Tunis") : TN +// http://nic.tn +تونس + +// xn--kpry57d ("Taiwan" Chinese-Han-Traditional) : TW +// http://www.twnic.net/english/dn/dn_07a.htm +台灣 + +// xn--kprw13d ("Taiwan" Chinese-Han-Simplified) : TW +// http://www.twnic.net/english/dn/dn_07a.htm +台湾 + +// xn--nnx388a ("Taiwan") variant : TW +臺灣 + +// xn--j1amh ("ukr" Cyrillic) : UA +укр + +// xn--mgb2ddes ("AlYemen" Arabic) : YE +اليمن + +// xxx : http://icmregistry.com +xxx + +// ye : http://www.y.net.ye/services/domain_name.htm +*.ye + +// za : http://www.zadna.org.za/slds.html +*.za + +// zm : http://en.wikipedia.org/wiki/.zm +*.zm + +// zw : http://en.wikipedia.org/wiki/.zw +*.zw + +// ===END ICANN DOMAINS=== +// ===BEGIN PRIVATE DOMAINS=== + +// Amazon CloudFront : https://aws.amazon.com/cloudfront/ +// Requested by Donavan Miller 2013-03-22 +cloudfront.net + +// Amazon Elastic Compute Cloud: 
https://aws.amazon.com/ec2/ +// Requested by Osman Surkatty 2013-04-02 +compute.amazonaws.com +us-east-1.amazonaws.com +compute-1.amazonaws.com +z-1.compute-1.amazonaws.com +z-2.compute-1.amazonaws.com +ap-northeast-1.compute.amazonaws.com +ap-southeast-1.compute.amazonaws.com +ap-southeast-2.compute.amazonaws.com +eu-west-1.compute.amazonaws.com +sa-east-1.compute.amazonaws.com +us-gov-west-1.compute.amazonaws.com +us-west-1.compute.amazonaws.com +us-west-2.compute.amazonaws.com + +// Amazon Elastic Beanstalk : https://aws.amazon.com/elasticbeanstalk/ +// Requested by Adam Stein 2013-04-02 +elasticbeanstalk.com + +// Amazon Elastic Load Balancing : https://aws.amazon.com/elasticloadbalancing/ +// Requested by Scott Vidmar 2013-03-27 +elb.amazonaws.com + +// Amazon S3 : https://aws.amazon.com/s3/ +// Requested by Courtney Eckhardt 2013-03-22 +s3.amazonaws.com +s3-us-west-2.amazonaws.com +s3-us-west-1.amazonaws.com +s3-eu-west-1.amazonaws.com +s3-ap-southeast-1.amazonaws.com +s3-ap-southeast-2.amazonaws.com +s3-ap-northeast-1.amazonaws.com +s3-sa-east-1.amazonaws.com +s3-us-gov-west-1.amazonaws.com +s3-fips-us-gov-west-1.amazonaws.com +s3-website-us-east-1.amazonaws.com +s3-website-us-west-2.amazonaws.com +s3-website-us-west-1.amazonaws.com +s3-website-eu-west-1.amazonaws.com +s3-website-ap-southeast-1.amazonaws.com +s3-website-ap-southeast-2.amazonaws.com +s3-website-ap-northeast-1.amazonaws.com +s3-website-sa-east-1.amazonaws.com +s3-website-us-gov-west-1.amazonaws.com + +// BetaInABox +// Requested by adrian@betainabox.com 2012-09-13 +betainabox.com + +// CentralNic : http://www.centralnic.com/names/domains +// Requested by registry 2012-09-27 +ae.org +ar.com +br.com +cn.com +com.de +de.com +eu.com +gb.com +gb.net +gr.com +hu.com +hu.net +jp.net +jpn.com +kr.com +no.com +qc.com +ru.com +sa.com +se.com +se.net +uk.com +uk.net +us.com +us.org +uy.com +za.com + +// c.la : http://www.c.la/ +c.la + +// cloudControl : https://www.cloudcontrol.com/ +// Requested by 
Tobias Wilken 2013-07-23 +cloudcontrolled.com +cloudcontrolapp.com + +// co.ca : http://registry.co.ca/ +co.ca + +// CoDNS B.V. +co.nl +co.no + +// DreamHost : http://www.dreamhost.com/ +// Requested by Andrew Farmer 2012-10-02 +dreamhosters.com + +// DynDNS.com : http://www.dyndns.com/services/dns/dyndns/ +dyndns-at-home.com +dyndns-at-work.com +dyndns-blog.com +dyndns-free.com +dyndns-home.com +dyndns-ip.com +dyndns-mail.com +dyndns-office.com +dyndns-pics.com +dyndns-remote.com +dyndns-server.com +dyndns-web.com +dyndns-wiki.com +dyndns-work.com +dyndns.biz +dyndns.info +dyndns.org +dyndns.tv +at-band-camp.net +ath.cx +barrel-of-knowledge.info +barrell-of-knowledge.info +better-than.tv +blogdns.com +blogdns.net +blogdns.org +blogsite.org +boldlygoingnowhere.org +broke-it.net +buyshouses.net +cechire.com +dnsalias.com +dnsalias.net +dnsalias.org +dnsdojo.com +dnsdojo.net +dnsdojo.org +does-it.net +doesntexist.com +doesntexist.org +dontexist.com +dontexist.net +dontexist.org +doomdns.com +doomdns.org +dvrdns.org +dyn-o-saur.com +dynalias.com +dynalias.net +dynalias.org +dynathome.net +dyndns.ws +endofinternet.net +endofinternet.org +endoftheinternet.org +est-a-la-maison.com +est-a-la-masion.com +est-le-patron.com +est-mon-blogueur.com +for-better.biz +for-more.biz +for-our.info +for-some.biz +for-the.biz +forgot.her.name +forgot.his.name +from-ak.com +from-al.com +from-ar.com +from-az.net +from-ca.com +from-co.net +from-ct.com +from-dc.com +from-de.com +from-fl.com +from-ga.com +from-hi.com +from-ia.com +from-id.com +from-il.com +from-in.com +from-ks.com +from-ky.com +from-la.net +from-ma.com +from-md.com +from-me.org +from-mi.com +from-mn.com +from-mo.com +from-ms.com +from-mt.com +from-nc.com +from-nd.com +from-ne.com +from-nh.com +from-nj.com +from-nm.com +from-nv.com +from-ny.net +from-oh.com +from-ok.com +from-or.com +from-pa.com +from-pr.com +from-ri.com +from-sc.com +from-sd.com +from-tn.com +from-tx.com +from-ut.com +from-va.com +from-vt.com +from-wa.com 
+from-wi.com +from-wv.com +from-wy.com +ftpaccess.cc +fuettertdasnetz.de +game-host.org +game-server.cc +getmyip.com +gets-it.net +go.dyndns.org +gotdns.com +gotdns.org +groks-the.info +groks-this.info +ham-radio-op.net +here-for-more.info +hobby-site.com +hobby-site.org +home.dyndns.org +homedns.org +homeftp.net +homeftp.org +homeip.net +homelinux.com +homelinux.net +homelinux.org +homeunix.com +homeunix.net +homeunix.org +iamallama.com +in-the-band.net +is-a-anarchist.com +is-a-blogger.com +is-a-bookkeeper.com +is-a-bruinsfan.org +is-a-bulls-fan.com +is-a-candidate.org +is-a-caterer.com +is-a-celticsfan.org +is-a-chef.com +is-a-chef.net +is-a-chef.org +is-a-conservative.com +is-a-cpa.com +is-a-cubicle-slave.com +is-a-democrat.com +is-a-designer.com +is-a-doctor.com +is-a-financialadvisor.com +is-a-geek.com +is-a-geek.net +is-a-geek.org +is-a-green.com +is-a-guru.com +is-a-hard-worker.com +is-a-hunter.com +is-a-knight.org +is-a-landscaper.com +is-a-lawyer.com +is-a-liberal.com +is-a-libertarian.com +is-a-linux-user.org +is-a-llama.com +is-a-musician.com +is-a-nascarfan.com +is-a-nurse.com +is-a-painter.com +is-a-patsfan.org +is-a-personaltrainer.com +is-a-photographer.com +is-a-player.com +is-a-republican.com +is-a-rockstar.com +is-a-socialist.com +is-a-soxfan.org +is-a-student.com +is-a-teacher.com +is-a-techie.com +is-a-therapist.com +is-an-accountant.com +is-an-actor.com +is-an-actress.com +is-an-anarchist.com +is-an-artist.com +is-an-engineer.com +is-an-entertainer.com +is-by.us +is-certified.com +is-found.org +is-gone.com +is-into-anime.com +is-into-cars.com +is-into-cartoons.com +is-into-games.com +is-leet.com +is-lost.org +is-not-certified.com +is-saved.org +is-slick.com +is-uberleet.com +is-very-bad.org +is-very-evil.org +is-very-good.org +is-very-nice.org +is-very-sweet.org +is-with-theband.com +isa-geek.com +isa-geek.net +isa-geek.org +isa-hockeynut.com +issmarterthanyou.com +isteingeek.de +istmein.de +kicks-ass.net +kicks-ass.org +knowsitall.info 
+land-4-sale.us +lebtimnetz.de +leitungsen.de +likes-pie.com +likescandy.com +merseine.nu +mine.nu +misconfused.org +mypets.ws +myphotos.cc +neat-url.com +office-on-the.net +on-the-web.tv +podzone.net +podzone.org +readmyblog.org +saves-the-whales.com +scrapper-site.net +scrapping.cc +selfip.biz +selfip.com +selfip.info +selfip.net +selfip.org +sells-for-less.com +sells-for-u.com +sells-it.net +sellsyourhome.org +servebbs.com +servebbs.net +servebbs.org +serveftp.net +serveftp.org +servegame.org +shacknet.nu +simple-url.com +space-to-rent.com +stuff-4-sale.org +stuff-4-sale.us +teaches-yoga.com +thruhere.net +traeumtgerade.de +webhop.biz +webhop.info +webhop.net +webhop.org +worse-than.tv +writesthisblog.com + +// Fastly Inc. http://www.fastly.com/ +// Requested by Vladimir Vuksan 2013-05-31 +a.ssl.fastly.net +b.ssl.fastly.net +global.ssl.fastly.net +a.prod.fastly.net +global.prod.fastly.net + +// GitHub, Inc. +// Requested by Ben Toews 2013-04-18 +github.io + +// GlobeHosting, Inc. +// Requested by Zoltan Egresi 2013-07-12 +ro.com + +// Google, Inc. 
+// Requested by Eduardo Vela 2012-10-24 +appspot.com +blogspot.be +blogspot.bj +blogspot.ca +blogspot.cf +blogspot.ch +blogspot.co.at +blogspot.co.il +blogspot.co.nz +blogspot.co.uk +blogspot.com +blogspot.com.ar +blogspot.com.au +blogspot.com.br +blogspot.com.es +blogspot.cv +blogspot.cz +blogspot.de +blogspot.dk +blogspot.fi +blogspot.fr +blogspot.gr +blogspot.hk +blogspot.hu +blogspot.ie +blogspot.in +blogspot.it +blogspot.jp +blogspot.kr +blogspot.mr +blogspot.mx +blogspot.nl +blogspot.no +blogspot.pt +blogspot.re +blogspot.ro +blogspot.se +blogspot.sg +blogspot.sk +blogspot.td +blogspot.tw +codespot.com +googleapis.com +googlecode.com + +// Heroku : https://www.heroku.com/ +// Requested by Tom Maher 2013-05-02 +herokuapp.com +herokussl.com + +// iki.fi +// Requested by Hannu Aronsson 2009-11-05 +iki.fi + +// info.at : http://www.info.at/ +biz.at +info.at + +// Michau Enterprises Limited : http://www.co.pl/ +co.pl + +// NYC.mn : http://www.information.nyc.mn +// Requested by Matthew Brown 2013-03-11 +nyc.mn + +// Opera Software, A.S.A. +// Requested by Yngve Pettersen 2009-11-26 +operaunite.com + +// Red Hat, Inc. OpenShift : https://openshift.redhat.com/ +// Requested by Tim Kramer 2012-10-24 +rhcloud.com + +// priv.at : http://www.nic.priv.at/ +// Requested by registry 2008-06-09 +priv.at + +// ZaNiC : http://www.za.net/ +// Requested by registry 2009-10-03 +za.net +za.org + +// ===END PRIVATE DOMAINS=== diff --git a/src/test/java/org/archive/net/PublicSuffixesTest.java b/src/test/java/org/archive/net/PublicSuffixesTest.java new file mode 100644 index 00000000..b88acb6d --- /dev/null +++ b/src/test/java/org/archive/net/PublicSuffixesTest.java @@ -0,0 +1,193 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. 
+ * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.net; + +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.regex.Matcher; + +import junit.framework.TestCase; + +import org.archive.net.PublicSuffixes.Node; + +/** + * Test cases for PublicSuffixes utility. Confirm expected matches/nonmatches + * from constructed regex. + * + * @author gojomo + */ +public class PublicSuffixesTest extends TestCase { + // test of low level implementation + + public void testCompare() { + Node n = new Node("hoge"); + assertTrue(n.compareTo('a') > 0); + assertEquals(-1, n.compareTo('*')); + assertEquals(-1, n.compareTo('!')); + assertEquals(-1, n.compareTo(new Node("*,"))); + assertEquals(-1, n.compareTo(new Node("!muga,"))); + assertEquals(-1, n.compareTo(new Node(""))); + + n = new Node("*,"); + assertEquals(1, n.compareTo('a')); + assertEquals(0, n.compareTo('*')); + assertEquals(1, n.compareTo('!')); + assertEquals(0, n.compareTo(new Node("*,"))); + assertEquals(1, n.compareTo(new Node("!muga,"))); + assertEquals(-1, n.compareTo(new Node(""))); + + n = new Node("!hoge"); + assertEquals(1, n.compareTo('a')); + assertEquals(-1, n.compareTo('*')); + assertEquals(0, n.compareTo('!')); + assertEquals(-1, n.compareTo(new Node("*,"))); + assertEquals(0, n.compareTo(new Node("!muga,"))); + assertEquals(-1, n.compareTo(new Node(""))); + + n = new Node(""); + assertEquals(1, 
n.compareTo('a')); + assertEquals(1, n.compareTo('*')); + assertEquals(1, n.compareTo('!')); + assertEquals(0, n.compareTo(new Node(""))); + } + + protected String dump(Node alt) { + StringWriter w = new StringWriter(); + PublicSuffixes.dump(alt, 0, new PrintWriter(w)); + return w.toString(); + } + public void testTrie1() { + Node alt = new Node(null, new ArrayList()); + alt.addBranch("ac,"); + // specifically, should not have empty string as match. + assertEquals("(null)\n" + + " \"ac,\"\n", dump(alt)); + alt.addBranch("ac,com,"); + assertEquals("(null)\n" + + " \"ac,\"\n" + + " \"com,\"\n" + + " \"\"\n", dump(alt)); + alt.addBranch("ac,edu,"); + assertEquals("(null)\n" + + " \"ac,\"\n" + + " \"com,\"\n" + + " \"edu,\"\n" + + " \"\"\n", dump(alt)); + } + public void testTrie2() { + Node alt = new Node(null, new ArrayList()); + alt.addBranch("ac,"); + alt.addBranch("*,"); + assertEquals("(null)\n" + + " \"ac,\"\n" + + " \"*,\"\n", dump(alt)); + } + + public void testTrie3() { + Node alt = new Node(null, new ArrayList()); + alt.addBranch("ac,"); + alt.addBranch("ac,!hoge,"); + alt.addBranch("ac,*,"); + // exception goes first. 
+ assertEquals("(null)\n" + + " \"ac,\"\n" + + " \"!hoge,\"\n" + + " \"*,\"\n" + + " \"\"\n", dump(alt)); + } + + // test of higher-level functionality + + Matcher m = PublicSuffixes.getTopmostAssignedSurtPrefixPattern() + .matcher(""); + + public void testBasics() { + matchPrefix("com,example,www,", "com,example,"); + matchPrefix("com,example,", "com,example,"); + matchPrefix("org,archive,www,", "org,archive,"); + matchPrefix("org,archive,", "org,archive,"); + matchPrefix("fr,yahoo,www,", "fr,yahoo,"); + matchPrefix("fr,yahoo,", "fr,yahoo,"); + matchPrefix("au,com,foobar,www,", "au,com,foobar,"); + matchPrefix("au,com,foobar,", "au,com,foobar,"); + matchPrefix("uk,co,virgin,www,", "uk,co,virgin,"); + matchPrefix("uk,co,virgin,", "uk,co,virgin,"); + matchPrefix("au,com,example,www,", "au,com,example,"); + matchPrefix("au,com,example,", "au,com,example,"); + matchPrefix("jp,yokohama,public,assigned,www,", + "jp,yokohama,public,assigned,"); + matchPrefix("jp,yokohama,public,assigned,", "jp,yokohama,public,assigned,"); + } + + public void testDomainWithDash() { + matchPrefix("de,bad-site,www", "de,bad-site,"); + } + + public void testDomainWithNumbers() { + matchPrefix("de,archive4u,www", "de,archive4u,"); + } + + public void testIPV4() { + assertEquals("unexpected reduction", + "1.2.3.4", + PublicSuffixes.reduceSurtToAssignmentLevel("1.2.3.4")); + } + + public void testIPV6() { + assertEquals("unexpected reduction", + "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]", + PublicSuffixes.reduceSurtToAssignmentLevel( + "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]")); + } + + public void testExceptions() { + matchPrefix("uk,bl,www,", "uk,bl,"); + matchPrefix("uk,bl,", "uk,bl,"); + matchPrefix("jp,tokyo,city,subdomain,", "jp,tokyo,city,"); + matchPrefix("jp,tokyo,city,", "jp,tokyo,city,"); + } + + public void testFakeTLD() { + // we assume any new/unknonwn TLD should be assumed as 2-level; + // this is preferable for our grouping purpose but might not be + // for a 
cookie-assigning browser (original purpose of publicsuffixlist) + matchPrefix("zzz,example,www,", "zzz,example,"); + } + + public void testUnsegmentedHostname() { + m.reset("example"); + assertFalse("unexpected match found in 'example'", m.find()); + } + + public void testTopmostAssignedCaching() { + assertSame("topmostAssignedSurtPrefixPattern not cached",PublicSuffixes.getTopmostAssignedSurtPrefixPattern(),PublicSuffixes.getTopmostAssignedSurtPrefixPattern()); + assertSame("topmostAssignedSurtPrefixRegex not cached",PublicSuffixes.getTopmostAssignedSurtPrefixRegex(),PublicSuffixes.getTopmostAssignedSurtPrefixRegex()); + } + + // TODO: test UTF domains? + + protected void matchPrefix(String surtDomain, String expectedAssignedPrefix) { + m.reset(surtDomain); + assertTrue("expected match not found in '" + surtDomain, m.find()); + assertEquals("expected match not found", expectedAssignedPrefix, m + .group()); + } +} From a54dd8eb11b13988a64fed9f0a1e94faf80dc03e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 5 Dec 2013 17:47:26 -0800 Subject: [PATCH 25/27] moving a bunch of stuff from heritrix-commons to ia-web-commons so that wayback doesn't have to depend on heritrix-commons --- pom.xml | 11 + .../org/archive/format/arc/ARCConstants.java | 211 ++++- .../ConfigurableX509TrustManager.java | 188 ++++ .../httpclient/HttpRecorderGetMethod.java | 120 +++ .../httpclient/HttpRecorderMethod.java | 107 +++ .../httpclient/HttpRecorderPostMethod.java | 82 ++ .../SingleHttpConnectionManager.java | 70 ++ .../java/org/archive/httpclient/package.html | 24 + .../org/archive/io/ArchiveFileConstants.java | 24 + .../java/org/archive/io/ArchiveReader.java | 761 ++++++++++++++++ .../org/archive/io/ArchiveReaderFactory.java | 301 +++++++ .../java/org/archive/io/ArchiveRecord.java | 409 +++++++++ .../org/archive/io/ArchiveRecordHeader.java | 111 +++ .../org/archive/io/ArraySeekInputStream.java | 106 +++ .../archive/io/BufferedSeekInputStream.java | 217 +++++ 
.../java/org/archive/io/CharSubSequence.java | 90 ++ .../archive/io/CompositeFileInputStream.java | 97 ++ .../org/archive/io/CompositeFileReader.java | 40 + src/main/java/org/archive/io/Endian.java | 125 +++ .../archive/io/GZIPMembersInputStream.java | 38 + .../org/archive/io/GenerationFileHandler.java | 200 +++++ .../archive/io/GenericReplayCharSequence.java | 412 +++++++++ src/main/java/org/archive/io/GzipHeader.java | 26 + .../org/archive/io/HeaderedArchiveRecord.java | 423 +++++++++ .../archive/io/LoudObjectOutputStream.java | 63 ++ .../org/archive/io/MiserOutputStream.java | 82 ++ .../org/archive/io/NoGzipMagicException.java | 26 + .../io/ObjectPlusFilesInputStream.java | 143 +++ .../io/ObjectPlusFilesOutputStream.java | 134 +++ .../org/archive/io/OriginSeekInputStream.java | 121 +++ .../java/org/archive/io/Preformatter.java | 32 + .../archive/io/RandomAccessInputStream.java | 180 ++++ .../archive/io/RandomAccessOutputStream.java | 69 ++ src/main/java/org/archive/io/ReadSource.java | 37 + .../org/archive/io/RecorderIOException.java | 38 + .../io/RecorderLengthExceededException.java | 39 + .../archive/io/RecorderTimeoutException.java | 37 + .../io/RecorderTooMuchHeaderException.java | 40 + .../org/archive/io/RecordingInputStream.java | 355 ++++++++ .../org/archive/io/RecordingOutputStream.java | 576 ++++++++++++ .../archive/io/RecoverableIOException.java | 83 ++ .../io/RecyclingFastBufferedOutputStream.java | 37 + .../org/archive/io/ReplayCharSequence.java | 77 ++ .../org/archive/io/ReplayInputStream.java | 325 +++++++ .../archive/io/RepositionableInputStream.java | 133 +++ .../org/archive/io/SafeSeekInputStream.java | 124 +++ .../java/org/archive/io/SeekInputStream.java | 81 ++ src/main/java/org/archive/io/SeekReader.java | 84 ++ .../archive/io/SeekReaderCharSequence.java | 56 ++ .../org/archive/io/SinkHandlerLogThread.java | 34 + src/main/java/org/archive/io/UTF8Bytes.java | 37 + src/main/java/org/archive/io/WriterPool.java | 343 +++++++ 
.../java/org/archive/io/WriterPoolMember.java | 487 ++++++++++ .../org/archive/io/WriterPoolSettings.java | 39 + .../java/org/archive/io/arc/ARC2WCDX.java | 243 +++++ .../java/org/archive/io/arc/ARCConstants.java | 29 + .../java/org/archive/io/arc/ARCLocation.java | 37 + .../java/org/archive/io/arc/ARCReader.java | 553 ++++++++++++ .../org/archive/io/arc/ARCReaderFactory.java | 454 ++++++++++ .../java/org/archive/io/arc/ARCRecord.java | 835 ++++++++++++++++++ .../org/archive/io/arc/ARCRecordMetaData.java | 267 ++++++ .../java/org/archive/io/arc/ARCUtils.java | 240 +++++ .../java/org/archive/io/arc/ARCWriter.java | 459 ++++++++++ .../org/archive/io/arc/ARCWriterPool.java | 69 ++ .../io/arc/WriterPoolSettingsData.java | 80 ++ src/main/java/org/archive/io/package.html | 9 + .../org/archive/io/warc/WARCConstants.java | 24 + .../java/org/archive/io/warc/WARCReader.java | 287 ++++++ .../archive/io/warc/WARCReaderFactory.java | 307 +++++++ .../java/org/archive/io/warc/WARCRecord.java | 233 +++++ .../org/archive/io/warc/WARCRecordInfo.java | 139 +++ .../java/org/archive/io/warc/WARCWriter.java | 436 +++++++++ .../org/archive/io/warc/WARCWriterPool.java | 64 ++ .../io/warc/WARCWriterPoolSettings.java | 32 + .../io/warc/WARCWriterPoolSettingsData.java | 40 + .../java/org/archive/io/warc/package.html | 38 + .../archive/net/DownloadURLConnection.java | 131 +++ .../java/org/archive/net/FTPException.java | 56 ++ .../java/org/archive/net/md5/Handler.java | 87 ++ .../org/archive/net/md5/Md5URLConnection.java | 34 + .../java/org/archive/net/rsync/Handler.java | 71 ++ .../archive/net/rsync/RsyncURLConnection.java | 51 ++ .../org/archive/uid/RecordIDGenerator.java | 72 ++ .../java/org/archive/uid/UUIDGenerator.java | 72 ++ src/main/java/org/archive/uid/package.html | 28 + src/main/java/org/archive/util/DevUtils.java | 116 +++ src/main/java/org/archive/util/FileUtils.java | 712 +++++++++++++++ .../org/archive/util/InetAddressUtil.java | 116 +++ 
.../archive/util/IterableLineIterator.java | 26 + .../java/org/archive/util/LaxHttpParser.java | 242 +++++ .../java/org/archive/util/MimetypeUtils.java | 75 ++ .../java/org/archive/util/ProcessUtils.java | 151 ++++ .../util/ProgressStatisticsReporter.java | 36 + .../java/org/archive/util/PropertyUtils.java | 114 +++ src/main/java/org/archive/util/Recorder.java | 593 +++++++++++++ src/main/java/org/archive/util/Reporter.java | 56 ++ .../org/archive/util/anvl/ANVLRecord.java | 336 +++++++ .../java/org/archive/util/anvl/Element.java | 73 ++ .../java/org/archive/util/anvl/Label.java | 41 + .../org/archive/util/anvl/SubElement.java | 78 ++ .../java/org/archive/util/anvl/Value.java | 71 ++ .../java/org/archive/util/anvl/package.html | 42 + 102 files changed, 16459 insertions(+), 1 deletion(-) create mode 100644 src/main/java/org/archive/httpclient/ConfigurableX509TrustManager.java create mode 100644 src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java create mode 100644 src/main/java/org/archive/httpclient/HttpRecorderMethod.java create mode 100644 src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java create mode 100644 src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java create mode 100644 src/main/java/org/archive/httpclient/package.html create mode 100644 src/main/java/org/archive/io/ArchiveFileConstants.java create mode 100644 src/main/java/org/archive/io/ArchiveReader.java create mode 100644 src/main/java/org/archive/io/ArchiveReaderFactory.java create mode 100644 src/main/java/org/archive/io/ArchiveRecord.java create mode 100644 src/main/java/org/archive/io/ArchiveRecordHeader.java create mode 100644 src/main/java/org/archive/io/ArraySeekInputStream.java create mode 100644 src/main/java/org/archive/io/BufferedSeekInputStream.java create mode 100644 src/main/java/org/archive/io/CharSubSequence.java create mode 100644 src/main/java/org/archive/io/CompositeFileInputStream.java create mode 100644 
src/main/java/org/archive/io/CompositeFileReader.java create mode 100644 src/main/java/org/archive/io/Endian.java create mode 100644 src/main/java/org/archive/io/GZIPMembersInputStream.java create mode 100644 src/main/java/org/archive/io/GenerationFileHandler.java create mode 100644 src/main/java/org/archive/io/GenericReplayCharSequence.java create mode 100644 src/main/java/org/archive/io/GzipHeader.java create mode 100644 src/main/java/org/archive/io/HeaderedArchiveRecord.java create mode 100644 src/main/java/org/archive/io/LoudObjectOutputStream.java create mode 100644 src/main/java/org/archive/io/MiserOutputStream.java create mode 100644 src/main/java/org/archive/io/NoGzipMagicException.java create mode 100644 src/main/java/org/archive/io/ObjectPlusFilesInputStream.java create mode 100644 src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java create mode 100644 src/main/java/org/archive/io/OriginSeekInputStream.java create mode 100644 src/main/java/org/archive/io/Preformatter.java create mode 100644 src/main/java/org/archive/io/RandomAccessInputStream.java create mode 100644 src/main/java/org/archive/io/RandomAccessOutputStream.java create mode 100644 src/main/java/org/archive/io/ReadSource.java create mode 100644 src/main/java/org/archive/io/RecorderIOException.java create mode 100644 src/main/java/org/archive/io/RecorderLengthExceededException.java create mode 100644 src/main/java/org/archive/io/RecorderTimeoutException.java create mode 100644 src/main/java/org/archive/io/RecorderTooMuchHeaderException.java create mode 100644 src/main/java/org/archive/io/RecordingInputStream.java create mode 100644 src/main/java/org/archive/io/RecordingOutputStream.java create mode 100644 src/main/java/org/archive/io/RecoverableIOException.java create mode 100644 src/main/java/org/archive/io/RecyclingFastBufferedOutputStream.java create mode 100644 src/main/java/org/archive/io/ReplayCharSequence.java create mode 100644 src/main/java/org/archive/io/ReplayInputStream.java 
create mode 100644 src/main/java/org/archive/io/RepositionableInputStream.java create mode 100644 src/main/java/org/archive/io/SafeSeekInputStream.java create mode 100644 src/main/java/org/archive/io/SeekInputStream.java create mode 100644 src/main/java/org/archive/io/SeekReader.java create mode 100644 src/main/java/org/archive/io/SeekReaderCharSequence.java create mode 100644 src/main/java/org/archive/io/SinkHandlerLogThread.java create mode 100644 src/main/java/org/archive/io/UTF8Bytes.java create mode 100644 src/main/java/org/archive/io/WriterPool.java create mode 100644 src/main/java/org/archive/io/WriterPoolMember.java create mode 100644 src/main/java/org/archive/io/WriterPoolSettings.java create mode 100644 src/main/java/org/archive/io/arc/ARC2WCDX.java create mode 100644 src/main/java/org/archive/io/arc/ARCConstants.java create mode 100644 src/main/java/org/archive/io/arc/ARCLocation.java create mode 100644 src/main/java/org/archive/io/arc/ARCReader.java create mode 100644 src/main/java/org/archive/io/arc/ARCReaderFactory.java create mode 100644 src/main/java/org/archive/io/arc/ARCRecord.java create mode 100644 src/main/java/org/archive/io/arc/ARCRecordMetaData.java create mode 100644 src/main/java/org/archive/io/arc/ARCUtils.java create mode 100644 src/main/java/org/archive/io/arc/ARCWriter.java create mode 100644 src/main/java/org/archive/io/arc/ARCWriterPool.java create mode 100644 src/main/java/org/archive/io/arc/WriterPoolSettingsData.java create mode 100644 src/main/java/org/archive/io/package.html create mode 100644 src/main/java/org/archive/io/warc/WARCConstants.java create mode 100644 src/main/java/org/archive/io/warc/WARCReader.java create mode 100644 src/main/java/org/archive/io/warc/WARCReaderFactory.java create mode 100644 src/main/java/org/archive/io/warc/WARCRecord.java create mode 100644 src/main/java/org/archive/io/warc/WARCRecordInfo.java create mode 100644 src/main/java/org/archive/io/warc/WARCWriter.java create mode 100644 
src/main/java/org/archive/io/warc/WARCWriterPool.java create mode 100644 src/main/java/org/archive/io/warc/WARCWriterPoolSettings.java create mode 100644 src/main/java/org/archive/io/warc/WARCWriterPoolSettingsData.java create mode 100644 src/main/java/org/archive/io/warc/package.html create mode 100644 src/main/java/org/archive/net/DownloadURLConnection.java create mode 100644 src/main/java/org/archive/net/FTPException.java create mode 100644 src/main/java/org/archive/net/md5/Handler.java create mode 100644 src/main/java/org/archive/net/md5/Md5URLConnection.java create mode 100644 src/main/java/org/archive/net/rsync/Handler.java create mode 100644 src/main/java/org/archive/net/rsync/RsyncURLConnection.java create mode 100644 src/main/java/org/archive/uid/RecordIDGenerator.java create mode 100644 src/main/java/org/archive/uid/UUIDGenerator.java create mode 100644 src/main/java/org/archive/uid/package.html create mode 100644 src/main/java/org/archive/util/DevUtils.java create mode 100644 src/main/java/org/archive/util/FileUtils.java create mode 100644 src/main/java/org/archive/util/InetAddressUtil.java create mode 100644 src/main/java/org/archive/util/IterableLineIterator.java create mode 100644 src/main/java/org/archive/util/LaxHttpParser.java create mode 100644 src/main/java/org/archive/util/MimetypeUtils.java create mode 100644 src/main/java/org/archive/util/ProcessUtils.java create mode 100644 src/main/java/org/archive/util/ProgressStatisticsReporter.java create mode 100644 src/main/java/org/archive/util/PropertyUtils.java create mode 100644 src/main/java/org/archive/util/Recorder.java create mode 100644 src/main/java/org/archive/util/Reporter.java create mode 100644 src/main/java/org/archive/util/anvl/ANVLRecord.java create mode 100644 src/main/java/org/archive/util/anvl/Element.java create mode 100644 src/main/java/org/archive/util/anvl/Label.java create mode 100644 src/main/java/org/archive/util/anvl/SubElement.java create mode 100644 
src/main/java/org/archive/util/anvl/Value.java create mode 100644 src/main/java/org/archive/util/anvl/package.html diff --git a/pom.xml b/pom.xml index 03b1240d..c714fe8c 100644 --- a/pom.xml +++ b/pom.xml @@ -124,6 +124,17 @@ httpcore 4.3 + + joda-time + joda-time + 1.6 + + + fastutil + fastutil + 5.0.7 + compile + diff --git a/src/main/java/org/archive/format/arc/ARCConstants.java b/src/main/java/org/archive/format/arc/ARCConstants.java index 6bfc5a99..a336ddeb 100755 --- a/src/main/java/org/archive/format/arc/ARCConstants.java +++ b/src/main/java/org/archive/format/arc/ARCConstants.java @@ -1,8 +1,20 @@ package org.archive.format.arc; import java.nio.charset.Charset; +import java.util.Arrays; +import java.util.List; +import java.util.zip.Deflater; +import java.util.zip.GZIPInputStream; -public interface ARCConstants { +import org.archive.format.ArchiveFileConstants; +import org.archive.util.zip.GzipHeader; + +/** + * Constants used by ARC files and in ARC file processing. + * + * @author stack + */ +public interface ARCConstants extends ArchiveFileConstants { public final static int MAX_META_LENGTH = 1024 * 32; public final static Charset ARC_META_CHARSET = Charset.forName("utf-8"); public final static int NEW_LINE_ORD = 10; @@ -25,4 +37,201 @@ public interface ARCConstants { public static final String FILEDESC_SCHEME = "filedesc:/"; public static final String DNS_MIME = "text/dns"; public static final String ALEXA_DAT_MIME = "alexa/dat"; + + /** + * Default maximum ARC file size. + */ + public static final long DEFAULT_MAX_ARC_FILE_SIZE = 100000000; + + /** + * Maximum length for a metadata line. + */ + public static final int MAX_METADATA_LINE_LENGTH = (4 * 1024); + + /** + * ARC file extention. + */ + public static final String ARC_FILE_EXTENSION = "arc"; + + /** + * Dot ARC file extension. + */ + public static final String DOT_ARC_FILE_EXTENSION = + "." 
+ ARC_FILE_EXTENSION; + + public static final String DOT_COMPRESSED_FILE_EXTENSION = + ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION; + + /** + * Compressed arc file extension. + */ + public static final String COMPRESSED_ARC_FILE_EXTENSION = + ARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION; + + /** + * Compressed dot arc file extension. + */ + public static final String DOT_COMPRESSED_ARC_FILE_EXTENSION = + DOT_ARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION; + + /** + * Encoding to use getting bytes from strings. + * + * Specify an encoding rather than leave it to chance: i.e whatever the + * JVMs encoding. Use an encoding that gets the stream as bytes, not chars. + */ + public static final String DEFAULT_ENCODING = "ISO-8859-1"; + + /** + * ARC file line seperator character. + * + * This is what the alexa c-code looks for delimiting lines. + */ + public static final char LINE_SEPARATOR = '\n'; + + /** + * ARC header field seperator character. + */ + public static final char HEADER_FIELD_SEPARATOR = ' '; + + /** + * ARC file *MAGIC NUMBER*. + * + * Every ARC file must begin w/ this. + */ + public static final String ARC_MAGIC_NUMBER = "filedesc://"; + + /** + * The FLG.FEXTRA field that is added to ARC files. (See RFC1952 to + * understand FLG.FEXTRA). + */ + public static final byte[] ARC_GZIP_EXTRA_FIELD = { 8, 0, 'L', 'X', 4, 0, + 0, 0, 0, 0 }; + + /** + * Key for the ARC Header IP field. + * + * Lowercased. + */ + public static final String IP_HEADER_FIELD_KEY = "ip-address"; + + /** + * Key for the ARC Header Result Code field. + * + * Lowercased. + */ + public static final String CODE_HEADER_FIELD_KEY = "result-code"; + + /** + * Key for the ARC Header Checksum field. + * + * Lowercased. + */ + public static final String CHECKSUM_HEADER_FIELD_KEY = "checksum"; + + /** + * Key for the ARC Header Location field. + * + * Lowercased. 
+ */ + public static final String LOCATION_HEADER_FIELD_KEY = "location"; + + /** + * Key for the ARC Header Offset field. + * + * Lowercased. + */ + public static final String OFFSET_HEADER_FIELD_KEY = "offset"; + + /** + * Key for the ARC Header filename field. + * + * Lowercased. + */ + public static final String FILENAME_HEADER_FIELD_KEY = "filename"; + + /** + * Key for statuscode field. + */ + public static final String STATUSCODE_FIELD_KEY = "statuscode"; + + /** + * Key for offset field. + */ + public static final String OFFSET_FIELD_KEY = OFFSET_HEADER_FIELD_KEY; + + /** + * Key for filename field. + */ + public static final String FILENAME_FIELD_KEY = FILENAME_HEADER_FIELD_KEY; + + /** + * Key for checksum field. + */ + public static final String CHECKSUM_FIELD_KEY = CHECKSUM_HEADER_FIELD_KEY; + + /** + * Tokenized field prefix. + * + * Use this prefix for tokenized fields when naming fields in + * an index. + */ + public static final String TOKENIZED_PREFIX = "tokenized_"; + + /** + * Assumed maximum size of a record meta header line. + * + * This 100k which seems massive but its the same as the LINE_LENGTH from + * alexa/include/a_arcio.h: + *
+     * #define LINE_LENGTH     (100*1024)
+     * 
+ */ + public static final int MAX_HEADER_LINE_LENGTH = 1024 * 100; + + /** + * Version 1 required metadata fields. + */ + public static List REQUIRED_VERSION_1_HEADER_FIELDS = Arrays + .asList(new String[] { URL_FIELD_KEY, IP_HEADER_FIELD_KEY, + DATE_FIELD_KEY, MIMETYPE_FIELD_KEY, + LENGTH_FIELD_KEY, VERSION_FIELD_KEY, + ABSOLUTE_OFFSET_KEY }); + + /** + * Minimum possible record length. + * + * This is a rough calc. When the header is data it will occupy less space. + */ + public static int MINIMUM_RECORD_LENGTH = 1 + "://".length() + 1 + + ARC_FILE_EXTENSION.length() + " ".length() + +1 + " ".length() + + 1 + " ".length() + 1 + "/".length() + 1 + " ".length() + 1; + + /** + * Start of a GZIP header that uses default deflater. + */ + public static final byte[] GZIP_HEADER_BEGIN = { + (byte) GZIPInputStream.GZIP_MAGIC, // Magic number (short) + (byte) (GZIPInputStream.GZIP_MAGIC >> 8), // Magic number (short) + Deflater.DEFLATED // Compression method (CM) + }; + + /** + * Length of minimual 'default GZIP header. + * + * See RFC1952 for explaination of value of 10. + */ + public static final int DEFAULT_GZIP_HEADER_LENGTH = + GzipHeader.MINIMAL_GZIP_HEADER_LENGTH; + + /** + * set of known errors encountered reading ARCs + */ + public enum ArcRecordErrors { + HTTP_HEADER_TRUNCATED, + HTTP_STATUS_LINE_INVALID, + HTTP_STATUS_LINE_EXCEPTION, + } + + } diff --git a/src/main/java/org/archive/httpclient/ConfigurableX509TrustManager.java b/src/main/java/org/archive/httpclient/ConfigurableX509TrustManager.java new file mode 100644 index 00000000..45a89ba6 --- /dev/null +++ b/src/main/java/org/archive/httpclient/ConfigurableX509TrustManager.java @@ -0,0 +1,188 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. 
+ * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.httpclient; + +import java.security.KeyStore; +import java.security.KeyStoreException; +import java.security.NoSuchAlgorithmException; +import java.security.cert.CertificateException; +import java.security.cert.X509Certificate; +import java.util.logging.Logger; + +import javax.net.ssl.TrustManager; +import javax.net.ssl.TrustManagerFactory; +import javax.net.ssl.X509TrustManager; + +/** + * A configurable trust manager built on X509TrustManager. + * + * If set to 'open' trust, the default, will get us into sites for whom we do + * not have the CA or any of intermediary CAs that go to make up the cert chain + * of trust. Will also get us past selfsigned and expired certs. 'loose' + * trust will get us into sites w/ valid certs even if they are just + * selfsigned. 'normal' is any valid cert not including selfsigned. 'strict' + * means cert must be valid and the cert DN must match server name. + * + *

Based on pointers in + * SSL + * Guide, + * and readings done in JSSE + * Guide. + * + *

TODO: Move to an ssl subpackage when we have other classes other than + * just this one. + * + * @author stack + * @version $Id$ + */ +public class ConfigurableX509TrustManager implements X509TrustManager +{ + /** + * Logging instance. + */ + protected static Logger logger = Logger.getLogger( + "org.archive.httpclient.ConfigurableX509TrustManager"); + + public static enum TrustLevel { + /** + * Trust anything given us. + * + * Default setting. + * + *

See + * e502. Disabling Certificate Validation in an HTTPS Connection from + * the java almanac for how to trust all. + */ + OPEN, + + /** + * Trust any valid cert including self-signed certificates. + */ + LOOSE, + + /** + * Normal jsse behavior. + * + * Seemingly any certificate that supplies valid chain of trust. + */ + NORMAL, + + /** + * Strict trust. + * + * Ensure server has same name as cert DN. + */ + STRICT, + } + + /** + * Default setting for trust level. + */ + public final static TrustLevel DEFAULT = TrustLevel.OPEN; + + /** + * Trust level. + */ + private TrustLevel trustLevel = DEFAULT; + + + /** + * An instance of the SUNX509TrustManager that we adapt variously + * depending upon passed configuration. + * + * We have it do all the work we don't want to. + */ + private X509TrustManager standardTrustManager = null; + + + public ConfigurableX509TrustManager() + throws NoSuchAlgorithmException, KeyStoreException { + this(DEFAULT); + } + + /** + * Constructor. + * + * @param level Level of trust to effect. + * + * @throws NoSuchAlgorithmException + * @throws KeyStoreException + */ + public ConfigurableX509TrustManager(TrustLevel level) + throws NoSuchAlgorithmException, KeyStoreException { + super(); + TrustManagerFactory factory = TrustManagerFactory. + getInstance(TrustManagerFactory.getDefaultAlgorithm()); + + // Pass in a null (Trust) KeyStore. Null says use the 'default' + // 'trust' keystore (KeyStore class is used to hold keys and to hold + // 'trusts' (certs)). See 'X509TrustManager Interface' in this doc: + // http://java.sun.com + // /j2se/1.4.2/docs/guide/security/jsse/JSSERefGuide.html#Introduction + factory.init((KeyStore)null); + TrustManager[] trustmanagers = factory.getTrustManagers(); + if (trustmanagers.length == 0) { + throw new NoSuchAlgorithmException(TrustManagerFactory. 
+ getDefaultAlgorithm() + " trust manager not supported"); + } + this.standardTrustManager = (X509TrustManager)trustmanagers[0]; + + this.trustLevel = level; + } + + public void checkClientTrusted(X509Certificate[] certificates, String type) + throws CertificateException { + if (this.trustLevel.equals(TrustLevel.OPEN)) { + return; + } + + this.standardTrustManager.checkClientTrusted(certificates, type); + } + + public void checkServerTrusted(X509Certificate[] certificates, String type) + throws CertificateException { + if (this.trustLevel.equals(TrustLevel.OPEN)) { + return; + } + + try { + this.standardTrustManager.checkServerTrusted(certificates, type); + if (this.trustLevel.equals(TrustLevel.STRICT)) { + logger.severe(TrustLevel.STRICT + " not implemented."); + } + } catch (CertificateException e) { + if (this.trustLevel.equals(TrustLevel.LOOSE) && + certificates != null && certificates.length == 1) + { + // If only one cert and its valid and it caused a + // CertificateException, assume its selfsigned. + X509Certificate certificate = certificates[0]; + certificate.checkValidity(); + } else { + // If we got to here, then we're probably NORMAL. Rethrow. + throw e; + } + } + } + + public X509Certificate[] getAcceptedIssuers() { + return this.standardTrustManager.getAcceptedIssuers(); + } +} diff --git a/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java new file mode 100644 index 00000000..105c4f7e --- /dev/null +++ b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java @@ -0,0 +1,120 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.httpclient; + +import java.io.IOException; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.HttpConnection; +import org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.HttpState; +import org.apache.commons.httpclient.methods.GetMethod; +import org.archive.util.Recorder; + + +/** + * Override of GetMethod that marks the passed HttpRecorder w/ the transition + * from HTTP head to body and that forces a close on the http connection. + * + * The actions done in this subclass used to be done by copying + * org.apache.commons.HttpMethodBase, overlaying our version in place of the + * one that came w/ httpclient. Here is the patch of the difference between + * shipped httpclient code and our mods: + *

+ *    -- -1338,6 +1346,12 --
+ *
+ *        public void releaseConnection() {
+ *
+ *   +        // HERITRIX always wants the streams closed.
+ *   +        if (responseConnection != null)
+ *   +        {
+ *   +            responseConnection.close();
+ *   +        }
+ *   +
+ *            if (responseStream != null) {
+ *                try {
+ *                    // FYI - this may indirectly invoke responseBodyConsumed.
+ *   -- -1959,6 +1973,11 --
+ *                        this.statusLine = null;
+ *                    }
+ *                }
+ *   +            // HERITRIX mark transition from header to content.
+ *   +            if (this.httpRecorder != null)
+ *   +            {
+ *   +                this.httpRecorder.markContentBegin();
+ *   +            }
+ *                readResponseBody(state, conn);
+ *                processResponseBody(state, conn);
+ *            } catch (IOException e) {
+ * 
+ * + *

We're not supposed to have access to the underlying connection object; + * am only violating contract because see cases where httpclient is skipping + * out w/o cleaning up after itself. + * + * @author stack + * @version $Revision$, $Date$ + */ +public class HttpRecorderGetMethod extends GetMethod { + + protected static Logger logger = + Logger.getLogger(HttpRecorderGetMethod.class.getName()); + + /** + * Instance of http recorder method. + */ + protected HttpRecorderMethod httpRecorderMethod = null; + + + public HttpRecorderGetMethod(String uri, Recorder recorder) { + super(uri); + this.httpRecorderMethod = new HttpRecorderMethod(recorder); + } + + protected void readResponseBody(HttpState state, HttpConnection connection) + throws IOException, HttpException { + // We're about to read the body. Mark transition in http recorder. + this.httpRecorderMethod.markContentBegin(connection); + super.readResponseBody(state, connection); + } + + protected boolean shouldCloseConnection(HttpConnection conn) { + // Always close connection after each request. As best I can tell, this + // is superfluous -- we've set our client to be HTTP/1.0. Doing this + // out of paranoia. + return true; + } + + public int execute(HttpState state, HttpConnection conn) + throws HttpException, IOException { + // Save off the connection so we can close it on our way out in case + // httpclient fails to (We're not supposed to have access to the + // underlying connection object; am only violating contract because + // see cases where httpclient is skipping out w/o cleaning up + // after itself). 
+ this.httpRecorderMethod.setConnection(conn); + return super.execute(state, conn); + } + + protected void addProxyConnectionHeader(HttpState state, HttpConnection conn) + throws IOException, HttpException { + super.addProxyConnectionHeader(state, conn); + this.httpRecorderMethod.handleAddProxyConnectionHeader(this); + } +} diff --git a/src/main/java/org/archive/httpclient/HttpRecorderMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java new file mode 100644 index 00000000..932e7e98 --- /dev/null +++ b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java @@ -0,0 +1,107 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.httpclient; + +import java.util.logging.Logger; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpConnection; +import org.apache.commons.httpclient.HttpMethod; +import org.archive.util.Recorder; + + +/** + * This class encapsulates the specializations supplied by the + * overrides {@link HttpRecorderGetMethod} and {@link HttpRecorderPostMethod}. + * + * It keeps instance of HttpRecorder and HttpConnection. 
+ * + * @author stack + * @version $Revision$, $Date$ + */ +public class HttpRecorderMethod { + protected static Logger logger = + Logger.getLogger(HttpRecorderMethod.class.getName()); + + /** + * Instance of http recorder we're using recording this http get. + */ + private Recorder httpRecorder = null; + + /** + * Save around so can force close. + * + * See [ 922080 ] IllegalArgumentException (size is wrong). + * https://sourceforge.net/tracker/?func=detail&aid=922080&group_id=73833&atid=539099 + */ + private HttpConnection connection = null; + + + public HttpRecorderMethod(Recorder recorder) { + this.httpRecorder = recorder; + } + + public void markContentBegin(HttpConnection c) { + if (c != this.connection) { + // We're checking that we're not being asked to work on + // a connection that is other than the one we started + // this method#execute with. + throw new IllegalArgumentException("Connections differ: " + + this.connection + " " + c + " " + + Thread.currentThread().getName()); + } + this.httpRecorder.markContentBegin(); + } + + /** + * @return Returns the connection. + */ + public HttpConnection getConnection() { + return this.connection; + } + + /** + * @param connection The connection to set. + */ + public void setConnection(HttpConnection connection) { + this.connection = connection; + } + /** + * @return Returns the httpRecorder. + */ + public Recorder getHttpRecorder() { + return httpRecorder; + } + + /** + * If a 'Proxy-Connection' header has been added to the request, + * it'll be of a 'keep-alive' type. Until we support 'keep-alives', + * override the Proxy-Connection setting and instead pass a 'close' + * (Otherwise every request has to timeout before we notice + * end-of-document). + * @param method Method to find proxy-connection header in. 
+ */ + public void handleAddProxyConnectionHeader(HttpMethod method) { + Header h = method.getRequestHeader("Proxy-Connection"); + if (h != null) { + h.setValue("close"); + method.setRequestHeader(h); + } + } +} diff --git a/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java new file mode 100644 index 00000000..20f1bfd1 --- /dev/null +++ b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java @@ -0,0 +1,82 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.httpclient; + +import java.io.IOException; + +import org.apache.commons.httpclient.HttpConnection; +import org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.HttpState; +import org.apache.commons.httpclient.methods.PostMethod; +import org.archive.util.Recorder; + + +/** + * Override of PostMethod that marks the passed HttpRecorder w/ the transition + * from HTTP head to body and that forces a close on the responseConnection. + * + * This is a copy of {@link HttpRecorderGetMethod}. Only difference is the + * parent subclass. 
+ * + * @author stack + * @version $Date$ $Revision$ + */ +public class HttpRecorderPostMethod extends PostMethod { + /** + * Instance of http recorder method. + */ + protected HttpRecorderMethod httpRecorderMethod = null; + + + public HttpRecorderPostMethod(String uri, Recorder recorder) { + super(uri); + this.httpRecorderMethod = new HttpRecorderMethod(recorder); + } + + protected void readResponseBody(HttpState state, HttpConnection connection) + throws IOException, HttpException { + // We're about to read the body. Mark transition in http recorder. + this.httpRecorderMethod.markContentBegin(connection); + super.readResponseBody(state, connection); + } + + protected boolean shouldCloseConnection(HttpConnection conn) { + // Always close connection after each request. As best I can tell, this + // is superfluous -- we've set our client to be HTTP/1.0. Doing this + // out of paranoia. + return true; + } + + public int execute(HttpState state, HttpConnection conn) + throws HttpException, IOException { + // Save off the connection so we can close it on our way out in case + // httpclient fails to (We're not supposed to have access to the + // underlying connection object; am only violating contract because + // see cases where httpclient is skipping out w/o cleaning up + // after itself). 
+ this.httpRecorderMethod.setConnection(conn); + return super.execute(state, conn); + } + + protected void addProxyConnectionHeader(HttpState state, HttpConnection conn) + throws IOException, HttpException { + super.addProxyConnectionHeader(state, conn); + this.httpRecorderMethod.handleAddProxyConnectionHeader(this); + } +} diff --git a/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java new file mode 100644 index 00000000..4ba6a837 --- /dev/null +++ b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java @@ -0,0 +1,70 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.httpclient; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.commons.httpclient.HostConfiguration; +import org.apache.commons.httpclient.HttpConnection; +import org.apache.commons.httpclient.SimpleHttpConnectionManager; + +/** + * An HttpClient-compatible HttpConnection "manager" that actually + * just gives out a new connection each time -- skipping the overhead + * of connection management, since we already throttle our crawler + * with external mechanisms. 
+ * + * @author gojomo + */ +public class SingleHttpConnectionManager extends SimpleHttpConnectionManager { + + public SingleHttpConnectionManager() { + super(); + } + + public HttpConnection getConnectionWithTimeout( + HostConfiguration hostConfiguration, long timeout) { + + HttpConnection conn = new HttpConnection(hostConfiguration); + conn.setHttpConnectionManager(this); + conn.getParams().setDefaults(this.getParams()); + return conn; + } + + public void releaseConnection(HttpConnection conn) { + // ensure connection is closed + conn.close(); + finishLast(conn); + } + + protected static void finishLast(HttpConnection conn) { + // copied from superclass because it wasn't made available to subclasses + InputStream lastResponse = conn.getLastResponseInputStream(); + if (lastResponse != null) { + conn.setLastResponseInputStream(null); + try { + lastResponse.close(); + } catch (IOException ioe) { + //FIXME: badness - close to force reconnect. + conn.close(); + } + } + } +} diff --git a/src/main/java/org/archive/httpclient/package.html b/src/main/java/org/archive/httpclient/package.html new file mode 100644 index 00000000..87ae77ed --- /dev/null +++ b/src/main/java/org/archive/httpclient/package.html @@ -0,0 +1,24 @@ + + + +org.archive.httpclient package + +Provides specializations on + apache jakarta + commons httpclient. + +

HttpRecorderGetMethod

+

Class that marks the passed HttpRecorder w/ the boundary between + HTTP header and content. Also forces a close on the response on + call to releaseConnection.

+ +

ConfigurableTrustManagerProtocolSocketFactory

+

A protocol socket factory that allows setting of trust level on + construction.

+ +

References

+

JavaTM Secure Socket Extension (JSSE): Reference Guide

+ + + diff --git a/src/main/java/org/archive/io/ArchiveFileConstants.java b/src/main/java/org/archive/io/ArchiveFileConstants.java new file mode 100644 index 00000000..b1a39194 --- /dev/null +++ b/src/main/java/org/archive/io/ArchiveFileConstants.java @@ -0,0 +1,24 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +@Deprecated +public interface ArchiveFileConstants extends org.archive.format.ArchiveFileConstants { +} diff --git a/src/main/java/org/archive/io/ArchiveReader.java b/src/main/java/org/archive/io/ArchiveReader.java new file mode 100644 index 00000000..66056d33 --- /dev/null +++ b/src/main/java/org/archive/io/ArchiveReader.java @@ -0,0 +1,761 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + + +import java.io.BufferedInputStream; +import java.io.BufferedWriter; +import java.io.Closeable; +import java.io.EOFException; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.archive.util.MimetypeUtils; +import org.archive.util.zip.GZIPMembersInputStream; + +import com.google.common.io.CountingInputStream; + + +/** + * Reader for an Archive file of Archive {@link ArchiveRecord}s. + * @author stack + * @version $Date$ $Version$ + */ +public abstract class ArchiveReader implements ArchiveFileConstants, Iterable, Closeable { + /** + * Is this Archive file compressed? + */ + private boolean compressed = false; + + /** + * Should we digest as we read? + */ + private boolean digest = true; + + /** + * Should the parse be strict? + */ + private boolean strict = false; + + /** + * Archive file input stream. + * + * Keep it around so we can close it when done. + * + *

Set in constructor. Should support at least 1 byte mark/reset. + * Make it protected so subclasses have access. + */ + protected InputStream in = null; + + /** + * Maximum amount of recoverable exceptions in a row. + * If more than this amount in a row, we'll let out the exception rather + * than go back in for yet another retry. + */ + public static final int MAX_ALLOWED_RECOVERABLES = 10; + + + /** + * The Record currently being read. + * + * Keep this ongoing reference so we'll close the record even if the caller + * doesn't. + */ + private ArchiveRecord currentRecord = null; + + /** + * Descriptive string for the Archive file we're going against: + * full path, url, etc. -- depends on context in which file was made. + */ + private String identifier = null; + + /** + * Archive file version. + */ + private String version = null; + + + protected ArchiveReader() { + super(); + } + + /** + * Convenience method used by subclass constructors. + * @param i Identifier for Archive file this reader goes against. + */ + protected void initialize(final String i) { + setReaderIdentifier(i); + } + + /** + * Convenience method for constructors. + * + * @param f File to read. + * @param offset Offset at which to start reading. + * @return InputStream to read from. + * @throws IOException If failed open or fail to get a memory + * mapped byte buffer on file. + */ + protected InputStream getInputStream(final File f, final long offset) + throws IOException { + FileInputStream fin = new FileInputStream(f); + return new BufferedInputStream(fin); + } + + public boolean isCompressed() { + return this.compressed; + } + + /** + * Get record at passed offset. + * + * @param offset Byte index into file at which a record starts. + * @return An Archive Record reference. 
+ * @throws IOException + */ + public ArchiveRecord get(long offset) throws IOException { + cleanupCurrentRecord(); + long posn = positionForRecord(in); + if(offset>=posn) { + in.skip(offset-posn); + } else { + throw new UnsupportedOperationException("no reverse seeking: at "+posn+" requested "+offset); + } + return createArchiveRecord(this.in, offset); + } + + /** + * @return Return Archive Record created against current offset. + * @throws IOException + */ + public ArchiveRecord get() throws IOException { + return createArchiveRecord(this.in, positionForRecord(in)); + } + + public void close() throws IOException { + if (this.in != null) { + this.in.close(); + this.in = null; + } + } + + /** + * Cleanout the current record if there is one. + * @throws IOException + */ + protected void cleanupCurrentRecord() throws IOException { + if (this.currentRecord != null) { + this.currentRecord.close(); + gotoEOR(this.currentRecord); + this.currentRecord = null; + } + } + + /** + * Return an Archive Record homed on offset into + * is. + * @param is Stream to read Record from. + * @param offset Offset to find Record at. + * @return ArchiveRecord instance. + * @throws IOException + */ + protected abstract ArchiveRecord createArchiveRecord(InputStream is, + long offset) + throws IOException; + + /** + * Skip over any trailing new lines at end of the record so we're lined up + * ready to read the next. + * @param record + * @throws IOException + */ + protected abstract void gotoEOR(ArchiveRecord record) throws IOException; + + public abstract String getFileExtension(); + public abstract String getDotFileExtension(); + + /** + * @return Version of this Archive file. + */ + public String getVersion() { + return this.version; + } + + /** + * Validate the Archive file. + * + * This method iterates over the file throwing exception if it fails + * to successfully parse any record. + * + *

Assumes the stream is at the start of the file. + * @return List of all read Archive Headers. + * + * @throws IOException + */ + public List validate() throws IOException { + return validate(-1); + } + + /** + * Validate the Archive file. + * + * This method iterates over the file throwing exception if it fails + * to successfully parse. + * + *

We start validation from wherever we are in the stream. + * + * @param numRecords Number of records expected. Pass -1 if number is + * unknown. + * + * @return List of all read metadatas. As we validate records, we add + * a reference to the read metadata. + * + * @throws IOException + */ + public List validate(int numRecords) + throws IOException { + List hdrList = new ArrayList(); + int recordCount = 0; + setStrict(true); + for (Iterator i = iterator(); i.hasNext();) { + recordCount++; + ArchiveRecord r = i.next(); + if (r.getHeader().getLength() <= 0 + && r.getHeader().getMimetype(). + equals(MimetypeUtils.NO_TYPE_MIMETYPE)) { + throw new IOException("record content is empty."); + } + r.close(); + hdrList.add(r.getHeader()); + } + + if (numRecords != -1) { + if (recordCount != numRecords) { + throw new IOException("Count of records, " + + Integer.toString(recordCount) + + " is not equal to expected " + + Integer.toString(numRecords)); + } + } + + return hdrList; + } + + /** + * Test Archive file is valid. + * Assumes the stream is at the start of the file. Be aware that this + * method makes a pass over the whole file. + * @return True if file can be successfully parsed. + */ + public boolean isValid() { + boolean valid = false; + try { + validate(); + valid = true; + } catch(Exception e) { + // File is not valid if exception thrown parsing. + valid = false; + } + + return valid; + } + + /** + * @return Returns the strict. + */ + public boolean isStrict() { + return this.strict; + } + + /** + * @param s The strict to set. + */ + public void setStrict(boolean s) { + this.strict = s; + } + + /** + * @param d True if we're to digest. + */ + public void setDigest(boolean d) { + this.digest = d; + } + + /** + * @return True if we're digesting as we read. + */ + public boolean isDigest() { + return this.digest; + } + + protected Logger getLogger() { + return Logger.getLogger(this.getClass().getName()); + } + + /** + * Returns an ArchiveRecord iterator. 
+ * Of note, on IOException, especially if ZipException reading compressed + * ARCs, rather than fail the iteration, try moving to the next record. + * If {@link ArchiveReader#strict} is not set, this will usually succeed. + * @return An iterator over ARC records. + */ + public Iterator iterator() { + // Eat up any record outstanding. + try { + cleanupCurrentRecord(); + } catch (IOException e) { + throw new RuntimeException(e); + } + + return new ArchiveRecordIterator(); + } + + protected void setCompressed(boolean compressed) { + this.compressed = compressed; + } + + /** + * @return The current ARC record or null if none. + * After construction has the arcfile header record. + * @see #get() + */ + protected ArchiveRecord getCurrentRecord() { + return this.currentRecord; + } + + protected ArchiveRecord currentRecord(final ArchiveRecord r) { + this.currentRecord = r; + return r; + } + + protected InputStream getIn() { + return in; + } + + protected void setIn(InputStream in) { + this.in = in; + } + + protected void setVersion(String version) { + this.version = version; + } + + public String getReaderIdentifier() { + return this.identifier; + } + + protected void setReaderIdentifier(final String i) { + this.identifier = i; + } + + /** + * Log on stderr. + * Logging should go via the logging system. This method + * bypasses the logging system going direct to stderr. + * Should not generally be used. Its used for rare messages + * that come of cmdline usage of ARCReader ERRORs and WARNINGs. + * Override if using ARCReader in a context where no stderr or + * where you'd like to redirect stderr to other than System.err. + * @param level Level to log message at. + * @param message Message to log. + */ + public void logStdErr(Level level, String message) { + System.err.println(level.toString() + " " + message); + } + +// /** +// * Add buffering to RandomAccessInputStream. 
+// */ +// protected class RandomAccessBufferedInputStream +// extends BufferedInputStream implements RepositionableStream { +// +// public RandomAccessBufferedInputStream(RandomAccessInputStream is) +// throws IOException { +// super(is); +// } +// +// public RandomAccessBufferedInputStream(RandomAccessInputStream is, int size) +// throws IOException { +// super(is, size); +// } +// +// public long position() throws IOException { +// // Current position is the underlying files position +// // minus the amount thats in the buffer yet to be read. +// return ((RandomAccessInputStream)this.in).position() - +// (this.count - this.pos); +// } +// +// public void position(long position) throws IOException { +// // Force refill of buffer whenever there's been a seek. +// this.pos = 0; +// this.count = 0; +// ((RandomAccessInputStream)this.in).position(position); +// } +// +// public int available() throws IOException { +// // Avoid overflow on large datastreams +// long amount = (long)in.available() + (long)(count - pos); +// return (amount >= Integer.MAX_VALUE)? Integer.MAX_VALUE: (int)amount; +// } +// } + + /** + * Inner ArchiveRecord Iterator class. + * Throws RuntimeExceptions in {@link #hasNext()} and {@link #next()} if + * trouble pulling record from underlying stream. + * @author stack + */ + protected class ArchiveRecordIterator implements Iterator { + private final Logger logger = + Logger.getLogger(this.getClass().getName()); + /** + * @return True if we have more records to read. + * @exception RuntimeException Can throw an IOException wrapped in a + * RuntimeException if a problem reading underlying stream (Corrupted + * gzip, etc.). + */ + public boolean hasNext() { + // Call close on any extant record. This will scoot us past + // any content not yet read. 
+ try { + cleanupCurrentRecord(); + } catch (IOException e) { + if (isStrict()) { + throw new RuntimeException(e); + } + if (e instanceof EOFException) { + logger.warning("Premature EOF cleaning up " + + currentRecord.getHeader().toString() + ": " + + e.getMessage()); + return false; + } + // If not strict, try going again. We might be able to skip + // over the bad record. + logger.log(Level.WARNING,"Trying skip of failed record cleanup of " + + currentRecord.getHeader().toString() + ": " + + e.getMessage(), e); + } + return innerHasNext(); + } + + protected boolean innerHasNext(){ + try { + getIn().mark(1); + int c = getIn().read(); + getIn().reset(); + return c > -1; + } catch (IOException e) { + logger.log(Level.WARNING,"problem probing for more content",e); + return false; + } + } + + /** + * Tries to move to next record if we get + * {@link RecoverableIOException}. If not strict + * tries to move to next record if we get an + * {@link IOException}. + * @return Next object. + * @exception RuntimeException Throws a runtime exception, + * usually a wrapping of an IOException, if trouble getting + * a record (Throws exception rather than return null). + */ + public ArchiveRecord next() { + long offset = -1; + try { + offset = positionForRecord(getIn()); + return exceptionNext(); + } catch (IOException e) { + if (!isStrict()) { + // Retry though an IOE. Maybe we will succeed reading + // subsequent record. + try { + if (hasNext()) { + getLogger().warning("Bad Record. Trying skip " + + "(Record start " + offset + "): " + + e.getMessage()); + return exceptionNext(); + } + // Else we are at last record. Iterator#next is + // expecting value. We do not have one. Throw exception. 
+ throw new RuntimeException("Retried but no next " + + "record (Record start " + offset + ")", e); + } catch (IOException e1) { + throw new RuntimeException("After retry (Offset " + + offset + ")", e1); + } + } + throw new RuntimeException("(Record start " + offset + ")", e); + } + } + + /** + * A next that throws exceptions and has handling of + * recoverable exceptions moving us to next record. Can call + * hasNext which itself may throw exceptions. + * @return Next record. + * @throws IOException + * @throws RuntimeException Thrown when we've reached maximum + * retries. + */ + protected ArchiveRecord exceptionNext() + throws IOException, RuntimeException { + ArchiveRecord result = null; + IOException ioe = null; + for (int i = MAX_ALLOWED_RECOVERABLES; i > 0 && + result == null; i--) { + ioe = null; + try { + result = innerNext(); + } catch (RecoverableIOException e) { + ioe = e; + getLogger().warning(e.getMessage()); + if (hasNext()) { + continue; + } + // No records left. Throw exception rather than + // return null. The caller is expecting to get + // back a record since they've just called + // hasNext. + break; + } + } + if (ioe != null) { + // Then we did MAX_ALLOWED_RECOVERABLES retries. Throw + // the recoverable ioe wrapped in a RuntimeException so + // it goes out pass checks for IOE. + throw new RuntimeException("Retried " + + MAX_ALLOWED_RECOVERABLES + " times in a row", ioe); + } + return result; + } + + protected ArchiveRecord innerNext() throws IOException { + return get(positionForRecord(getIn())); + } + + public void remove() { + throw new UnsupportedOperationException(); + } + } + + protected static long positionForRecord(InputStream in) { + return (in instanceof GZIPMembersInputStream) + ? ((GZIPMembersInputStream)in).getCurrentMemberStart() + : ((CountingInputStream)in).getCount(); + } + + protected static String stripExtension(final String name, + final String ext) { + return (!name.endsWith(ext))? 
name: + name.substring(0, name.length() - ext.length()); + } + + /** + * @return short name of Archive file. + */ + public String getFileName() { + return (new File(getReaderIdentifier())).getName(); + } + + /** + * @return short name of Archive file. + */ + public String getStrippedFileName() { + return getStrippedFileName(getFileName(), + getDotFileExtension()); + } + + /** + * @param name Name of ARCFile. + * @param dotFileExtension '.arc' or '.warc', etc. + * @return short name of Archive file. + */ + public static String getStrippedFileName(String name, + final String dotFileExtension) { + name = stripExtension(name, + ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION); + return stripExtension(name, dotFileExtension); + } + + /** + * @param value Value to test. + * @return True if value is 'true', else false. + */ + protected static boolean getTrueOrFalse(final String value) { + if (value == null || value.length() <= 0) { + return false; + } + return Boolean.TRUE.toString().equals(value.toLowerCase()); + } + + /** + * @param format Format to use outputting. + * @throws IOException + * @throws java.text.ParseException + * @return True if handled. + */ + protected boolean output(final String format) + throws IOException, java.text.ParseException { + boolean result = true; + // long start = System.currentTimeMillis(); + + // Write output as pseudo-CDX file. See + // http://www.archive.org/web/researcher/cdx_legend.php + // and http://www.archive.org/web/researcher/example_cdx.php. + // Hash is hard-coded straight SHA-1 hash of content. + if (format.equals(DUMP)) { + // No point digesting dumping. + setDigest(false); + dump(false); + } else if (format.equals(GZIP_DUMP)) { + // No point digesting dumping. 
+ setDigest(false); + dump(true); + } else if (format.equals(CDX)) { + cdxOutput(false); + } else if (format.equals(CDX_FILE)) { + cdxOutput(true); + } else { + result = false; + } + return result; + } + + protected void cdxOutput(boolean toFile) + throws IOException { + BufferedWriter cdxWriter = null; + if (toFile) { + String cdxFilename = stripExtension(getReaderIdentifier(), + DOT_COMPRESSED_FILE_EXTENSION); + cdxFilename = stripExtension(cdxFilename, getDotFileExtension()); + cdxFilename += ('.' + CDX); + cdxWriter = new BufferedWriter(new FileWriter(cdxFilename)); + } + + String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v") + + " n g"; + if (toFile) { + cdxWriter.write(header); + cdxWriter.newLine(); + } else { + System.out.println(header); + } + + String strippedFileName = getStrippedFileName(); + try { + for (Iterator ii = iterator(); ii.hasNext();) { + ArchiveRecord r = ii.next(); + if (toFile) { + cdxWriter.write(r.outputCdx(strippedFileName)); + cdxWriter.newLine(); + } else { + System.out.println(r.outputCdx(strippedFileName)); + } + } + } finally { + if (toFile) { + cdxWriter.close(); + } + } + } + + /** + * Output passed record using passed format specifier. + * @param format What format to use outputting. + * @throws IOException + * @return True if handled. + */ + public boolean outputRecord(final String format) + throws IOException { + boolean result = true; + if (format.equals(CDX)) { + System.out.println(get().outputCdx(getStrippedFileName())); + } else if(format.equals(ArchiveFileConstants.DUMP)) { + // No point digesting if dumping content. + setDigest(false); + get().dump(); + } else { + result = false; + } + return result; + } + + /** + * Dump this file on STDOUT + * @throws compress True if dumped output is compressed. 
+ * @throws IOException + * @throws java.text.ParseException + */ + public abstract void dump(final boolean compress) + throws IOException, java.text.ParseException; + + /** + * @return an ArchiveReader that will delete a local file on close. Used + * when we bring Archive files local and need to clean up afterward. + */ + public abstract ArchiveReader getDeleteFileOnCloseReader(final File f); + + /** + * Output passed record using passed format specifier. + * @param r ARCReader instance to output. + * @param format What format to use outputting. + * @throws IOException + */ + protected static void outputRecord(final ArchiveReader r, + final String format) + throws IOException { + if (!r.outputRecord(format)) { + throw new IOException("Unsupported format" + + " (or unsupported on a single record): " + format); + } + } + + /** + * @return Base Options object filled out with help, digest, strict, etc. + * options. + */ + protected static Options getOptions() { + Options options = new Options(); + options.addOption(new Option("h","help", false, + "Prints this message and exits.")); + options.addOption(new Option("o","offset", true, + "Outputs record at this offset into file.")); + options.addOption(new Option("d","digest", true, + "Pass true|false. Expensive. Default: true (SHA-1).")); + options.addOption(new Option("s","strict", false, + "Strict mode. Fails parse if incorrectly formatted file.")); + options.addOption(new Option("f","format", true, + "Output options: 'cdx', cdxfile', 'dump', 'gzipdump'," + + "'or 'nohead'. Default: 'cdx'.")); + return options; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/ArchiveReaderFactory.java b/src/main/java/org/archive/io/ArchiveReaderFactory.java new file mode 100644 index 00000000..17f14d3a --- /dev/null +++ b/src/main/java/org/archive/io/ArchiveReaderFactory.java @@ -0,0 +1,301 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). 
+ * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLConnection; + +import org.archive.io.arc.ARCReaderFactory; +import org.archive.io.warc.WARCReaderFactory; +import org.archive.net.md5.Md5URLConnection; +import org.archive.net.rsync.RsyncURLConnection; +import org.archive.url.UsableURI; +import org.archive.util.FileUtils; + + +/** + * Factory that returns an Archive file Reader. + * Returns Readers for ARCs or WARCs. + * @author stack + * @version $Date$ $Revision$ + */ +public class ArchiveReaderFactory implements ArchiveFileConstants { + // Static block to enable S3 URLs + static { + if (System.getProperty("java.protocol.handler.pkgs") != null) { + System.setProperty("java.protocol.handler.pkgs", + System.getProperty("java.protocol.handler.pkgs") + + "|" + "org.archive.net"); + } else { + System.setProperty("java.protocol.handler.pkgs", "org.archive.net"); + } + } + + private static final ArchiveReaderFactory factory = + new ArchiveReaderFactory(); + + /** + * Shutdown any public access to default constructor. 
+ */ + protected ArchiveReaderFactory() { + super(); + } + + /** + * Get an Archive file Reader on passed path or url. + * Does primitive heuristic figuring if path or URL. + * @param arcFileOrUrl File path or URL pointing at an Archive file. + * @return An Archive file Reader. + * @throws IOException + * @throws MalformedURLException + * @throws IOException + */ + public static ArchiveReader get(final String arcFileOrUrl) + throws MalformedURLException, IOException { + return ArchiveReaderFactory.factory.getArchiveReader(arcFileOrUrl); + } + + protected ArchiveReader getArchiveReader(final String arcFileOrUrl) + throws MalformedURLException, IOException { + return getArchiveReader(arcFileOrUrl, 0); + } + + protected ArchiveReader getArchiveReader(final String arcFileOrUrl, + final long offset) + throws MalformedURLException, IOException { + return UsableURI.hasScheme(arcFileOrUrl) && arcFileOrUrl.indexOf(":")>1? + get(new URL(arcFileOrUrl), offset): + get(new File(arcFileOrUrl), offset); + } + + /** + * @param f An Archive file to read. + * @return An ArchiveReader + * @throws IOException + */ + public static ArchiveReader get(final File f) throws IOException { + return ArchiveReaderFactory.factory.getArchiveReader(f); + } + + protected ArchiveReader getArchiveReader(final File f) + throws IOException { + return getArchiveReader(f, 0); + } + + /** + * @param f An Archive file to read. + * @param offset Have returned Reader set to start reading at this offset. 
+ * @return An ArchiveReader + * @throws IOException + */ + public static ArchiveReader get(final File f, final long offset) + throws IOException { + return ArchiveReaderFactory.factory.getArchiveReader(f, offset); + } + + protected ArchiveReader getArchiveReader(final File f, + final long offset) + throws IOException { + if (ARCReaderFactory.isARCSuffix(f.getName())) { + return ARCReaderFactory.get(f, true, offset); + } else if (WARCReaderFactory.isWARCSuffix(f.getName())) { + return WARCReaderFactory.get(f, offset); + } + throw new IOException("Unknown file extension (Not ARC nor WARC): " + + f.getName()); + } + + /** + * Wrap a Reader around passed Stream. + * @param s Identifying String for this Stream used in error messages. + * Must be a string that ends with the name of the file we're to put + * an ArchiveReader on. This code looks at file endings to figure + * whether to return an ARC or WARC reader. + * @param is Stream. Stream will be wrapped with implementation of + * RepositionableStream unless already supported. + * @param atFirstRecord Are we at first Record? + * @return ArchiveReader. + * @throws IOException + */ + public static ArchiveReader get(final String s, final InputStream is, + final boolean atFirstRecord) + throws IOException { + return ArchiveReaderFactory.factory.getArchiveReader(s, is, + atFirstRecord); + } + + protected ArchiveReader getArchiveReader(final String id, + final InputStream is, final boolean atFirstRecord) + throws IOException { + final InputStream stream = is; + if (ARCReaderFactory.isARCSuffix(id)) { + return ARCReaderFactory.get(id, stream, atFirstRecord); + } else if (WARCReaderFactory.isWARCSuffix(id)) { + return WARCReaderFactory.get(id, stream, atFirstRecord); + } + throw new IOException("Unknown extension (Not ARC nor WARC): " + id); + } + + /** + * Get an Archive Reader aligned at offset. 
+ * This version of get will not bring the file local but will try to + * stream across the net making an HTTP 1.1 Range request on remote + * http server (RFC1435 Section 14.35). + * @param u HTTP URL for an Archive file. + * @param offset Offset into file at which to start fetching. + * @return An ArchiveReader aligned at offset. + * @throws IOException + */ + public static ArchiveReader get(final URL u, final long offset) + throws IOException { + return ArchiveReaderFactory.factory.getArchiveReader(u, offset); + } + + protected ArchiveReader getArchiveReader(final URL f, final long offset) + throws IOException { + // Get URL connection. + URLConnection connection = f.openConnection(); + if (connection instanceof HttpURLConnection) { + addUserAgent((HttpURLConnection)connection); + } + if (offset != 0) { + // Use a Range request (Assumes HTTP 1.1 on other end). If + // length >= 0, add open-ended range header to the request. Else, + // because end-byte is inclusive, subtract 1. + connection.addRequestProperty("Range", "bytes=" + offset + "-"); + // TODO: should actually verify that server respected 'Range' request + // (spec allows them to ignore; 206 response or Content-Range header + // should be present if Range satisfied; multipart/byteranges could be + // a problem). + } + + return getArchiveReader(f.toString(), connection.getInputStream(), (offset == 0)); + } + + /** + * Get an ARCReader. + * Pulls the ARC local into whereever the System Property + * java.io.tmpdir points. It then hands back an ARCReader that + * points at this local copy. A close on this ARCReader instance will + * remove the local copy. + * @param u An URL that points at an ARC. + * @return An ARCReader. 
+ * @throws IOException + */ + public static ArchiveReader get(final URL u) + throws IOException { + return ArchiveReaderFactory.factory.getArchiveReader(u); + } + + protected ArchiveReader getArchiveReader(final URL u) + throws IOException { + // If url represents a local file then return file it points to. + if (u.getPath() != null) { + // TODO: Add scheme check and host check. + File f = new File(u.getPath()); + if (f.exists()) { + return get(f, 0); + } + } + + String scheme = u.getProtocol(); + if (scheme.startsWith("http") || scheme.equals("s3")) { + // Try streaming if http or s3 URLs rather than copying local + // and then reading (Passing an offset will get us an Reader + // that wraps a Stream). + return get(u, 0); + } + + return makeARCLocal(u.openConnection()); + } + + protected ArchiveReader makeARCLocal(final URLConnection connection) + throws IOException { + File localFile = null; + if (connection instanceof HttpURLConnection) { + // If http url connection, bring down the resource local. + String p = connection.getURL().getPath(); + int index = p.lastIndexOf('/'); + if (index >= 0) { + // Name file for the file we're making local. + localFile = File.createTempFile("",p.substring(index + 1)); + if (localFile.exists()) { + // If file of same name already exists in TMPDIR, then + // clean it up (Assuming only reason a file of same name in + // TMPDIR is because we failed a previous download). + localFile.delete(); + } + } else { + localFile = File.createTempFile(ArchiveReader.class.getName(), + ".tmp"); + } + addUserAgent((HttpURLConnection)connection); + connection.connect(); + try { + FileUtils.readFullyToFile(connection.getInputStream(), localFile); + } catch (IOException ioe) { + localFile.delete(); + throw ioe; + } + } else if (connection instanceof RsyncURLConnection) { + // Then, connect and this will create a local file. + // See implementation of the rsync handler. 
+ connection.connect(); + localFile = ((RsyncURLConnection)connection).getFile(); + } else if (connection instanceof Md5URLConnection) { + // Then, connect and this will create a local file. + // See implementation of the md5 handler. + connection.connect(); + localFile = ((Md5URLConnection)connection).getFile(); + } else { + throw new UnsupportedOperationException("No support for " + + connection); + } + + ArchiveReader reader = null; + try { + reader = get(localFile, 0); + } catch (IOException e) { + localFile.delete(); + throw e; + } + + // Return a delegate that does cleanup of downloaded file on close. + return reader.getDeleteFileOnCloseReader(localFile); + } + + protected void addUserAgent(final HttpURLConnection connection) { + connection.addRequestProperty("User-Agent", this.getClass().getName()); + } + + /** + * @param f File to test. + * @return True if f is compressed. + * @throws IOException + */ + protected boolean isCompressed(final File f) throws IOException { + return f.getName().toLowerCase(). + endsWith(DOT_COMPRESSED_FILE_EXTENSION); + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/ArchiveRecord.java b/src/main/java/org/archive/io/ArchiveRecord.java new file mode 100644 index 00000000..63bfe628 --- /dev/null +++ b/src/main/java/org/archive/io/ArchiveRecord.java @@ -0,0 +1,409 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.logging.Level; + +import org.archive.util.Base32; + +/** + * Archive file Record. + * @author stack + * @version $Date$ $Version$ + */ +public abstract class ArchiveRecord extends InputStream { + + /** + * Minimal http response or request header length. + * + * I've seen in arcs content length of 1 with no header. + */ + protected static final long MIN_HTTP_HEADER_LENGTH = + Math.min("HTTP/1.1 200 OK\r\n".length(), "GET / HTTP/1.0\n\r".length()); + + protected ArchiveRecordHeader header = null; + + /** + * Stream to read this record from. + * + * Stream can only be read sequentially. Will only return this records' + * content returning a -1 if you try to read beyond the end of the current + * record. + * + *

Streams can be markable or not. If they are, we'll be able to roll + * back when we've read too far. If not markable, assumption is that + * the underlying stream is managing our not reading too much (This pertains + * to the skipping over the end of the ARCRecord. See {@link #skip()}. + */ + protected InputStream in = null; + + /** + * Position w/i the Record content, within in. + * This position is relative within this Record. Its not same as the + * Archive file position. + */ + protected long position = 0; + + /** + * Set flag when we've reached the end-of-record. + */ + protected boolean eor = false; + + /** + * Compute digest on what we read and add to metadata when done. + * + * Currently hardcoded as sha-1. TODO: Remove when archive records + * digest or else, add a facility that allows the arc reader to + * compare the calculated digest to that which is recorded in + * the arc. + * + *

Protected instead of private so subclasses can update and complete + * the digest. + */ + protected MessageDigest digest = null; + private String digestStr = null; + + protected boolean strict = false; + + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the record this instance + * is to represent. + * @throws IOException + */ + public ArchiveRecord(InputStream in) + throws IOException { + this(in, null, 0, true, false); + } + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the record this instance + * is to represent. + * @param header Header data. + * @throws IOException + */ + public ArchiveRecord(InputStream in, ArchiveRecordHeader header) + throws IOException { + this(in, header, 0, true, false); + } + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the record this instance + * is to represent. + * @param header Header data. + * @param bodyOffset Offset into the body. Usually 0. + * @param digest True if we're to calculate digest for this record. Not + * digesting saves about ~15% of cpu during an ARC parse. + * @param strict Be strict parsing (Parsing stops if ARC inproperly + * formatted). + * @throws IOException + */ + public ArchiveRecord(InputStream in, ArchiveRecordHeader header, + int bodyOffset, boolean digest, boolean strict) + throws IOException { + this.in = in; + this.header = header; + this.position = bodyOffset; + if (digest) { + try { + this.digest = MessageDigest.getInstance("SHA1"); + } catch (NoSuchAlgorithmException e) { + // Convert to IOE because thats more amenable to callers + // -- they are dealing with it anyways. + throw new IOException(e.getMessage()); + } + } + this.strict = strict; + } + + public boolean markSupported() { + return false; + } + + /** + * @return Header data for this record. 
+ */ + public ArchiveRecordHeader getHeader() { + return this.header; + } + + protected void setHeader(ArchiveRecordHeader header) { + this.header = header; + } + + /** + * Calling close on a record skips us past this record to the next record + * in the stream. + * + * It does not actually close the stream. The underlying steam is probably + * being used by the next arc record. + * + * @throws IOException + */ + public void close() throws IOException { + if (this.in != null) { + skip(); + this.in = null; + if (this.digest != null) { + this.digestStr = Base32.encode(this.digest.digest()); + } + } + } + + /** + * @return Next character in this Record content else -1 if at EOR. + * @throws IOException + */ + public int read() throws IOException { + int c = -1; + if (available() > 0) { + c = this.in.read(); + if (c == -1) { + throw new IOException("Premature EOF before end-of-record."); + } + if (this.digest != null) { + this.digest.update((byte) c); + } + incrementPosition(); + } + return c; + } + + public int read(byte[] b, int offset, int length) throws IOException { + int read = Math.min(length, available()); + if (read == -1 || read == 0) { + read = -1; + } else { + read = this.in.read(b, offset, read); + if (read == -1) { + String msg = "Premature EOF before end-of-record: " + + getHeader().getHeaderFields(); + if (isStrict()) { + throw new IOException(msg); + } + setEor(true); + System.err.println(Level.WARNING.toString() + " " + msg); + } + if (this.digest != null && read >= 0) { + this.digest.update(b, offset, read); + } + incrementPosition(read); + } + return read; + } + + /** + * This available is not the stream's available. Its an available based on + * what the stated Archive record length is minus what we've read to date. + * + * @return True if bytes remaining in record content. + */ + public int available() { + long amount = getHeader().getLength() - getPosition(); + return (amount > Integer.MAX_VALUE? 
Integer.MAX_VALUE: (int)amount); + } + + /** + * Skip over this records content. + * + * @throws IOException + */ + protected void skip() throws IOException { + if (this.eor) { + return; + } + + // Read to the end of the body of the record. Exhaust the stream. + // Can't skip direct to end because underlying stream may be compressed + // and we're calculating the digest for the record. + int r = available(); + while (r > 0 && !this.eor) { + skip(r); + r = available(); + } + } + + public long skip(long n) throws IOException { + final int SKIP_BUFFERSIZE = 1024 * 4; + byte[] b = new byte[SKIP_BUFFERSIZE]; + long total = 0; + for (int read = 0; (total < n) && (read != -1);) { + read = Math.min(SKIP_BUFFERSIZE, (int) (n - total)); + // TODO: Interesting is that reading from compressed stream, we only + // read about 500 characters at a time though we ask for 4k. + // Look at this sometime. + read = read(b, 0, read); + if (read <= 0) { + read = -1; + } else { + total += read; + } + } + return total; + } + + /** + * @return Returns the strict. + */ + public boolean isStrict() { + return this.strict; + } + + /** + * @param strict The strict to set. + */ + public void setStrict(boolean strict) { + this.strict = strict; + } + + protected InputStream getIn() { + return this.in; + } + + public String getDigestStr() { + return this.digestStr; + } + + protected void incrementPosition() { + this.position++; + } + + protected void incrementPosition(final long incr) { + this.position += incr; + } + + public long getPosition() { + return this.position; + } + + protected boolean isEor() { + return eor; + } + + protected void setEor(boolean eor) { + this.eor = eor; + } + + protected String getStatusCode4Cdx(final ArchiveRecordHeader h) { + return "-"; + } + + protected String getIp4Cdx(final ArchiveRecordHeader h) { + return "-"; + } + + protected String getDigest4Cdx(final ArchiveRecordHeader h) { + return getDigestStr() == null? 
"-": getDigestStr(); + } + + protected String getMimetype4Cdx(final ArchiveRecordHeader h) { + return h.getMimetype(); + } + + protected String outputCdx(final String strippedFileName) + throws IOException { + // Read the whole record so we get out a hash. Should be safe calling + // close on already closed Record. + close(); + ArchiveRecordHeader h = getHeader(); + StringBuilder buffer = + new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE); + buffer.append(h.getDate()); + buffer.append(ArchiveFileConstants.SINGLE_SPACE); + buffer.append(getIp4Cdx(h)); + buffer.append(ArchiveFileConstants.SINGLE_SPACE); + buffer.append(h.getUrl()); + buffer.append(ArchiveFileConstants.SINGLE_SPACE); + buffer.append(getMimetype4Cdx(h)); + buffer.append(ArchiveFileConstants.SINGLE_SPACE); + buffer.append(getStatusCode4Cdx(h)); + buffer.append(ArchiveFileConstants.SINGLE_SPACE); + buffer.append(getDigest4Cdx(h)); + buffer.append(ArchiveFileConstants.SINGLE_SPACE); + buffer.append(h.getOffset()); + buffer.append(ArchiveFileConstants.SINGLE_SPACE); + buffer.append(h.getLength()); + buffer.append(ArchiveFileConstants.SINGLE_SPACE); + buffer.append(strippedFileName != null? strippedFileName: '-'); + return buffer.toString(); + } + + /** + * Writes output on STDOUT. + * @throws IOException + */ + public void dump() + throws IOException { + dump(System.out); + } + + /** + * Writes output on passed os. + * @throws IOException + */ + public void dump(final OutputStream os) + throws IOException { + final byte [] outputBuffer = new byte [16*1024]; + int read = outputBuffer.length; + while ((read = read(outputBuffer, 0, outputBuffer.length)) != -1) { + os.write(outputBuffer, 0, read); + } + os.flush(); + } + + /** + * Is it likely that this record contains headers? + * This method will return true if the body is a http response that includes + * http response headers or the body is a http request that includes request + * headers, etc. 
Be aware that headers in content are distinct from + * {@link ArchiveRecordHeader} 'headers'. + * @return True if this Record's content has headers: + */ + public boolean hasContentHeaders() { + final String url = getHeader().getUrl(); + if (url == null) { + return false; + } + + if (!url.toLowerCase().startsWith("http")) { + return false; + } + + if (getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) { + return false; + } + + return true; + } + + protected void setBodyOffset(int bodyOffset) { + this.position = bodyOffset; + } +} diff --git a/src/main/java/org/archive/io/ArchiveRecordHeader.java b/src/main/java/org/archive/io/ArchiveRecordHeader.java new file mode 100644 index 00000000..953537b1 --- /dev/null +++ b/src/main/java/org/archive/io/ArchiveRecordHeader.java @@ -0,0 +1,111 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.util.Map; +import java.util.Set; + +/** + * Archive Record Header. + * @author stack + * @version $Date$ $Version$ + */ +public interface ArchiveRecordHeader { + /** + * Get the time when the record was created. + * @return Date in 14 digit time format (UTC). 
+ * @see org.archive.util.ArchiveUtils#parse14DigitDate(String) + */ + public abstract String getDate(); + + /** + * @return Return length of record. + */ + public abstract long getLength(); + + /** + * @return Return Content-Length of the contents of the record + */ + public abstract long getContentLength(); + + + /** + * @return Record subject-url. + */ + public abstract String getUrl(); + + /** + * @return Record mimetype. + */ + public abstract String getMimetype(); + + /** + * @return Record version. + */ + public abstract String getVersion(); + + /** + * @return Offset into Archive file at which this record begins. + */ + public abstract long getOffset(); + + /** + * @param key Key to use looking up field value. + * @return value for passed key of null if no such entry. + */ + public abstract Object getHeaderValue(final String key); + + /** + * @return Header field name keys. + */ + public abstract Set getHeaderFieldKeys(); + + /** + * @return Map of header fields. + */ + public abstract Map getHeaderFields(); + + /** + * @return Returns identifier for current Archive file. Be aware this + * may not be a file name or file path. It may just be an URL. Depends + * on how Archive file was made. + */ + public abstract String getReaderIdentifier(); + + /** + * @return Identifier for the record. If ARC, the URL + date. If WARC, + * the GUID assigned. + */ + public abstract String getRecordIdentifier(); + + /** + * @return Returns digest as String for this record. Only available after + * the record has been read in totality. + */ + public abstract String getDigest(); + + /** + * Offset at which the content begins. + * For ARCs, its used to delimit where http headers end and content begins. + * For WARCs, its end of Named Fields before payload starts. 
+ */ + public int getContentBegin(); + + public abstract String toString(); +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/ArraySeekInputStream.java b/src/main/java/org/archive/io/ArraySeekInputStream.java new file mode 100644 index 00000000..5b30747e --- /dev/null +++ b/src/main/java/org/archive/io/ArraySeekInputStream.java @@ -0,0 +1,106 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.IOException; + + +/** + * A repositionable stream backed by an array. + * + * @author pjack + */ +public class ArraySeekInputStream extends SeekInputStream { + + + /** + * The array of bytes to read from. + */ + private byte[] array; + + + /** + * The offset in the array of the next byte to read. + */ + private int offset; + + + /** + * Constructor. Note that changes to the given array will be reflected + * in the stream. + * + * @param array The array to read bytes from. 
+     */
+    public ArraySeekInputStream(byte[] array) {
+        this.array = array;
+        this.offset = 0;
+    }
+
+    /** Returns the next byte as an unsigned value, or -1 once the array is exhausted. */
+    @Override
+    public int read() {
+        if (offset >= array.length) {
+            return -1;
+        }
+        int r = array[offset] & 0xFF;
+        offset++;
+        return r;
+    }
+
+    /** Bulk read; -1 at end of data as java.io.InputStream specifies (0 only when len is 0). */
+    @Override
+    public int read(byte[] buf, int ofs, int len) {
+        if (offset >= array.length) {
+            return (len == 0) ? 0 : -1;
+        }
+        len = Math.min(len, array.length - offset);
+        System.arraycopy(array, offset, buf, ofs, len);
+        offset += len;
+        return len;
+    }
+
+    /** Equivalent to {@code read(buf, 0, buf.length)}. */
+    @Override
+    public int read(byte[] buf) {
+        return read(buf, 0, buf.length);
+    }
+
+
+    /**
+     * Returns the position of the stream.
+     */
+    public long position() {
+        return offset;
+    }
+
+
+    /**
+     * Repositions the stream.
+     *
+     * @param p  the new position for the stream
+     * @throws IOException if the given position is out of bounds
+     */
+    public void position(long p) throws IOException {
+        if ((p < 0) || (p > array.length)) {
+            throw new IOException("Invalid position: " + p);
+        }
+        offset = (int)p;
+    }
+
+}
diff --git a/src/main/java/org/archive/io/BufferedSeekInputStream.java b/src/main/java/org/archive/io/BufferedSeekInputStream.java
new file mode 100644
index 00000000..2fdc72b7
--- /dev/null
+++ b/src/main/java/org/archive/io/BufferedSeekInputStream.java
@@ -0,0 +1,217 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +import java.io.IOException; + + +/** + * Buffers data from some other SeekInputStream. + * + * @author pjack + */ +public class BufferedSeekInputStream extends SeekInputStream { + + + /** + * The underlying input stream. + */ + final private SeekInputStream input; + + + /** + * The buffered data. + */ + final private byte[] buffer; + + + /** + * The maximum offset of valid data in the buffer. Usually the same + * as buffer.length, but may be shorter if we're in the last region + * of the stream. + */ + private int maxOffset; + + + /** + * The offset of within the buffer of the next byte to read. + */ + private int offset; + + + /** + * Constructor. + * + * @param input the underlying input stream + * @param capacity the size of the buffer + * @throws IOException if an IO occurs filling the first buffer + */ + public BufferedSeekInputStream(SeekInputStream input, int capacity) + throws IOException { + this.input = input; + this.buffer = new byte[capacity]; + buffer(); + } + + /** + * Fills the buffer. + * + * @throws IOException if an IO error occurs + */ + private void buffer() throws IOException { + int remaining = buffer.length; + while (remaining > 0) { + int r = input.read(buffer, buffer.length - remaining, remaining); + if (r <= 0) { + // Not enough information to fill the buffer + offset = 0; + maxOffset = buffer.length - remaining; + return; + } + remaining -= r; + } + maxOffset = buffer.length; + offset = 0; + } + + + /** + * Ensures that the buffer is valid. + * + * @throws IOException if an IO error occurs + */ + private void ensureBuffer() throws IOException { + if (offset >= maxOffset) { + buffer(); + } + } + + + /** + * Returns the number of unread bytes in the current buffer. 
+     *
+     * @return  the remaining bytes
+     */
+    private int remaining() {
+        return maxOffset - offset;
+    }
+
+    /** Returns the next byte as an unsigned value, or -1 when a refill yields no data. */
+    @Override
+    public int read() throws IOException {
+        ensureBuffer();
+        if (maxOffset == 0) {
+            return -1;
+        }
+        int ch = buffer[offset] & 0xFF;
+        offset++;
+        return ch;
+    }
+
+    /** Copies up to len bytes from the current buffer; -1 at end of data (0 only when len is 0). */
+    @Override
+    public int read(byte[] buf, int ofs, int len) throws IOException {
+        ensureBuffer();
+        if (maxOffset == 0) {
+            return (len == 0) ? 0 : -1;
+        }
+        len = Math.min(len, remaining());
+        System.arraycopy(buffer, offset, buf, ofs, len);
+        offset += len;
+        return len;
+    }
+
+    /** Equivalent to {@code read(buf, 0, buf.length)}. */
+    @Override
+    public int read(byte[] buf) throws IOException {
+        return read(buf, 0, buf.length);
+    }
+
+    /** Skips at most the bytes left in the current buffer; a non-positive count skips nothing. */
+    @Override
+    public long skip(long c) throws IOException {
+        ensureBuffer();
+        if ((maxOffset == 0) || (c <= 0)) {
+            return 0;
+        }
+        int count = (c > Integer.MAX_VALUE) ? Integer.MAX_VALUE : (int)c;
+        int skip = Math.min(count, remaining());
+        offset += skip;
+        return skip;
+    }
+
+    /**
+     * Returns the stream's current position: the start of the buffered block
+     * (input.position() - maxOffset, because the final block may be short) plus offset.
+     *
+     * @return  the current position
+     */
+    public long position() throws IOException {
+        return input.position() - maxOffset + offset;
+    }
+
+
+    /**
+     * Seeks to the given position.  This method avoids re-filling the buffer
+     * if at all possible.
+     *
+     * @param p  the position to set
+     * @throws IOException if an IO error occurs
+     */
+    public void position(long p) throws IOException {
+        long blockStart = (input.position() - maxOffset)
+            / buffer.length * buffer.length;
+        long blockEnd = blockStart + maxOffset;
+        if ((p >= blockStart) && (p < blockEnd)) {
+            // Desired position is somewhere inside current buffer
+            long adj = p - blockStart;
+            offset = (int)adj;
+            return;
+        }
+        positionDirect(p);
+    }
+
+
+    /**
+     * Positions the underlying stream at the given position, then refills
+     * the buffer.
+ * + * @param p the position to set + * @throws IOException if an IO error occurs + */ + private void positionDirect(long p) throws IOException { + long newBlockStart = p / buffer.length * buffer.length; + input.position(newBlockStart); + buffer(); + offset = (int)(p % buffer.length); + } + + /** + * Close the stream, including the wrapped input stream. + */ + public void close() throws IOException { + super.close(); + if(this.input!=null) { + this.input.close(); + } + } + + +} diff --git a/src/main/java/org/archive/io/CharSubSequence.java b/src/main/java/org/archive/io/CharSubSequence.java new file mode 100644 index 00000000..1e89da56 --- /dev/null +++ b/src/main/java/org/archive/io/CharSubSequence.java @@ -0,0 +1,90 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +/** + * Provides a subsequence view onto a CharSequence. 
+ * + * @author gojomo + * @version $Revision$, $Date$ + */ +public class CharSubSequence implements CharSequence { + + protected CharSequence inner; + protected int start; + protected int end; + + public CharSubSequence(CharSequence inner, int start, int end) { + if (end < start) { + throw new IllegalArgumentException("Start " + start + " is > " + + " than end " + end); + } + + if (end < 0 || start < 0) { + throw new IllegalArgumentException("Start " + start + " or end " + + end + " is < 0."); + } + + if (inner == null) { + throw new NullPointerException("Passed charsequence is null."); + } + + this.inner = inner; + this.start = start; + this.end = end; + } + + /* + * (non-Javadoc) + * @see java.lang.CharSequence#length() + */ + public int length() { + return this.end - this.start; + } + + /* + * (non-Javadoc) + * @see java.lang.CharSequence#charAt(int) + */ + public char charAt(int index) { + return this.inner.charAt(this.start + index); + } + + /* + * (non-Javadoc) + * @see java.lang.CharSequence#subSequence(int, int) + */ + public CharSequence subSequence(int begin, int finish) { + return new CharSubSequence(this, begin, finish); + } + + /* + * (non-Javadoc) + * @see java.lang.CharSequence#toString() + */ + public String toString() { + StringBuffer sb = new StringBuffer(length()); + // could use StringBuffer.append(CharSequence) if willing to do 1.5 & up + for (int i = 0;i filenames; + + /* (non-Javadoc) + * @see java.io.InputStream#read() + */ + public int read() throws IOException { + int c = super.read(); + if( c == -1 && filenames.hasNext() ) { + cueStream(); + return read(); + } + return c; + } + /* (non-Javadoc) + * @see java.io.InputStream#read(byte[], int, int) + */ + public int read(byte[] b, int off, int len) throws IOException { + int c = super.read(b, off, len); + if( c == -1 && filenames.hasNext() ) { + cueStream(); + return read(b,off,len); + } + return c; + } + /* (non-Javadoc) + * @see java.io.InputStream#read(byte[]) + */ + public int 
read(byte[] b) throws IOException { + int c = super.read(b); + if( c == -1 && filenames.hasNext() ) { + cueStream(); + return read(b); + } + return c; + } + + /* (non-Javadoc) + * @see java.io.InputStream#skip(long) + */ + public long skip(long n) throws IOException { + long s = super.skip(n); + if( s files) throws IOException { + super(null); + filenames = files.iterator(); + cueStream(); + } + + private void cueStream() throws IOException { + if(filenames.hasNext()) { + this.in = new FileInputStream(filenames.next()); + } + } + +} diff --git a/src/main/java/org/archive/io/CompositeFileReader.java b/src/main/java/org/archive/io/CompositeFileReader.java new file mode 100644 index 00000000..14b56219 --- /dev/null +++ b/src/main/java/org/archive/io/CompositeFileReader.java @@ -0,0 +1,40 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.io; + +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.List; + + +/** + * @author gojomo + */ +public class CompositeFileReader extends InputStreamReader { + + /** + * @param filenames + * @throws IOException + */ + public CompositeFileReader(List filenames) throws IOException { + super(new CompositeFileInputStream(filenames)); + } + +} diff --git a/src/main/java/org/archive/io/Endian.java b/src/main/java/org/archive/io/Endian.java new file mode 100644 index 00000000..f6d89aaa --- /dev/null +++ b/src/main/java/org/archive/io/Endian.java @@ -0,0 +1,125 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; + + +/** + * Reads integers stored in big or little endian streams. + * + * @author pjack + */ +public class Endian { + + + /** + * Static utility class. + */ + private Endian() { + } + + + /** + * Reads the next little-endian unsigned 16 bit integer from the + * given stream. 
+ * + * @param input the input stream to read from + * @return the next 16-bit little-endian integer + * @throws IOException if an IO error occurs + */ + public static char littleChar(InputStream input) throws IOException { + int lo = input.read(); + if (lo < 0) { + throw new EOFException(); + } + int hi = input.read(); + if (hi < 0) { + throw new EOFException(); + } + return (char)((hi << 8) | lo); + } + + + /** + * Reads the next little-endian signed 16-bit integer from the + * given stream. + * + * @param input the input stream to read from + * @return the next 16-bit little-endian integer + * @throws IOException if an IO error occurs + */ + public static short littleShort(InputStream input) throws IOException { + return (short)littleChar(input); + } + + + /** + * Reads the next little-endian signed 32-bit integer from the + * given stream. + * + * @param input the input stream to read from + * @return the next 32-bit little-endian integer + * @throws IOException if an IO error occurs + */ + public static int littleInt(InputStream input) throws IOException { + char lo = littleChar(input); + char hi = littleChar(input); + return (hi << 16) | lo; + } + + + /** + * Reads the next big-endian unsigned 16 bit integer from the + * given stream. + * + * @param input the input stream to read from + * @return the next 16-bit big-endian integer + * @throws IOException if an IO error occurs + */ + public static char bigChar(InputStream input) throws IOException { + int hi = input.read(); + if (hi < 0) { + throw new EOFException(); + } + int lo = input.read(); + if (lo < 0) { + throw new EOFException(); + } + return (char)((hi << 8) | lo); + } + + + /** + * Reads the next big-endian signed 32-bit integer from the + * given stream. 
+ * + * @param input the input stream to read from + * @return the next 32-bit big-endian integer + * @throws IOException if an IO error occurs + */ + public static int bigInt(InputStream input) throws IOException { + char hi = bigChar(input); + char lo = bigChar(input); + return (hi << 16) | lo; + } +} diff --git a/src/main/java/org/archive/io/GZIPMembersInputStream.java b/src/main/java/org/archive/io/GZIPMembersInputStream.java new file mode 100644 index 00000000..35fb9e90 --- /dev/null +++ b/src/main/java/org/archive/io/GZIPMembersInputStream.java @@ -0,0 +1,38 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.io; + +import java.io.IOException; +import java.io.InputStream; + +/** + * @deprecated use {@link org.archive.util.zip.GZIPMembersInputStream} + */ +@Deprecated +public class GZIPMembersInputStream extends org.archive.util.zip.GZIPMembersInputStream { + + public GZIPMembersInputStream(InputStream in) throws IOException { + super(in); + } + + public GZIPMembersInputStream(InputStream in, int size) throws IOException { + super(in, size); + } + +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/GenerationFileHandler.java b/src/main/java/org/archive/io/GenerationFileHandler.java new file mode 100644 index 00000000..c1ce8d79 --- /dev/null +++ b/src/main/java/org/archive/io/GenerationFileHandler.java @@ -0,0 +1,200 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.io; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; +import java.util.logging.FileHandler; +import java.util.logging.Formatter; +import java.util.logging.LogRecord; + +import org.archive.util.FileUtils; + + +/** + * FileHandler with support for rotating the current file to + * an archival name with a specified integer suffix, and + * provision of a new replacement FileHandler with the current + * filename. + * + * @author gojomo + */ +public class GenerationFileHandler extends FileHandler { + private LinkedList filenameSeries = new LinkedList(); + private boolean shouldManifest = false; + + /** + * @return Returns the filenameSeries. + */ + public List getFilenameSeries() { + return filenameSeries; + } + + /** + * Constructor. + * @param pattern + * @param append + * @param shouldManifest + * @throws IOException + * @throws SecurityException + */ + public GenerationFileHandler(String pattern, boolean append, + boolean shouldManifest) + throws IOException, SecurityException { + super(pattern, append); + filenameSeries.addFirst(pattern); + this.shouldManifest = shouldManifest; + } + + /** + * @param filenameSeries + * @param shouldManifest + * @throws IOException + */ + public GenerationFileHandler(LinkedList filenameSeries, + boolean shouldManifest) + throws IOException { + super((String)filenameSeries.getFirst(), false); // Never append in this case + this.filenameSeries = filenameSeries; + this.shouldManifest = shouldManifest; + } + + /** + * Move the current file to a new filename with the storeSuffix in place + * of the activeSuffix; continuing logging to a new file under the + * original filename. + * + * @param storeSuffix Suffix to put in place of activeSuffix + * @param activeSuffix Suffix to replace with storeSuffix. + * @return GenerationFileHandler instance. 
+ * @throws IOException + */ + public GenerationFileHandler rotate(String storeSuffix, + String activeSuffix) + throws IOException { + return rotate(storeSuffix, activeSuffix, false); + } + + public GenerationFileHandler rotate(String storeSuffix, + String activeSuffix, boolean mergeOld) throws IOException { + close(); + String filename = (String) filenameSeries.getFirst(); + if (!filename.endsWith(activeSuffix)) { + throw new FileNotFoundException("Active file does not have" + + " expected suffix"); + } + String storeFilename = filename.substring(0, filename.length() + - activeSuffix.length()) + + storeSuffix; + File activeFile = new File(filename); + File storeFile = new File(storeFilename); + FileUtils.moveAsideIfExists(storeFile); + + if (mergeOld) { + File fileToAppendTo = new File(filenameSeries.getLast()); + for (int i = filenameSeries.size() - 2; i >= 0; i--) { + File f = new File(filenameSeries.get(i)); + FileUtils.appendTo(fileToAppendTo, f); + f.delete(); + } + filenameSeries.clear(); + filenameSeries.add(filename); + if (!fileToAppendTo.renameTo(storeFile)) { + throw new IOException("Unable to move " + fileToAppendTo + " to " + + storeFilename); + } + } else { + if (!activeFile.renameTo(storeFile)) { + throw new IOException("Unable to move " + filename + " to " + + storeFilename); + } + } + filenameSeries.add(1, storeFilename); + GenerationFileHandler newGfh = new GenerationFileHandler( + filenameSeries, shouldManifest); + newGfh.setFormatter(this.getFormatter()); + return newGfh; + } + + /** + * @return True if should manifest. + */ + public boolean shouldManifest() { + return this.shouldManifest; + } + + /** + * Constructor-helper that rather than clobbering any existing + * file, moves it aside with a timestamp suffix. 
+ * + * @param filename + * @param append + * @param shouldManifest + * @return + * @throws SecurityException + * @throws IOException + */ + public static GenerationFileHandler makeNew(String filename, boolean append, boolean shouldManifest) throws SecurityException, IOException { + FileUtils.moveAsideIfExists(new File(filename)); + return new GenerationFileHandler(filename, append, shouldManifest); + } + + @Override + public void publish(LogRecord record) { + // when possible preformat outside synchronized superclass method + // (our most involved UriProcessingFormatter can cache result) + Formatter f = getFormatter(); + if(!(f instanceof Preformatter)) { + super.publish(record); + } else { + try { + ((Preformatter)f).preformat(record); + super.publish(record); + } finally { + ((Preformatter)f).clear(); + } + } + } +// +// TODO: determine if there's another way to have this optimization without +// negative impact on log-following (esp. in web UI) +// /** +// * Flush only 1/100th of the usual once-per-record, to reduce the time +// * spent holding the synchronization lock. (Flush is primarily called in +// * a superclass's synchronized publish()). +// * +// * The eventual close calls a direct flush on the target writer, so all +// * rotates/ends will ultimately be fully flushed. +// * +// * @see java.util.logging.StreamHandler#flush() +// */ +// @Override +// public synchronized void flush() { +// flushCount++; +// if(flushCount==100) { +// super.flush(); +// flushCount=0; +// } +// } +// int flushCount; + +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/GenericReplayCharSequence.java b/src/main/java/org/archive/io/GenericReplayCharSequence.java new file mode 100644 index 00000000..1af3922b --- /dev/null +++ b/src/main/java/org/archive/io/GenericReplayCharSequence.java @@ -0,0 +1,412 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). 
+ * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.nio.CharBuffer; +import java.nio.channels.FileChannel; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.text.NumberFormat; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.io.IOUtils; +import org.archive.util.DevUtils; + +import com.google.common.base.Charsets; +import com.google.common.primitives.Ints; + +/** + * (Replay)CharSequence view on recorded streams. + * + * For small streams, use {@link InMemoryReplayCharSequence}. + * + *

Call {@link #close()} on this class when done to clean up resources. + * + * @contributor stack + * @contributor nlevitt + * @version $Revision$, $Date$ + */ +public class GenericReplayCharSequence implements ReplayCharSequence { + + protected static Logger logger = Logger + .getLogger(GenericReplayCharSequence.class.getName()); + + /** + * Name of the encoding used when writing out the concatenated decoded prefix + * buffer and decoded backing file. + * + *

This constant is also used as the suffix of the file that holds the + * decodings: the name of that file is the name + * of the backing file with "." and this encoding appended. + * + *

See Encoding. + */ + public static final Charset WRITE_ENCODING = Charsets.UTF_16BE; + + private static final long MAP_MAX_BYTES = 64 * 1024 * 1024; // 64M + + /** + * When the memory map moves away from the beginning of the file + * (to the "right") in order to reach a certain index, it will + * map up to this many bytes preceding (to the left of) the target character. + * Consequently it will map up to + * MAP_MAX_BYTES - MAP_TARGET_LEFT_PADDING + * bytes to the right of the target. + */ + private static final long MAP_TARGET_LEFT_PADDING_BYTES = (long) (MAP_MAX_BYTES * 0.01); + + /** + * Total length of character stream to replay minus the HTTP headers + * if present. + * + * If the backing file is larger than Integer.MAX_VALUE (i.e. 2gb), + * only the first Integer.MAX_VALUE characters are available through this API. + * We're overriding java.lang.CharSequence so that we can use + * java.util.regex directly on the data, and the CharSequence + * API uses int for the length and index. + */ + protected int length; + + /** counter of decoding exceptions for report at end */ + protected long decodingExceptions = 0; + protected CharacterCodingException codingException = null; + + /** + * Byte offset into the file where the memory mapped portion begins. + */ + private long mapByteOffset; + + // XXX do we need to keep the input stream around? + private FileInputStream backingFileIn = null; + + private FileChannel backingFileChannel = null; + + private long bytesPerChar; + + private CharBuffer mappedBuffer = null; + + /** + * File that has decoded content. + * + * Keep it around so we can remove on close. + */ + private File decodedFile = null; + + /* + * This portion of the CharSequence precedes what's in the backing file. In + * cases where we decodeToFile(), this is always empty, because we decode + * the entire input stream. 
+ */ + private CharBuffer prefixBuffer = null; + + private boolean isOpen = true; + + protected Charset charset = null; + + /** + * Constructor. + * + * @param contentReplayInputStream inputStream of content + * @param charset Encoding to use reading the passed prefix + * buffer and backing file. Must not be null. + * @param backingFilename Path to backing file with content in excess of + * whats in buffer. + * + * @throws IOException + */ + public GenericReplayCharSequence(InputStream contentReplayInputStream, + int prefixMax, + String backingFilename, + Charset charset) throws IOException { + super(); + logger.fine("characterEncoding=" + charset + " backingFilename=" + + backingFilename); + + if(charset==null) { + charset = ReplayCharSequence.FALLBACK_CHARSET; + } + // decodes only up to Integer.MAX_VALUE characters + decode(contentReplayInputStream, prefixMax, backingFilename, charset); + + this.bytesPerChar = 2; + + if(length>prefixBuffer.position()) { + this.backingFileIn = new FileInputStream(decodedFile); + this.backingFileChannel = backingFileIn.getChannel(); + this.mapByteOffset = 0; + updateMemoryMappedBuffer(); + } + } + + private void updateMemoryMappedBuffer() { + long charLength = (long) this.length() - (long) prefixBuffer.limit(); // in characters + long mapSize = Math.min((charLength * bytesPerChar) - mapByteOffset, MAP_MAX_BYTES); + logger.fine("updateMemoryMappedBuffer: mapOffset=" + + NumberFormat.getInstance().format(mapByteOffset) + + " mapSize=" + NumberFormat.getInstance().format(mapSize)); + try { + // TODO: stress-test without these possibly-costly requests! +// System.gc(); +// System.runFinalization(); + // TODO: Confirm the READ_ONLY works. I recall it not working. + // The buffers seem to always say that the buffer is writable. + mappedBuffer = backingFileChannel.map( + FileChannel.MapMode.READ_ONLY, mapByteOffset, mapSize) + .asReadOnlyBuffer().asCharBuffer(); + } catch (IOException e) { + // TODO convert this to a runtime error? 
+ DevUtils.logger.log(Level.SEVERE, + " backingFileChannel.map() mapByteOffset=" + mapByteOffset + + " mapSize=" + mapSize + "\n" + "decodedFile=" + + decodedFile + " length=" + length + "\n" + + DevUtils.extraInfo(), e); + throw new RuntimeException(e); + } + } + + /** + * Converts the first Integer.MAX_VALUE characters from the + * file backingFilename from encoding encoding to + * encoding WRITE_ENCODING and saves as + * this.decodedFile, which is named backingFilename + * + "." + WRITE_ENCODING. + * + * @throws IOException + */ + protected void decode(InputStream inStream, int prefixMax, + String backingFilename, Charset charset) throws IOException { + + this.charset = charset; + + // TODO: consider if BufferedReader is helping any + // TODO: consider adding TBW 'LimitReader' to stop reading at + // Integer.MAX_VALUE characters because of charAt(int) limit + BufferedReader reader = new BufferedReader(new InputStreamReader( + inStream, charset)); + + logger.fine("backingFilename=" + backingFilename + " encoding=" + + charset + " decodedFile=" + decodedFile); + + this.prefixBuffer = CharBuffer.allocate(prefixMax); + + long count = 0; + while(count < prefixMax) { + int read = reader.read(prefixBuffer); + if(read<0) { + break; + } + count += read; + } + + int ch = reader.read(); + if(ch >= 0) { + count++; + + // more to decode to file overflow + this.decodedFile = new File(backingFilename + "." 
+ WRITE_ENCODING); + + FileOutputStream fos; + try { + fos = new FileOutputStream(this.decodedFile); + } catch (FileNotFoundException e) { + // Windows workaround attempt + System.gc(); + System.runFinalization(); + this.decodedFile = new File(decodedFile.getAbsolutePath()+".win"); + logger.info("Windows 'file with a user-mapped section open' " + + "workaround gc/finalization/name-extension performed."); + // try again + fos = new FileOutputStream(this.decodedFile); + } + + Writer writer = new OutputStreamWriter(fos,WRITE_ENCODING); + writer.write(ch); + count += IOUtils.copyLarge(reader, writer); + writer.close(); + reader.close(); + } + + this.length = Ints.saturatedCast(count); + if(count>Integer.MAX_VALUE) { + logger.warning("input stream is longer than Integer.MAX_VALUE=" + + NumberFormat.getInstance().format(Integer.MAX_VALUE) + + " characters -- only first " + + NumberFormat.getInstance().format(Integer.MAX_VALUE) + + " are accessible through this GenericReplayCharSequence"); + } + + logger.fine("decode: decoded " + count + " characters" + + ((decodedFile==null) ? "" + : " ("+(count-prefixBuffer.length())+" to "+decodedFile+")")); + } + + /** + * Get character at passed absolute position. + * @param index Index into content + * @return Character at offset index. 
+ */ + public char charAt(int index) { + if (index < 0 || index >= this.length()) { + throw new IndexOutOfBoundsException("index=" + index + + " - should be between 0 and length()=" + this.length()); + } + + // is it in the buffer + if (index < prefixBuffer.limit()) { + return prefixBuffer.get(index); + } + + // otherwise we gotta get it from disk via memory map + long charFileIndex = (long) index - (long) prefixBuffer.limit(); + long charFileLength = (long) this.length() - (long) prefixBuffer.limit(); // in characters + if (charFileIndex * bytesPerChar < mapByteOffset) { + logger.log(Level.WARNING,"left-fault; probably don't want to use CharSequence that far backward"); + } + if (charFileIndex * bytesPerChar < mapByteOffset + || charFileIndex - (mapByteOffset / bytesPerChar) >= mappedBuffer.limit()) { + // fault + /* + * mapByteOffset is bounded by 0 and file size +/- size of the map, + * and starts as close to fileIndex - + * MAP_TARGET_LEFT_PADDING_BYTES as it can while also not + * being smaller than it needs to be. + */ + mapByteOffset = Math.min(charFileIndex * bytesPerChar - MAP_TARGET_LEFT_PADDING_BYTES, + charFileLength * bytesPerChar - MAP_MAX_BYTES); + mapByteOffset = Math.max(0, mapByteOffset); + updateMemoryMappedBuffer(); + } + + return mappedBuffer.get((int)(charFileIndex-(mapByteOffset/bytesPerChar))); + } + + public CharSequence subSequence(int start, int end) { + return new CharSubSequence(this, start, end); + } + + private void deleteFile(File fileToDelete) { + deleteFile(fileToDelete, null); + } + + private void deleteFile(File fileToDelete, final Exception e) { + if (e != null) { + // Log why the delete to help with debug of + // java.io.FileNotFoundException: + // ....tt53http.ris.UTF-16BE. 
+ logger.severe("Deleting " + fileToDelete + " because of " + + e.toString()); + } + if (fileToDelete != null && fileToDelete.exists()) { + logger.fine("deleting file: " + fileToDelete); + fileToDelete.delete(); + } + } + + + @Override + public boolean isOpen() { + return this.isOpen; + } + + public void close() throws IOException { + this.isOpen = false; + + logger.fine("closing"); + + if (this.backingFileChannel != null && this.backingFileChannel.isOpen()) { + this.backingFileChannel.close(); + } + if (backingFileIn != null) { + backingFileIn.close(); + } + + deleteFile(this.decodedFile); + + // clear decodedFile -- so that double-close (as in finalize()) won't + // delete a later instance with same name see bug [ 1218961 ] + // "failed get of replay" in ExtractorHTML... usu: UTF-16BE + this.decodedFile = null; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#finalize() + */ + protected void finalize() throws Throwable { + super.finalize(); + logger.fine("finalizing"); + close(); + } + + /** + * Convenience method for getting a substring. 
+ * + * @deprecated please use subSequence() and then toString() directly + */ + public String substring(int offset, int len) { + return subSequence(offset, offset + len).toString(); + } + + public String toString() { + StringBuilder sb = new StringBuilder(this.length()); + sb.append(this); + return sb.toString(); + } + + public int length() { + return length; + } + + /* (non-Javadoc) + * @see org.archive.io.ReplayCharSequence#getDecodeExceptionCount() + */ + @Override + public long getDecodeExceptionCount() { + return decodingExceptions; + } + + + /* (non-Javadoc) + * @see org.archive.io.ReplayCharSequence#getCodingException() + */ + @Override + public CharacterCodingException getCodingException() { + return codingException; + } + + /* (non-Javadoc) + * @see org.archive.io.ReplayCharSequence#getCharset() + */ + public Charset getCharset() { + return charset; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/GzipHeader.java b/src/main/java/org/archive/io/GzipHeader.java new file mode 100644 index 00000000..6b8263bc --- /dev/null +++ b/src/main/java/org/archive/io/GzipHeader.java @@ -0,0 +1,26 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.io; + +/** + * @deprecated use {@link org.archive.util.zip.GzipHeader} + */ +@Deprecated +public class GzipHeader extends org.archive.util.zip.GzipHeader { +} diff --git a/src/main/java/org/archive/io/HeaderedArchiveRecord.java b/src/main/java/org/archive/io/HeaderedArchiveRecord.java new file mode 100644 index 00000000..3cce595b --- /dev/null +++ b/src/main/java/org/archive/io/HeaderedArchiveRecord.java @@ -0,0 +1,423 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.PrintStream; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpParser; +import org.apache.commons.httpclient.StatusLine; +import org.apache.commons.httpclient.util.EncodingUtil; +import org.archive.io.arc.ARCConstants; +import org.archive.util.LaxHttpParser; + +/** + * An ArchiveRecord whose content has a preamble of RFC822-like headers: e.g. + * The ArchiveRecord is a http response that leads off with http response + * headers. Use this ArchiveRecord Decorator to get at the content headers and + * the header/content demarcation. 
+ * + * @author stack + * @author Olaf Freyer + */ +public class HeaderedArchiveRecord extends ArchiveRecord { + private int contentHeadersLength = -1; + private int statusCode = -1; + + /** + * Http header bytes. + * + * If non-null and bytes available, give out its contents before we + * go back to the underlying stream. + */ + private InputStream contentHeaderStream = null; + + /** + * Content headers. + * + * Only available after the reading of headers. + */ + private Header [] contentHeaders = null; + + + public HeaderedArchiveRecord(final ArchiveRecord ar) throws IOException { + super(ar); + } + + public HeaderedArchiveRecord(final ArchiveRecord ar, + final boolean readContentHeader) throws IOException { + super(ar); + if (readContentHeader) { + this.contentHeaderStream = readContentHeaders(); + } + } + + /** + * Skip over the the content headers if present. + * + * Subsequent reads will get the body. + * + *

Calling this method in the midst of reading the header + * will make for strange results. Otherwise, safe to call + * at any time though before reading any of the record + * content is only time that it makes sense. + * + *

After calling this method, you can call + * {@link #getContentHeaders()} to get the read http header. + * + * @throws IOException + */ + public void skipHttpHeader() throws IOException { + if (this.contentHeaderStream == null) { + return; + } + // Empty the contentHeaderStream + for (int available = this.contentHeaderStream.available(); + this.contentHeaderStream != null + && (available = this.contentHeaderStream.available()) > 0;) { + // We should be in this loop once only we should only do this + // buffer allocation once. + byte[] buffer = new byte[available]; + // The read nulls out httpHeaderStream when done with it so + // need check for null in the loop control line. + read(buffer, 0, available); + } + } + + public void dumpHttpHeader() throws IOException { + dumpHttpHeader(System.out); + } + + public void dumpHttpHeader(final PrintStream stream) throws IOException { + if (this.contentHeaderStream == null) { + return; + } + // Dump the httpHeaderStream to STDOUT + for (int available = this.contentHeaderStream.available(); + this.contentHeaderStream != null + && (available = this.contentHeaderStream.available()) > 0;) { + // We should be in this loop only once and should do this + // buffer allocation once. + byte[] buffer = new byte[available]; + // The read nulls out httpHeaderStream when done with it so + // need check for null in the loop control line. + int read = read(buffer, 0, available); + stream.write(buffer, 0, read); + } + } + + /** + * Read header if present. Technique borrowed from HttpClient HttpParse + * class. Using http parser code for now. Later move to more generic header + * parsing code if there proves a need. + * + * @return ByteArrayInputStream with the http header in it or null if no + * http header. + * @throws IOException + */ + private InputStream readContentHeaders() throws IOException { + // If judged a record that doesn't have an http header, return + // immediately. 
+ if (!hasContentHeaders()) { + return null; + } + byte [] statusBytes = LaxHttpParser.readRawLine(getIn()); + int eolCharCount = getEolCharsCount(statusBytes); + if (eolCharCount <= 0) { + throw new IOException("Failed to read raw lie where one " + + " was expected: " + new String(statusBytes)); + } + String statusLine = EncodingUtil.getString(statusBytes, 0, + statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); + if (statusLine == null) { + throw new NullPointerException("Expected status line is null"); + } + // TODO: Tighten up this test. + boolean isHttpResponse = StatusLine.startsWithHTTP(statusLine); + boolean isHttpRequest = false; + if (!isHttpResponse) { + isHttpRequest = statusLine.toUpperCase().startsWith("GET") || + !statusLine.toUpperCase().startsWith("POST"); + } + if (!isHttpResponse && !isHttpRequest) { + throw new UnexpectedStartLineIOException("Failed parse of " + + "status line: " + statusLine); + } + this.statusCode = isHttpResponse? + (new StatusLine(statusLine)).getStatusCode(): -1; + + // Save off all bytes read. Keep them as bytes rather than + // convert to strings so we don't have to worry about encodings + // though this should never be a problem doing http headers since + // its all supposed to be ascii. + ByteArrayOutputStream baos = + new ByteArrayOutputStream(statusBytes.length + 4 * 1024); + baos.write(statusBytes); + + // Now read rest of the header lines looking for the separation + // between header and body. + for (byte [] lineBytes = null; true;) { + lineBytes = LaxHttpParser.readRawLine(getIn()); + eolCharCount = getEolCharsCount(lineBytes); + if (eolCharCount <= 0) { + throw new IOException("Failed reading headers: " + + ((lineBytes != null)? new String(lineBytes): null)); + } + // Save the bytes read. + baos.write(lineBytes); + if ((lineBytes.length - eolCharCount) <= 0) { + // We've finished reading the http header. 
+ break; + } + } + + byte [] headerBytes = baos.toByteArray(); + // Save off where content body, post content headers, starts. + this.contentHeadersLength = headerBytes.length; + ByteArrayInputStream bais = + new ByteArrayInputStream(headerBytes); + if (!bais.markSupported()) { + throw new IOException("ByteArrayInputStream does not support mark"); + } + bais.mark(headerBytes.length); + // Read the status line. Don't let it into the parseHeaders function. + // It doesn't know what to do with it. + bais.read(statusBytes, 0, statusBytes.length); + this.contentHeaders = LaxHttpParser.parseHeaders(bais, + ARCConstants.DEFAULT_ENCODING); + bais.reset(); + return bais; + } + + public static class UnexpectedStartLineIOException + extends RecoverableIOException { + private static final long serialVersionUID = 1L; + + public UnexpectedStartLineIOException(final String reason) { + super(reason); + } + } + + /** + * @param bytes Array of bytes to examine for an EOL. + * @return Count of end-of-line characters or zero if none. + */ + private int getEolCharsCount(byte [] bytes) { + int count = 0; + if (bytes != null && bytes.length >=1 && + bytes[bytes.length - 1] == '\n') { + count++; + if (bytes.length >=2 && bytes[bytes.length -2] == '\r') { + count++; + } + } + return count; + } + + /** + * @return If headers are for a http response AND the headers have been + * read, return status code. Else return -1. + */ + public int getStatusCode() { + return this.statusCode; + } + + /** + * @return Returns length of content headers or -1 if headers have + * not yet been read. + */ + public int getContentHeadersLength() { + return this.contentHeadersLength; + } + + public Header[] getContentHeaders() { + return contentHeaders; + } + + /** + * @return Next character in this ARCRecord's content else -1 if at end of + * this record. 
+ * @throws IOException + */ + public int read() throws IOException { + int c = -1; + if (this.contentHeaderStream != null && + (this.contentHeaderStream.available() > 0)) { + // If http header, return bytes from it before we go to underlying + // stream. + c = this.contentHeaderStream.read(); + // If done with the header stream, null it out. + if (this.contentHeaderStream.available() <= 0) { + this.contentHeaderStream = null; + } + // do not increment position - + // the underlying ArchiveRecord stream allready did this + // incrementPosition(); + } else { + c = super.read(); + } + return c; + } + + public int read(byte [] b, int offset, int length) throws IOException { + int read = -1; + if (this.contentHeaderStream != null && + (this.contentHeaderStream.available() > 0)) { + // If http header, return bytes from it before we go to underlying + // stream. + read = Math.min(length, this.contentHeaderStream.available()); + if (read == 0) { + read = -1; + } else { + read = this.contentHeaderStream.read(b, offset, read); + } + // If done with the header stream, null it out. 
+ if (this.contentHeaderStream.available() <= 0) { + this.contentHeaderStream = null; + } + // do not increment position - + // the underlying ArchiveRecord stream allready did this + //incrementPosition(); + } else { + read = super.read(b, offset, length); + } + return read; + } + + @Override + public int available() { + return ((ArchiveRecord)this.in).available(); + } + + @Override + public void close() throws IOException { + ((ArchiveRecord)this.in).close(); + } + + @Override + public void dump() throws IOException { + ((ArchiveRecord)this.in).dump(); + } + + @Override + public void dump(OutputStream os) throws IOException { + ((ArchiveRecord)this.in).dump(os); + } + + @Override + protected String getDigest4Cdx(ArchiveRecordHeader h) { + return ((ArchiveRecord)this.in).getDigest4Cdx(h); + } + + @Override + public String getDigestStr() { + return ((ArchiveRecord)this.in).getDigestStr(); + } + + @Override + public ArchiveRecordHeader getHeader() { + return ((ArchiveRecord)this.in).getHeader(); + } + + @Override + protected String getIp4Cdx(ArchiveRecordHeader h) { + return ((ArchiveRecord)this.in).getIp4Cdx(h); + } + + @Override + protected String getMimetype4Cdx(ArchiveRecordHeader h) { + return ((ArchiveRecord)this.in).getMimetype4Cdx(h); + } + + @Override + public long getPosition() { + return ((ArchiveRecord)this.in).getPosition(); + } + + @Override + protected String getStatusCode4Cdx(ArchiveRecordHeader h) { + return ((ArchiveRecord)this.in).getStatusCode4Cdx(h); + } + + @Override + public boolean hasContentHeaders() { + return ((ArchiveRecord)this.in).hasContentHeaders(); + } + + @Override + protected void incrementPosition() { + ((ArchiveRecord)this.in).incrementPosition(); + } + + @Override + protected void incrementPosition(long incr) { + ((ArchiveRecord)this.in).incrementPosition(incr); + } + + @Override + protected boolean isEor() { + return ((ArchiveRecord)this.in).isEor(); + } + + @Override + public boolean isStrict() { + return 
((ArchiveRecord)this.in).isStrict(); + } + + @Override + public boolean markSupported() { + return ((ArchiveRecord)this.in).markSupported(); + } + + @Override + protected String outputCdx(String strippedFileName) throws IOException { + return ((ArchiveRecord)this.in).outputCdx(strippedFileName); + } + + @Override + protected void setEor(boolean eor) { + ((ArchiveRecord)this.in).setEor(eor); + } + + @Override + protected void setHeader(ArchiveRecordHeader header) { + ((ArchiveRecord)this.in).setHeader(header); + } + + @Override + public void setStrict(boolean strict) { + ((ArchiveRecord)this.in).setStrict(strict); + } + + @Override + protected void skip() throws IOException { + ((ArchiveRecord)this.in).skip(); + } + + @Override + public long skip(long n) throws IOException { + return ((ArchiveRecord)this.in).skip(n); + } +} diff --git a/src/main/java/org/archive/io/LoudObjectOutputStream.java b/src/main/java/org/archive/io/LoudObjectOutputStream.java new file mode 100644 index 00000000..959c2620 --- /dev/null +++ b/src/main/java/org/archive/io/LoudObjectOutputStream.java @@ -0,0 +1,63 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * ObjectOutputStream that logs class name of each object that is written
 * to the stream.  Useful for tracking down sources of NotSerializableException.
 *
 * Logging is done at INFO level, once per distinct class name per stream
 * instance; the objects themselves are written unchanged.
 *
 * @author pjack
 */
public class LoudObjectOutputStream extends ObjectOutputStream {

    private static final Logger LOGGER = Logger.getLogger(
        LoudObjectOutputStream.class.getName());

    /** Class names reported so far; only log each class name once. */
    private final Set<String> alreadyLogged = new HashSet<String>();

    /**
     * Wraps the passed stream and switches on object replacement so that
     * {@link #replaceObject(Object)} is consulted for every object written.
     *
     * @param out Stream to serialize to.
     * @throws IOException if the stream header cannot be written.
     */
    public LoudObjectOutputStream(OutputStream out) throws IOException {
        super(out);
        this.enableReplaceObject(true);
    }

    /**
     * Logs the class name of {@code obj} the first time that class is seen.
     *
     * @param obj Object about to be serialized; may be null.
     * @return {@code obj} itself -- nothing is ever substituted.
     * @throws IOException never thrown here; declared by the overridden method.
     */
    @Override
    protected Object replaceObject(Object obj) throws IOException {
        if (obj != null) {
            String name = obj.getClass().getName();
            // Set.add returns false for a name that was already recorded.
            if (alreadyLogged.add(name)) {
                LOGGER.info("WROTE: " + name);
            }
        }
        return obj;
    }
}
/**
 * Filter stream that keeps a running total of the bytes written through it
 * and that can be told to ignore flush() calls.
 *
 * @contributor gojomo
 */
public class MiserOutputStream extends FilterOutputStream {
    /** Total bytes handed to the wrapped stream so far. */
    protected long count;
    /** When false, {@link #flush()} does nothing. */
    protected boolean passFlushes;

    /**
     * Counting wrapper that forwards flush requests.
     *
     * @param out the output stream to be wrapped
     */
    public MiserOutputStream(OutputStream out) {
        this(out, true);
    }

    /**
     * Counting wrapper with explicit flush policy.
     *
     * @param out the output stream to be wrapped
     * @param passFlushes whether flush() should reach the wrapped stream
     */
    public MiserOutputStream(OutputStream out, boolean passFlushes) {
        super(out);
        this.passFlushes = passFlushes;
    }

    /** Returns the number of bytes written. */
    public long getCount() {
        return this.count;
    }

    @Override
    public void write(int b) throws IOException {
        this.out.write(b);
        this.count = this.count + 1;
    }

    @Override
    public void write(byte[] b, int off, int len) throws IOException {
        this.out.write(b, off, len);
        this.count = this.count + len;
    }

    @Override
    public void flush() throws IOException {
        if (!this.passFlushes) {
            // flush requests are being swallowed
            return;
        }
        super.flush();
    }

    @Override
    public void close() throws IOException {
        // re-enable flushing first so that closing pushes pending bytes out
        this.passFlushes = true;
        super.close();
    }
}
+ * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +/** + * @deprecated use {@link org.archive.util.zip.NoGzipMagicException} + */ +@Deprecated +public class NoGzipMagicException extends org.archive.util.zip.NoGzipMagicException { +} diff --git a/src/main/java/org/archive/io/ObjectPlusFilesInputStream.java b/src/main/java/org/archive/io/ObjectPlusFilesInputStream.java new file mode 100644 index 00000000..892860ed --- /dev/null +++ b/src/main/java/org/archive/io/ObjectPlusFilesInputStream.java @@ -0,0 +1,143 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.io; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.ObjectInputStream; +import java.util.Iterator; +import java.util.LinkedList; + +import org.archive.util.FileUtils; + + +/** + * Enhanced ObjectOutputStream with support for restoring + * files that had been saved, in parallel with object + * serialization. + * + * @author gojomo + * + */ +public class ObjectPlusFilesInputStream extends ObjectInputStream { + protected LinkedList auxiliaryDirectoryStack = new LinkedList(); + protected LinkedList postRestoreTasks = new LinkedList(); + + /** + * Instantiate over the given stream and using the supplied + * auxiliary storage directory. + * + * @param in + * @param storeDir + * @throws IOException + */ + public ObjectPlusFilesInputStream(InputStream in, File storeDir) + throws IOException { + super(in); + auxiliaryDirectoryStack.addFirst(storeDir); + } + + /** + * Push another default storage directory for use + * until popped. + * + * @param dir + */ + public void pushAuxiliaryDirectory(String dir) { + auxiliaryDirectoryStack. + addFirst(new File(getAuxiliaryDirectory(), dir)); + } + + /** + * Discard the top auxiliary directory. + */ + public void popAuxiliaryDirectory() { + auxiliaryDirectoryStack.removeFirst(); + } + + /** + * Return the top auxiliary directory, from + * which saved files are restored. + * + * @return Auxillary directory. + */ + public File getAuxiliaryDirectory() { + return (File)auxiliaryDirectoryStack.getFirst(); + } + + /** + * Restore a file from storage, using the name and length + * info on the serialization stream and the file from the + * current auxiliary directory, to the given File. 
+ * + * @param destination + * @throws IOException + */ + public void restoreFile(File destination) throws IOException { + String nameAsStored = readUTF(); + long lengthAtStoreTime = readLong(); + File storedFile = new File(getAuxiliaryDirectory(),nameAsStored); + FileUtils.copyFile(storedFile, destination, lengthAtStoreTime); + } + + /** + * Restore a file from storage, using the name and length + * info on the serialization stream and the file from the + * current auxiliary directory, to the given File. + * + * @param directory + * @throws IOException + */ + public void restoreFileTo(File directory) throws IOException { + String nameAsStored = readUTF(); + long lengthAtStoreTime = readLong(); + File storedFile = new File(getAuxiliaryDirectory(),nameAsStored); + File destination = new File(directory,nameAsStored); + FileUtils.copyFile(storedFile, destination, lengthAtStoreTime); + } + + /** + * Register a task to be done when the ObjectPlusFilesInputStream + * is closed. + * + * @param task + */ + public void registerFinishTask(Runnable task) { + postRestoreTasks.addFirst(task); + } + + private void doFinishTasks() { + Iterator iter = postRestoreTasks.iterator(); + while(iter.hasNext()) { + ((Runnable)iter.next()).run(); + } + } + + /** + * In addition to default, do any registered cleanup tasks. + * + * @see java.io.InputStream#close() + */ + public void close() throws IOException { + super.close(); + doFinishTasks(); + } +} diff --git a/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java b/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java new file mode 100644 index 00000000..224f24e7 --- /dev/null +++ b/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java @@ -0,0 +1,134 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. 
+ * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.File; +import java.io.IOException; +import java.io.ObjectOutputStream; +import java.io.OutputStream; +import java.util.LinkedList; + +import org.archive.util.FileUtils; + + +/** + * Enhanced ObjectOutputStream which maintains (a stack of) auxiliary + * directories and offers convenience methods for serialized objects + * to save their related disk files alongside their serialized version. + * + * @author gojomo + */ +public class ObjectPlusFilesOutputStream extends ObjectOutputStream { + protected LinkedList auxiliaryDirectoryStack = new LinkedList(); + + /** + * Constructor + * + * @param out + * @param topDirectory + * @throws java.io.IOException + */ + public ObjectPlusFilesOutputStream(OutputStream out, File topDirectory) throws IOException { + super(out); + auxiliaryDirectoryStack.addFirst(topDirectory); + } + + /** + * Add another subdirectory for any file-capture needs during the + * current serialization. + * + * @param dir + */ + public void pushAuxiliaryDirectory(String dir) { + auxiliaryDirectoryStack.addFirst(new File(getAuxiliaryDirectory(),dir)); + } + + /** + * Remove the top subdirectory. + * + */ + public void popAuxiliaryDirectory() { + auxiliaryDirectoryStack.removeFirst(); + } + + /** + * Return the current auxiliary directory for storing + * files associated with serialized objects. + * + * @return Auxillary directory. 
+ */ + public File getAuxiliaryDirectory() { + return (File)auxiliaryDirectoryStack.getFirst(); + } + + /** + * Store a snapshot of an object's supporting file to the + * current auxiliary directory. Should only be used for + * files which are strictly appended-to, because it tries + * to use a "hard link" where possible (meaning that + * future edits to the original file's contents will + * also affect the snapshot). + * + * Remembers current file extent to allow a future restore + * to ignore subsequent appended data. + * + * @param file + * @throws IOException + */ + public void snapshotAppendOnlyFile(File file) throws IOException { + // write filename + String name = file.getName(); + writeUTF(name); + // write current file length + writeLong(file.length()); + File auxDir = getAuxiliaryDirectory(); + if(!auxDir.exists()) { + FileUtils.ensureWriteableDirectory(auxDir); + } + File destination = new File(auxDir,name); + hardlinkOrCopy(file, destination); + } + + /** + * Create a backup of this given file, first by trying a "hard + * link", then by using a copy if hard linking is unavailable + * (either because it is unsupported or the origin and checkpoint + * directories are on different volumes). + * + * @param file + * @param destination + * @throws IOException + */ + private void hardlinkOrCopy(File file, File destination) throws IOException { + // For Linux/UNIX, try a hard link first. 
+ Process link = Runtime.getRuntime().exec("ln "+file.getAbsolutePath()+" "+destination.getAbsolutePath()); + // TODO NTFS also supports hard links; add appropriate try + try { + link.waitFor(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if(link.exitValue()!=0) { + // hard link failed + FileUtils.copyFile(file,destination); + } + } + +} diff --git a/src/main/java/org/archive/io/OriginSeekInputStream.java b/src/main/java/org/archive/io/OriginSeekInputStream.java new file mode 100644 index 00000000..00605d82 --- /dev/null +++ b/src/main/java/org/archive/io/OriginSeekInputStream.java @@ -0,0 +1,121 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +import java.io.IOException; + + +/** + * Alters the origin of some other SeekInputStream. This class allows you + * to completely ignore everything in the underlying stream before a specified + * position, the origin position. + * + *

With the exception of {@link #position()} and {@link position(long)}, + * all of the methods in this class simply delegate to the underlying input + * stream. The position methods adjust the position of the + * underlying stream relative to the origin specified at construction time. + * + * @author pjack + */ +public class OriginSeekInputStream extends SeekInputStream { + + + /** + * The underlying stream. + */ + final private SeekInputStream input; + + + /** + * The origin position. In other words, this.position(0) + * resolves to input.position(start). + */ + final private long origin; + + + /** + * Constructor. + * + * @param input the underlying stream + * @param origin the origin position + * @throws IOException if an IO error occurs + */ + public OriginSeekInputStream(SeekInputStream input, long origin) + throws IOException { + this.input = input; + this.origin = origin; + input.position(origin); + } + + + @Override + public int available() throws IOException { + return input.available(); + } + + + @Override + public int read() throws IOException { + return input.read(); + } + + + @Override + public int read(byte[] buf, int ofs, int len) throws IOException { + return input.read(buf, ofs, len); + } + + + @Override + public int read(byte[] buf) throws IOException { + return input.read(buf); + } + + + @Override + public long skip(long count) throws IOException { + return input.skip(count); + } + + + /** + * Returns the position of the underlying stream relative to the origin. + * + * @return the relative position + * @throws IOException if an IO error occurs + */ + public long position() throws IOException { + return input.position() - origin; + } + + + /** + * Positions the underlying stream relative to the origin. + * In other words, this.position(0) resolves to input.position(origin), + * where input is underlying stream and origin is the origin specified + * at construction time. 
+ * + * @param p the new position for this stream + * @throws IOException if an IO error occurs + */ + public void position(long p) throws IOException { + input.position(p + origin); + } +} diff --git a/src/main/java/org/archive/io/Preformatter.java b/src/main/java/org/archive/io/Preformatter.java new file mode 100644 index 00000000..dcd31bb6 --- /dev/null +++ b/src/main/java/org/archive/io/Preformatter.java @@ -0,0 +1,32 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.util.logging.LogRecord; + +/** + * Interface indicating a logging Formatter can preformat a record (outside + * the standard-implementation synchronized block) and cache it, returning it + * for the next request for formatting from the same thread. + * @contributor gojomo + */ +public interface Preformatter { + public void preformat(LogRecord record); + public void clear(); +} diff --git a/src/main/java/org/archive/io/RandomAccessInputStream.java b/src/main/java/org/archive/io/RandomAccessInputStream.java new file mode 100644 index 00000000..d8dd260b --- /dev/null +++ b/src/main/java/org/archive/io/RandomAccessInputStream.java @@ -0,0 +1,180 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). 
+ * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; + + +/** + * Wraps a RandomAccessFile with an InputStream interface. + * + * @author gojomo + */ +public class RandomAccessInputStream extends SeekInputStream { + + /** + * Reference to the random access file this stream is reading from. + */ + private RandomAccessFile raf = null; + + /** + * When mark is called, save here the current position so we can go back + * on reset. + */ + private long markpos = -1; + + /** + * True if we are to close the underlying random access file when this + * stream is closed. + */ + private boolean sympathyClose; + + /** + * Constructor. + * + * If using this constructor, caller created the RAF and therefore + * its assumed wants to control close of the RAF. The RAF.close + * is not called if this constructor is used on close of this stream. + * + * @param raf RandomAccessFile to wrap. + * @throws IOException + */ + public RandomAccessInputStream(RandomAccessFile raf) + throws IOException { + this(raf, false, 0); + } + + /** + * Constructor. + * + * @param file File to get RAFIS on. Creates an RAF from passed file. + * Closes the created RAF when this stream is closed. 
+ * @throws IOException + */ + public RandomAccessInputStream(final File file) + throws IOException { + this(new RandomAccessFile(file, "r"), true, 0); + } + + /** + * Constructor. + * + * @param file File to get RAFIS on. Creates an RAF from passed file. + * Closes the created RAF when this stream is closed. + * @param offset + * @throws IOException + */ + public RandomAccessInputStream(final File file, final long offset) + throws IOException { + this(new RandomAccessFile(file, "r"), true, offset); + } + + /** + * @param raf RandomAccessFile to wrap. + * @param sympathyClose Set to true if we are to close the RAF + * file when this stream is closed. + * @param offset + * @throws IOException + */ + public RandomAccessInputStream(final RandomAccessFile raf, + final boolean sympathyClose, final long offset) + throws IOException { + super(); + this.sympathyClose = sympathyClose; + this.raf = raf; + if (offset > 0) { + this.raf.seek(offset); + } + } + + /* (non-Javadoc) + * @see java.io.InputStream#read() + */ + public int read() throws IOException { + return this.raf.read(); + } + + /* (non-Javadoc) + * @see java.io.InputStream#read(byte[], int, int) + */ + public int read(byte[] b, int off, int len) throws IOException { + return this.raf.read(b, off, len); + } + + /* (non-Javadoc) + * @see java.io.InputStream#read(byte[]) + */ + public int read(byte[] b) throws IOException { + return this.raf.read(b); + } + + /* (non-Javadoc) + * @see java.io.InputStream#skip(long) + */ + public long skip(long n) throws IOException { + this.raf.seek(this.raf.getFilePointer() + n); + return n; + } + + public long position() throws IOException { + return this.raf.getFilePointer(); + } + + public void position(long position) throws IOException { + this.raf.seek(position); + } + + public int available() throws IOException { + long amount = this.raf.length() - this.position(); + return (amount >= Integer.MAX_VALUE)? 
Integer.MAX_VALUE: (int)amount; + } + + public boolean markSupported() { + return true; + } + + public synchronized void mark(int readlimit) { + try { + this.markpos = position(); + } catch (IOException e) { + // Set markpos to -1. Will cause exception reset. + this.markpos = -1; + } + } + + public synchronized void reset() throws IOException { + if (this.markpos == -1) { + throw new IOException("Mark has not been set."); + } + position(this.markpos); + } + + public void close() throws IOException { + try { + super.close(); + } finally { + if (this.sympathyClose) { + this.raf.close(); + } + } + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/RandomAccessOutputStream.java b/src/main/java/org/archive/io/RandomAccessOutputStream.java new file mode 100644 index 00000000..225f995f --- /dev/null +++ b/src/main/java/org/archive/io/RandomAccessOutputStream.java @@ -0,0 +1,69 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.RandomAccessFile; + + +/** + * Wraps a RandomAccessFile with OutputStream interface. 
+ * + * @author gojomo + */ +public class RandomAccessOutputStream extends OutputStream { + protected RandomAccessFile raf; + + /** + * Wrap the given RandomAccessFile + */ + public RandomAccessOutputStream(RandomAccessFile raf) { + super(); + this.raf = raf; + } + + /* (non-Javadoc) + * @see java.io.OutputStream#write(int) + */ + public void write(int b) throws IOException { + raf.write(b); + } + + /* (non-Javadoc) + * @see java.io.OutputStream#close() + */ + public void close() throws IOException { + raf.close(); + } + + /* (non-Javadoc) + * @see java.io.OutputStream#write(byte[], int, int) + */ + public void write(byte[] b, int off, int len) throws IOException { + raf.write(b, off, len); + } + + /* (non-Javadoc) + * @see java.io.OutputStream#write(byte[]) + */ + public void write(byte[] b) throws IOException { + raf.write(b); + } +} diff --git a/src/main/java/org/archive/io/ReadSource.java b/src/main/java/org/archive/io/ReadSource.java new file mode 100644 index 00000000..a3c29967 --- /dev/null +++ b/src/main/java/org/archive/io/ReadSource.java @@ -0,0 +1,37 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.Reader; + +/** + * Interface for objects that can provide a Reader view of their + * contents. 
+ * + */ +public interface ReadSource { + /** + * Obtain a Reader. Not named 'getReader' so that it is not + * considered a simple costless read-only property by + * bean-convention introspection tools. + * @return a Reader on this object + */ + Reader obtainReader(); +} diff --git a/src/main/java/org/archive/io/RecorderIOException.java b/src/main/java/org/archive/io/RecorderIOException.java new file mode 100644 index 00000000..07b30061 --- /dev/null +++ b/src/main/java/org/archive/io/RecorderIOException.java @@ -0,0 +1,38 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.io; + +import java.io.IOException; + +/** + * + * @author Gordon Mohr + */ +public class RecorderIOException extends IOException { + + private static final long serialVersionUID = 5907470275350314277L; + + public RecorderIOException() { + super(); + } + + public RecorderIOException(String msg) { + super(msg); + } +} diff --git a/src/main/java/org/archive/io/RecorderLengthExceededException.java b/src/main/java/org/archive/io/RecorderLengthExceededException.java new file mode 100644 index 00000000..8c3e067d --- /dev/null +++ b/src/main/java/org/archive/io/RecorderLengthExceededException.java @@ -0,0 +1,39 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +/** + * Indicates a length exception thrown by the Recorder. 
+ * + * @author Gordon Mohr + */ +public class RecorderLengthExceededException +extends RecorderIOException { + + private static final long serialVersionUID = 6655419033414648444L; + + public RecorderLengthExceededException() { + super(); + } + + public RecorderLengthExceededException(String msg) { + super(msg); + } +} diff --git a/src/main/java/org/archive/io/RecorderTimeoutException.java b/src/main/java/org/archive/io/RecorderTimeoutException.java new file mode 100644 index 00000000..32be5b5d --- /dev/null +++ b/src/main/java/org/archive/io/RecorderTimeoutException.java @@ -0,0 +1,37 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +/** + * Indicates a timeout thrown by the RecordingInputStream. 
+ * + * @author Gordon Mohr + */ +public class RecorderTimeoutException extends RecorderIOException { + + private static final long serialVersionUID = 7433214063765078269L; + + public RecorderTimeoutException() { + super(); + } + + public RecorderTimeoutException(String msg) { + super(msg); + } +} diff --git a/src/main/java/org/archive/io/RecorderTooMuchHeaderException.java b/src/main/java/org/archive/io/RecorderTooMuchHeaderException.java new file mode 100644 index 00000000..23f5d264 --- /dev/null +++ b/src/main/java/org/archive/io/RecorderTooMuchHeaderException.java @@ -0,0 +1,40 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.io; + + +/** + * Indicates a too much header material exception thrown by the Recorder + * (specificially the RecordingOutputStream) + * + * @author Gordon Mohr + */ +public class RecorderTooMuchHeaderException +extends RecorderIOException { + + private static final long serialVersionUID = 3528516034898129150L; + + public RecorderTooMuchHeaderException() { + super(); + } + + public RecorderTooMuchHeaderException(String msg) { + super(msg); + } +} diff --git a/src/main/java/org/archive/io/RecordingInputStream.java b/src/main/java/org/archive/io/RecordingInputStream.java new file mode 100644 index 00000000..b46905ed --- /dev/null +++ b/src/main/java/org/archive/io/RecordingInputStream.java @@ -0,0 +1,355 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.IOException; +import java.io.InputStream; +import java.net.SocketException; +import java.net.SocketTimeoutException; +import java.security.MessageDigest; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.io.IOUtils; + + +/** + * Stream which records all data read from it, which it acquires from a wrapped + * input stream. 
+ * + * Makes use of a RecordingOutputStream for recording because of its being + * file backed so we can write massive amounts of data w/o worrying about + * overflowing memory. + * + * @author gojomo + * + */ +public class RecordingInputStream + extends InputStream { + + protected static Logger logger = + Logger.getLogger("org.archive.io.RecordingInputStream"); + + /** + * Where we are recording to. + */ + private RecordingOutputStream recordingOutputStream; + + /** + * Stream to record. + */ + private InputStream in = null; + + /** + * Reusable buffer to avoid reallocation on each readFullyUntil + */ + protected byte[] drainBuffer = new byte[16*1024]; + + /** + * Create a new RecordingInputStream. + * + * @param bufferSize Size of buffer to use. + * @param backingFilename Name of backing file. + */ + public RecordingInputStream(int bufferSize, String backingFilename) + { + this.recordingOutputStream = new RecordingOutputStream(bufferSize, + backingFilename); + } + + public void open(InputStream wrappedStream) throws IOException { + logger.fine(Thread.currentThread().getName() + " opening " + + wrappedStream + ", " + Thread.currentThread().getName()); + if(isOpen()) { + // error; should not be opening/wrapping in an unclosed + // stream remains open + throw new IOException("RIS already open for " + +Thread.currentThread().getName()); + } + try { + this.in = wrappedStream; + this.recordingOutputStream.open(); + } catch (IOException ioe) { + close(); // ...and rethrow... 
+ throw ioe; + } + } + + public int read() throws IOException { + if (!isOpen()) { + throw new IOException("Stream closed " + + Thread.currentThread().getName()); + } + int b = this.in.read(); + if (b != -1) { + assert this.recordingOutputStream != null: "ROS is null " + + Thread.currentThread().getName(); + this.recordingOutputStream.write(b); + } + return b; + } + + public int read(byte[] b, int off, int len) throws IOException { + if (!isOpen()) { + throw new IOException("Stream closed " + + Thread.currentThread().getName()); + } + int count = this.in.read(b,off,len); + if (count > 0) { + assert this.recordingOutputStream != null: "ROS is null " + + Thread.currentThread().getName(); + this.recordingOutputStream.write(b,off,count); + } + return count; + } + + public int read(byte[] b) throws IOException { + if (!isOpen()) { + throw new IOException("Stream closed " + + Thread.currentThread().getName()); + } + int count = this.in.read(b); + if (count > 0) { + assert this.recordingOutputStream != null: "ROS is null " + + Thread.currentThread().getName(); + this.recordingOutputStream.write(b,0,count); + } + return count; + } + + public void close() throws IOException { + if (logger.isLoggable(Level.FINE)) { + logger.fine(Thread.currentThread().getName() + " closing " + + this.in + ", " + Thread.currentThread().getName()); + } + IOUtils.closeQuietly(this.in); + this.in = null; + IOUtils.closeQuietly(this.recordingOutputStream); + } + + public ReplayInputStream getReplayInputStream() throws IOException { + return this.recordingOutputStream.getReplayInputStream(); + } + + public ReplayInputStream getMessageBodyReplayInputStream() throws IOException { + return this.recordingOutputStream.getMessageBodyReplayInputStream(); + } + + public long readFully() throws IOException { + while(read(drainBuffer) != -1) { + // Empty out stream. 
+ continue; + } + return this.recordingOutputStream.getSize(); + } + + /** + * Read all of a stream (Or read until we timeout or have read to the max). + * @param softMaxLength Maximum length to read; if zero or < 0, then no + * limit. If met, return normally. + * @param hardMaxLength Maximum length to read; if zero or < 0, then no + * limit. If exceeded, throw RecorderLengthExceededException + * @param timeout Timeout in milliseconds for total read; if zero or + * negative, timeout is Long.MAX_VALUE. If exceeded, throw + * RecorderTimeoutException + * @param maxBytesPerMs How many bytes per millisecond. + * @throws IOException failed read. + * @throws RecorderLengthExceededException + * @throws RecorderTimeoutException + * @throws InterruptedException + */ + public void readFullyOrUntil(long softMaxLength) + throws IOException, RecorderLengthExceededException, + RecorderTimeoutException, InterruptedException { + // Check we're open before proceeding. + if (!isOpen()) { + // TODO: should this be a noisier exception-raising error? + return; + } + + long totalBytes = 0L; + long bytesRead = -1L; + long maxToRead = -1; + while (true) { + try { + // read no more than soft max + maxToRead = (softMaxLength <= 0) + ? drainBuffer.length + : Math.min(drainBuffer.length, softMaxLength - totalBytes); + // nor more than hard max + maxToRead = Math.min(maxToRead, recordingOutputStream.getRemainingLength()); + // but always at least 1 (to trigger hard max exception + maxToRead = Math.max(maxToRead, 1); + + bytesRead = read(drainBuffer,0,(int)maxToRead); + if (bytesRead == -1) { + break; + } + totalBytes += bytesRead; + + if (Thread.interrupted()) { + throw new InterruptedException("Interrupted during IO"); + } + } catch (SocketTimeoutException e) { + // A socket timeout is just a transient problem, meaning + // nothing was available in the configured timeout period, + // but something else might become available later. 
+ // Take this opportunity to check the overall + // timeout (below). One reason for this timeout is + // servers that keep up the connection, 'keep-alive', even + // though we asked them to not keep the connection open. + if (logger.isLoggable(Level.FINE)) { + logger.log(Level.FINE, "socket timeout", e); + } + // check for interrupt + if (Thread.interrupted()) { + throw new InterruptedException("Interrupted during IO"); + } + // check for overall timeout + recordingOutputStream.checkLimits(); + } catch (SocketException se) { + throw se; + } catch (NullPointerException e) { + // [ 896757 ] NPEs in Andy's Th-Fri Crawl. + // A crawl was showing NPE's in this part of the code but can + // not reproduce. Adding this rethrowing catch block w/ + // diagnostics to help should we come across the problem in the + // future. + throw new NullPointerException("Stream " + this.in + ", " + + e.getMessage() + " " + Thread.currentThread().getName()); + } + + // if have read 'enough', just finish + if (softMaxLength > 0 && totalBytes >= softMaxLength) { + break; // return + } + } + } + + public long getSize() { + return this.recordingOutputStream.getSize(); + } + + public void markContentBegin() { + this.recordingOutputStream.markMessageBodyBegin(); + } + + public long getContentBegin() { + return this.recordingOutputStream.getMessageBodyBegin(); + } + + public void startDigest() { + this.recordingOutputStream.startDigest(); + } + + /** + * Convenience method for setting SHA1 digest. + */ + public void setSha1Digest() { + this.recordingOutputStream.setSha1Digest(); + } + + /** + * Sets a digest algorithm which may be applied to recorded data. + * As usually only a subset of the recorded data should + * be fed to the digest, you must also call startDigest() + * to begin digesting. + * + * @param algorithm + */ + public void setDigest(String algorithm) { + this.recordingOutputStream.setDigest(algorithm); + } + + /** + * Sets a digest function which may be applied to recorded data. 
+ * As usually only a subset of the recorded data should + * be fed to the digest, you must also call startDigest() + * to begin digesting. + * + * @param md + */ + public void setDigest(MessageDigest md) { + this.recordingOutputStream.setDigest(md); + } + + /** + * Return the digest value for any recorded, digested data. Call + * only after all data has been recorded; otherwise, the running + * digest state is ruined. + * + * @return the digest final value + */ + public byte[] getDigestValue() { + return this.recordingOutputStream.getDigestValue(); + } + + public long getResponseContentLength() { + return this.recordingOutputStream.getResponseContentLength(); + } + + public void closeRecorder() throws IOException { + this.recordingOutputStream.closeRecorder(); + } + + /** + * @return True if we've been opened. + */ + public boolean isOpen() + { + return this.in != null; + } + + @Override + public synchronized void mark(int readlimit) { + this.in.mark(readlimit); + this.recordingOutputStream.mark(); + } + + @Override + public boolean markSupported() { + return this.in.markSupported(); + } + + @Override + public synchronized void reset() throws IOException { + this.in.reset(); + this.recordingOutputStream.reset(); + } + + /** + * Set limits to be enforced by internal recording-out + */ + public void setLimits(long hardMax, long timeoutMs, long maxRateKBps) { + recordingOutputStream.setLimits(hardMax, timeoutMs, maxRateKBps); + } + + /** + * Expose the amount of in-memory buffering used by the internal + * recording stream. 
+ * @return int buffer size + */ + public int getRecordedBufferLength() { + return recordingOutputStream.getBufferLength(); + } + + public void clearForReuse() throws IOException { + recordingOutputStream.clearForReuse(); + } +} diff --git a/src/main/java/org/archive/io/RecordingOutputStream.java b/src/main/java/org/archive/io/RecordingOutputStream.java new file mode 100644 index 00000000..4d0713da --- /dev/null +++ b/src/main/java/org/archive/io/RecordingOutputStream.java @@ -0,0 +1,576 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import it.unimi.dsi.fastutil.io.FastBufferedOutputStream; + +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.logging.Level; +import java.util.logging.Logger; + + +/** + * An output stream that records all writes to wrapped output + * stream. + * + * A RecordingOutputStream can be wrapped around any other + * OutputStream to record all bytes written to it. You can + * then request a ReplayInputStream to read those bytes. + * + *

The RecordingOutputStream uses an in-memory buffer and + * backing disk file to allow it to record streams of + * arbitrary length limited only by available disk space. + * + *

As long as the stream recorded is smaller than the + * in-memory buffer, no disk access will occur. + * + *

Recorded content can be recovered as a ReplayInputStream + * (via getReplayInputStream() or, for only the content after + * the message-body-begin mark is set, getMessageBodyReplayInputStream()) + * or as a ReplayCharSequence (via getReplayCharSequence()). + * + *

This class is also used as a straight output stream + * by {@link RecordingInputStream} to which it records all reads. + * {@link RecordingInputStream} is exploiting the file backed buffer + * facility of this class passing null for the stream + * to wrap. TODO: Make a FileBackedOutputStream class that is + * subclassed by RecordingInputStream. + * + * @author gojomo + * + */ +public class RecordingOutputStream extends OutputStream { + protected static Logger logger = + Logger.getLogger(RecordingOutputStream.class.getName()); + + /** + * Size of recording. + * + * Later passed to ReplayInputStream on creation. It uses it to know when + * EOS. + */ + protected long size = 0; + + protected String backingFilename; + protected OutputStream diskStream = null; + + /** + * Buffer we write recordings to. + * + * We write all recordings here first till its full. Thereafter we + * write the backing file. + */ + private byte[] buffer; + + /** current virtual position in the recording */ + private long position; + + /** flag to disable recording */ + private boolean recording; + + /** + * Reusable buffer for FastBufferedOutputStream + */ + protected byte[] bufStreamBuf = + new byte [ FastBufferedOutputStream.DEFAULT_BUFFER_SIZE ]; + + /** + * True if we're to digest content. + */ + private boolean shouldDigest = false; + + /** + * Digest instance. + */ + private MessageDigest digest = null; + + /** + * Define for SHA1 algarithm. + */ + private static final String SHA1 = "SHA1"; + + /** + * Maximum amount of header material to accept without the content + * body beginning -- if more, throw a RecorderTooMuchHeaderException. + * TODO: make configurable? make smaller? 
+ */ + protected static final long MAX_HEADER_MATERIAL = 1024*1024; // 1MB + + // configurable max length, max time limits + /** maximum length of material to record before throwing exception */ + protected long maxLength = Long.MAX_VALUE; + /** maximum time to record before throwing exception */ + protected long timeoutMs = Long.MAX_VALUE; + /** maximum rate to record (adds delays to hit target rate) */ + protected long maxRateBytesPerMs = Long.MAX_VALUE; + /** time recording begins for timeout, rate calculations */ + protected long startTime = Long.MAX_VALUE; + + /** + * When recording HTTP, where the content-body starts. + */ + protected long messageBodyBeginMark; + + /** + * Stream to record. + */ + private OutputStream out = null; + + // mark/reset support + /** furthest position reached before any reset()s */ + private long maxPosition = 0; + /** remembered position to reset() to */ + private long markPosition = 0; + + /** + * Create a new RecordingOutputStream. + * + * @param bufferSize Buffer size to use. + * @param backingFilename Name of backing file to use. + */ + public RecordingOutputStream(int bufferSize, String backingFilename) { + this.buffer = new byte[bufferSize]; + this.backingFilename = backingFilename; + recording = true; + } + + /** + * Wrap the given stream, both recording and passing along any data written + * to this RecordingOutputStream. + * + * @throws IOException If failed creation of backing file. + */ + public void open() throws IOException { + this.open(null); + } + + /** + * Wrap the given stream, both recording and passing along any data written + * to this RecordingOutputStream. + * + * @param wrappedStream Stream to wrap. May be null for case where we + * want to write to a file backed stream only. + * + * @throws IOException If failed creation of backing file. 
+ */ + public void open(OutputStream wrappedStream) throws IOException { + if(isOpen()) { + // error; should not be opening/wrapping in an unclosed + // stream remains open + throw new IOException("ROS already open for " + +Thread.currentThread().getName()); + } + clearForReuse(); + this.out = wrappedStream; + if (this.diskStream == null) { + // TODO: Fix so we only make file when its actually needed. + FileOutputStream fis = new FileOutputStream(this.backingFilename); + + this.diskStream = new RecyclingFastBufferedOutputStream(fis, bufStreamBuf); + } + startTime = System.currentTimeMillis(); + } + + public void write(int b) throws IOException { + if(position<maxPosition) { + // revisiting previous content; do nothing but advance position + position++; + return; + } + if(recording) { + record(b); + } + if (this.out != null) { + this.out.write(b); + } + checkLimits(); + } + + public void write(byte[] b, int off, int len) throws IOException { + if(position < maxPosition) { + if(position+len<=maxPosition) { + // revisiting; do nothing but advance position + position += len; + return; + } + // consume part of the array doing nothing but advancing position + long consumeRange = maxPosition - position; + position += consumeRange; + off += consumeRange; + len -= consumeRange; + } + if(recording) { + record(b, off, len); + } + if (this.out != null) { + this.out.write(b, off, len); + } + checkLimits(); + } + + /** + * Check any enforced limits. + */ + protected void checkLimits() throws RecorderIOException { + // too much material before finding end of headers? + if (messageBodyBeginMark<0) { + // no mark yet + if(position>MAX_HEADER_MATERIAL) { + throw new RecorderTooMuchHeaderException(); + } + } + // overlong? + if(position>maxLength) { + throw new RecorderLengthExceededException(); + } + // taking too long? + long duration = System.currentTimeMillis() - startTime; + duration = Math.max(duration,1); // !divzero + if(duration>timeoutMs) { + throw new RecorderTimeoutException(); + } + // need to throttle reading to hit max configured rate?
+ if(position/duration > maxRateBytesPerMs) { + long desiredDuration = position / maxRateBytesPerMs; + try { + Thread.sleep(desiredDuration-duration); + } catch (InterruptedException e) { + logger.log(Level.WARNING, + "bandwidth throttling sleep interrupted", e); + } + } + } + + /** + * Record the given byte for later recovery + * + * @param b Int to record. + * + * @exception IOException Failed write to backing file. + */ + private void record(int b) throws IOException { + if (this.shouldDigest) { + this.digest.update((byte)b); + } + if (this.position >= this.buffer.length) { + // TODO: Its possible to call write w/o having first opened a + // stream. Protect ourselves against this. + assert this.diskStream != null: "Diskstream is null"; + this.diskStream.write(b); + } else { + this.buffer[(int) this.position] = (byte) b; + } + this.position++; + } + + /** + * Record the given byte-array range for recovery later + * + * @param b Buffer to record. + * @param off Offset into buffer at which to start recording. + * @param len Length of buffer to record. + * + * @exception IOException Failed write to backing file. + */ + private void record(byte[] b, int off, int len) throws IOException { + if(this.shouldDigest) { + assert this.digest != null: "Digest is null."; + this.digest.update(b, off, len); + } + tailRecord(b, off, len); + } + + /** + * Record without digesting. + * + * @param b Buffer to record. + * @param off Offset into buffer at which to start recording. + * @param len Length of buffer to record. + * + * @exception IOException Failed write to backing file. + */ + private void tailRecord(byte[] b, int off, int len) throws IOException { + if(this.position >= this.buffer.length){ + // TODO: Its possible to call write w/o having first opened a + // stream. Lets protect ourselves against this. 
+ if (this.diskStream == null) { + throw new IOException("diskstream is null"); + } + this.diskStream.write(b, off, len); + this.position += len; + } else { + assert this.buffer != null: "Buffer is null"; + int toCopy = (int)Math.min(this.buffer.length - this.position, len); + assert b != null: "Passed buffer is null"; + System.arraycopy(b, off, this.buffer, (int)this.position, toCopy); + this.position += toCopy; + // TODO verify these are +1 -1 right + if (toCopy < len) { + tailRecord(b, off + toCopy, len - toCopy); + } + } + } + + public void close() throws IOException { + if(messageBodyBeginMark<0) { + // if unset, consider 0 posn as content-start + // (so that a -1 never survives to replay step) + messageBodyBeginMark = 0; + } + if (this.out != null) { + this.out.close(); + this.out = null; + } + closeRecorder(); + } + + protected synchronized void closeDiskStream() + throws IOException { + if (this.diskStream != null) { + this.diskStream.close(); + this.diskStream = null; + } + } + + public void closeRecorder() throws IOException { + recording = false; + closeDiskStream(); // if any + // This setting of size is important. Its passed to ReplayInputStream + // on creation. It uses it to know EOS. + if (this.size == 0) { + this.size = this.position; + } + } + + /* (non-Javadoc) + * @see java.io.OutputStream#flush() + */ + public void flush() throws IOException { + if (this.out != null) { + this.out.flush(); + } + if (this.diskStream != null) { + this.diskStream.flush(); + } + } + + public ReplayInputStream getReplayInputStream() throws IOException { + return getReplayInputStream(0); + } + + public ReplayInputStream getReplayInputStream(long skip) throws IOException { + // If this method is being called, then assumption must be that the + // stream is closed. If it ain't, then the stream gotten won't work + // -- the size will zero so any attempt at a read will get back EOF. 
+ assert this.out == null: "Stream is still open."; + ReplayInputStream replay = new ReplayInputStream(this.buffer, + this.size, this.messageBodyBeginMark, this.backingFilename); + replay.skip(skip); + return replay; + } + + /** + * Return a replay stream, cued up to begining of content + * + * @throws IOException + * @return An RIS. + */ + public ReplayInputStream getMessageBodyReplayInputStream() throws IOException { + return getReplayInputStream(this.messageBodyBeginMark); + } + + public long getSize() { + return this.size; + } + + /** + * Remember the current position as the start of the "message + * body". Useful when recording HTTP traffic as a way to start + * replays after the headers. + */ + public void markMessageBodyBegin() { + this.messageBodyBeginMark = this.position; + startDigest(); + } + + /** + * Return stored message-body-begin-mark (which is also end-of-headers) + */ + public long getMessageBodyBegin() { + return this.messageBodyBeginMark; + } + + /** + * Starts digesting recorded data, if a MessageDigest has been + * set. + */ + public void startDigest() { + if (this.digest != null) { + this.digest.reset(); + this.shouldDigest = true; + } + } + + /** + * Convenience method for setting SHA1 digest. + * @see #setDigest(String) + */ + public void setSha1Digest() { + setDigest(SHA1); + } + + + /** + * Sets a digest function which may be applied to recorded data. + * The difference between calling this method and {@link #setDigest(MessageDigest)} + * is that this method tries to reuse MethodDigest instance if already allocated + * and of appropriate algorithm. + * @param algorithm Message digest algorithm to use. + * @see #setDigest(MessageDigest) + */ + public void setDigest(String algorithm) { + try { + // Reuse extant digest if its sha1 algorithm. 
+ if (this.digest == null || + !this.digest.getAlgorithm().equals(algorithm)) { + setDigest(MessageDigest.getInstance(algorithm)); + } + } catch (NoSuchAlgorithmException e) { + e.printStackTrace(); + } + } + + /** + * Sets a digest function which may be applied to recorded data. + * + * As usually only a subset of the recorded data should + * be fed to the digest, you must also call startDigest() + * to begin digesting. + * + * @param md Message digest function to use. + */ + public void setDigest(MessageDigest md) { + this.digest = md; + } + + /** + * Return the digest value for any recorded, digested data. Call + * only after all data has been recorded; otherwise, the running + * digest state is ruined. + * + * @return the digest final value + */ + public byte[] getDigestValue() { + if(this.digest == null) { + return null; + } + return this.digest.digest(); + } + + public long getResponseContentLength() { + return this.size - this.messageBodyBeginMark; + } + + /** + * @return True if this ROS is open. + */ + public boolean isOpen() { + return this.out != null; + } + + public int getBufferLength() { + return this.buffer.length; + } + + /** + * When used alongside a mark-supporting RecordingInputStream, remember + * a position reachable by a future reset(). + */ + public void mark() { + // remember this position for subsequent reset() + this.markPosition = position; + } + + /** + * When used alongside a mark-supporting RecordingInputStream, reset + * the position to that saved by previous mark(). Until the position + * again reached "new" material, none of the bytes pushed to this + * stream will be digested or recorded. + */ + public void reset() { + // take note of furthest-position-reached to avoid double-recording + maxPosition = Math.max(maxPosition, position); + // reset to previous position + position = markPosition; + } + + /** + * Set limits on length, time, and rate to enforce. 
+ * + * @param length + * @param milliseconds + * @param rateKBps + */ + public void setLimits(long length, long milliseconds, long rateKBps) { + maxLength = (length>0) ? length : Long.MAX_VALUE; + timeoutMs = (milliseconds>0) ? milliseconds : Long.MAX_VALUE; + maxRateBytesPerMs = (rateKBps>0) ? rateKBps*1024/1000 : Long.MAX_VALUE; + } + + /** + * Reset limits to effectively-unlimited defaults + */ + public void resetLimits() { + maxLength = Long.MAX_VALUE; + timeoutMs = Long.MAX_VALUE; + maxRateBytesPerMs = Long.MAX_VALUE; + } + + /** + * Return number of bytes that could be recorded without hitting + * length limit + * + * @return long byte count + */ + public long getRemainingLength() { + return maxLength - position; + } + + public void clearForReuse() throws IOException { + this.out = null; + this.position = 0; + this.markPosition = 0; + this.maxPosition = 0; + this.size = 0; + this.messageBodyBeginMark = -1; + // ensure recording turned on + this.recording = true; + // Always begins false; must use startDigest() to begin + this.shouldDigest = false; + if (this.diskStream != null) { + closeDiskStream(); + } + } +} + diff --git a/src/main/java/org/archive/io/RecoverableIOException.java b/src/main/java/org/archive/io/RecoverableIOException.java new file mode 100644 index 00000000..5ce2251a --- /dev/null +++ b/src/main/java/org/archive/io/RecoverableIOException.java @@ -0,0 +1,83 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.IOException; +import java.io.PrintStream; +import java.io.PrintWriter; + +/** + * A decorator on IOException for IOEs that are likely not fatal or at least + * merit retry. + * @author stack + * @version $Date$, $Revision$ + */ +public class RecoverableIOException extends IOException { + private static final long serialVersionUID = 6194776587381865451L; + private final IOException decoratedIOException; + + public RecoverableIOException(final String message) { + this(new IOException(message)); + } + + public RecoverableIOException(final IOException ioe) { + super(); + this.decoratedIOException = ioe; + } + + public Throwable getCause() { + return this.decoratedIOException.getCause(); + } + + public String getLocalizedMessage() { + return this.decoratedIOException.getLocalizedMessage(); + } + + public String getMessage() { + return this.decoratedIOException.getMessage(); + } + + public StackTraceElement[] getStackTrace() { + return this.decoratedIOException.getStackTrace(); + } + + public synchronized Throwable initCause(Throwable cause) { + return this.decoratedIOException.initCause(cause); + } + + public void printStackTrace() { + this.decoratedIOException.printStackTrace(); + } + + public void printStackTrace(PrintStream s) { + this.decoratedIOException.printStackTrace(s); + } + + public void printStackTrace(PrintWriter s) { + this.decoratedIOException.printStackTrace(s); + } + + public void setStackTrace(StackTraceElement[] stackTrace) { + this.decoratedIOException.setStackTrace(stackTrace); + } + + 
public String toString() { + return this.decoratedIOException.toString(); + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/RecyclingFastBufferedOutputStream.java b/src/main/java/org/archive/io/RecyclingFastBufferedOutputStream.java new file mode 100644 index 00000000..a3b76e46 --- /dev/null +++ b/src/main/java/org/archive/io/RecyclingFastBufferedOutputStream.java @@ -0,0 +1,37 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import it.unimi.dsi.fastutil.io.FastBufferedOutputStream; + +import java.io.OutputStream; + +/** + * FastBufferedOutputStream that accepts a passed-in buffer (avoiding + * reallocation). + */ +public class RecyclingFastBufferedOutputStream extends FastBufferedOutputStream { + public RecyclingFastBufferedOutputStream( final OutputStream os, final byte[] buffer ) { + super(os); + this.buffer = buffer; + avail = buffer.length; + } +} + + diff --git a/src/main/java/org/archive/io/ReplayCharSequence.java b/src/main/java/org/archive/io/ReplayCharSequence.java new file mode 100644 index 00000000..aa9b9587 --- /dev/null +++ b/src/main/java/org/archive/io/ReplayCharSequence.java @@ -0,0 +1,77 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). 
+ * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; + +import com.google.common.base.Charsets; + + +/** + * CharSequence interface with addition of a {@link #close()} method. + * + * Users of implementations of this interface must call {@link #close()} so + * implementations get a chance at cleaning up after themselves. + * + * @author stack + * @version $Revision$, $Date$ + */ +public interface ReplayCharSequence extends CharSequence, Closeable { + + /** charset to use in replay when declared value + * is absent/illegal/unavailable */ + public Charset FALLBACK_CHARSET = Charsets.ISO_8859_1; // TODO: should this be UTF-8? + + /** + * Call this method when done so implementation has chance to clean up + * resources. + * + * @throws IOException Problem cleaning up file system resources. + */ + public void close() throws IOException; + + /** + * Report count of decoder errors silently eaten during ReplayCharSequence + * use. May be less than the number of individual decoding anomalies in + * underlying content (if decoding method doesn't allow counting individual + * errors). 
+ */ + public long getDecodeExceptionCount(); + + /** + * Return the first coding-exception encountered, if the count > 0. + * @return CharacterCodingException + */ + public CharacterCodingException getCodingException(); + + /** + * @return false if {@link #close()} has been called + */ + public boolean isOpen(); + + /** + * Return the effective Charset used to create this CharSequence from + * (raw byte) source material. + */ + public Charset getCharset(); +} diff --git a/src/main/java/org/archive/io/ReplayInputStream.java b/src/main/java/org/archive/io/ReplayInputStream.java new file mode 100644 index 00000000..fccf5fd3 --- /dev/null +++ b/src/main/java/org/archive/io/ReplayInputStream.java @@ -0,0 +1,325 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +import org.apache.commons.io.IOUtils; +import org.archive.util.ArchiveUtils; +import org.archive.util.FileUtils; + + +/** + * Replays the bytes recorded from a RecordingInputStream or + * RecordingOutputStream. + * + * This InputStream supports mark and reset. 
+ * + * @author gojomo + */ +public class ReplayInputStream extends SeekInputStream +{ + private static final int DEFAULT_BUFFER_SIZE = 256*1024; // 256KiB + private BufferedSeekInputStream diskStream; + private byte[] buffer; + private long position; + + /** + * Total size of stream content. + * + * Size of data to replay. + */ + private long size = -1; + + /** + * Where the response body starts, if marked + */ + protected long responseBodyStart = -1; + + + /** + * Constructor. + * + * @param buffer Buffer to read from. + * @param size Size of data to replay. + * @param responseBodyStart Start of the response body. + * @param backingFilename Backing file that sits behind the buffer. If + * size > than buffer then we go to backing file to read + * data that is beyond buffer.length. + * + * @throws IOException If we fail to open an input stream on + * backing file. + */ + public ReplayInputStream(byte[] buffer, long size, long responseBodyStart, + String backingFilename) + throws IOException + { + this(buffer, size, backingFilename); + this.responseBodyStart = responseBodyStart; + } + + /** + * Constructor. + * + * @param buffer Buffer to read from. + * @param size Size of data to replay. + * @param backingFilename Backing file that sits behind the buffer. If + * size > than buffer then we go to backing file to read + * data that is beyond buffer.length. + * @throws IOException If we fail to open an input stream on + * backing file. 
+ */ + public ReplayInputStream(byte[] buffer, long size, String backingFilename) + throws IOException + { + this.buffer = buffer; + this.size = size; + if (size > buffer.length) { + setupDiskStream(new File(backingFilename)); + } + } + + protected void setupDiskStream(File backingFile) throws IOException { + RandomAccessInputStream rais = new RandomAccessInputStream(backingFile); + diskStream = new BufferedSeekInputStream(rais, 4096); + } + + protected File backingFile; + + /** + * Create a ReplayInputStream from the given source stream. Requires + * reading the entire stream (and possibly overflowing to a temporary + * file). Primary reason for doing so would be to have a repositionable + * version of the original stream's contents. + * + * If created via this constructor, use the destroy() method to ensure + * prompt deletion of any associated tmp file when done. + * + * @param fillStream + * @throws IOException + */ + public ReplayInputStream(InputStream fillStream) throws IOException { + this.buffer = new byte[DEFAULT_BUFFER_SIZE]; + long count = ArchiveUtils.readFully(fillStream, buffer); + if(fillStream.available()>0) { + this.backingFile = File.createTempFile("tid"+Thread.currentThread().getId(), "ris"); + count += FileUtils.readFullyToFile(fillStream, backingFile); + setupDiskStream(backingFile); + } + this.size = count; + } + + /** + * Close & destroy any internally-generated temporary files. + */ + public void destroy() { + IOUtils.closeQuietly(this); + if(backingFile!=null) { + FileUtils.deleteSoonerOrLater(backingFile); + } + } + + public long setToResponseBodyStart() throws IOException { + position(responseBodyStart); + return this.position; + } + + + /* (non-Javadoc) + * @see java.io.InputStream#read() + */ + public int read() throws IOException { + if (position == size) { + return -1; // EOF + } + if (position < buffer.length) { + // Convert to unsigned int. 
+ int c = buffer[(int) position] & 0xFF; + position++; + return c; + } + int c = diskStream.read(); + if (c >= 0) { + position++; + } + return c; + } + + /* + * (non-Javadoc) + * + * @see java.io.InputStream#read(byte[], int, int) + */ + public int read(byte[] b, int off, int len) throws IOException { + if (position == size) { + return -1; // EOF + } + if (position < buffer.length) { + int toCopy = (int)Math.min(size - position, + Math.min(len, buffer.length - position)); + System.arraycopy(buffer, (int)position, b, off, toCopy); + if (toCopy > 0) { + position += toCopy; + } + return toCopy; + } + // into disk zone + int read = diskStream.read(b,off,len); + if(read>0) { + position += read; + } + return read; + } + + public void readFullyTo(OutputStream os) throws IOException { + byte[] buf = new byte[4096]; + int c = read(buf); + while (c != -1) { + os.write(buf,0,c); + c = read(buf); + } + } + + /* + * Like 'readFullyTo', but only reads the header-part. + * Starts from the beginning each time it is called. + */ + public void readHeaderTo(OutputStream os) throws IOException { + position = 0; + byte[] buf = new byte[(int)responseBodyStart]; + int c = read(buf,0,buf.length); + if(c != -1) { + os.write(buf,0,c); + } + } + + /* + * Like 'readFullyTo', but only reads the content-part. + */ + public void readContentTo(OutputStream os) throws IOException { + setToResponseBodyStart(); + byte[] buf = new byte[4096]; + int c = read(buf); + while (c != -1) { + os.write(buf,0,c); + c = read(buf); + } + } + + /** + * Convenience method to copy content out to target stream. 
+ * @param os stream to write content to + * @param maxSize maximum count of bytes to copy + * @throws IOException + */ + public void readContentTo(OutputStream os, long maxSize) throws IOException { + setToResponseBodyStart(); + byte[] buf = new byte[4096]; + int c = read(buf); + long tot = 0; + while (c != -1 && tot < maxSize) { + os.write(buf,0,c); + c = read(buf); + tot += c; + } + } + + /* (non-Javadoc) + * @see java.io.InputStream#close() + */ + public void close() throws IOException { + super.close(); + if(diskStream != null) { + diskStream.close(); + } + } + + /** + * Total size of stream content. + * @return Returns the size. + */ + public long getSize() + { + return size; + } + + /** + * Total size of header. + * @return the size of the header. + */ + public long getHeaderSize() + { + return responseBodyStart; + } + + /** + * Total size of content. + * @return the size of the content. + */ + public long getContentSize() + { + return size - responseBodyStart; + } + + /** + * @return Amount THEORETICALLY remaining (TODO: Its not theoretical + * seemingly. The class implemetentation depends on it being exact). + */ + public long remaining() { + return size - position; + } + + + /** + * Reposition the stream. 
+ * + * @param p the new position for this stream + * @throws IOException if an IO error occurs + */ + public void position(long p) throws IOException { + if (p < 0) { + throw new IOException("Negative seek offset."); + } + if (p > size) { + throw new IOException("Desired position exceeds size."); + } + if (p < buffer.length) { + // Only seek file if necessary + if (position > buffer.length) { + diskStream.position(0); + } + } else { + diskStream.position(p - buffer.length); + } + this.position = p; + } + + + public long position() throws IOException { + return position; + } + + protected byte[] getBuffer() { + return buffer; + } +} diff --git a/src/main/java/org/archive/io/RepositionableInputStream.java b/src/main/java/org/archive/io/RepositionableInputStream.java new file mode 100644 index 00000000..6f885130 --- /dev/null +++ b/src/main/java/org/archive/io/RepositionableInputStream.java @@ -0,0 +1,133 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import it.unimi.dsi.fastutil.io.RepositionableStream; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; + +/** + * Wrapper around an {@link InputStream} to make a primitive Repositionable + * stream. Uses a {@link BufferedInputStream}. 
Calls mark on every read so + * we'll remember at least the last thing read (You can only backup on the + * last thing read -- not last 2 or 3 things read). Used by + * {@link GzippedInputStream} when reading streams over a network. Wraps a + * HTTP, etc., stream so we can back it up if needs be after the + * GZIP inflater has done a fill of its full buffer though it only needed + * the first few bytes to finish decompressing the current GZIP member. + * + *

TODO: More robust implementation. Tried to use the it.unimi.dsi.io + * FastBufferdInputStream but relies on FileChannel ByteBuffers and if not + * present -- as would be the case reading from a network stream, the main + * application for this instance -- then it expects the underlying stream + * implements RepositionableStream interface so chicken or egg problem. + * @author stack + */ +public class RepositionableInputStream extends BufferedInputStream implements + RepositionableStream { + private long position = 0; + private long markPosition = -1; + + public RepositionableInputStream(InputStream in) { + super(in); + } + + public RepositionableInputStream(InputStream in, int size) { + super(in, size); + } + + public int read(byte[] b) throws IOException { + int read = super.read(b); + if (read != -1) { + position += read; + } + return read; + } + + public synchronized int read(byte[] b, int offset, int ct) + throws IOException { + // Mark the underlying stream so that we'll remember what we are about + // to read unless a mark has been set in this RepositionableStream + // (We have two levels of mark). In this latter case we want the + // underlying stream to preserve its mark position so aligns with + // this RS when eset is called. + if (!isMarked()) { + super.mark((ct > offset)? ct - offset: ct); + } + int read = super.read(b, offset, ct); + if (read != -1) { + position += read; + } + return read; + } + + public int read() throws IOException { + // Mark the underlying stream so that we'll remember what we are about + // to read unless a mark has been set in this RepositionableStream + // (We have two levels of mark). In this latter case we want the + // underlying stream to preserve its mark position so aligns with + // this RS when eset is called. 
+ if (!isMarked()) { + super.mark(1); + } + int c = super.read(); + if (c != -1) { + position++; + } + return c; + } + + public void position(final long offset) { + if (this.position == offset) { + return; + } + int diff = (int)(offset - this.position); + long lowerBound = this.position - this.pos; + long upperBound = lowerBound + this.count; + if (offset < lowerBound || offset >= upperBound) { + throw new IllegalAccessError("Offset goes outside " + + "current this.buf (TODO: Do buffer fills if positive)"); + } + this.position = offset; + this.pos += diff; + // Clear any mark. + this.markPosition = -1; + } + + public void mark(int readlimit) { + this.markPosition = this.position; + super.mark(readlimit); + } + + public void reset() throws IOException { + super.reset(); + this.position = this.markPosition; + this.markPosition = -1; + } + + protected boolean isMarked() { + return this.markPosition != -1; + } + + public long position() { + return this.position; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/SafeSeekInputStream.java b/src/main/java/org/archive/io/SafeSeekInputStream.java new file mode 100644 index 00000000..0d8f83b1 --- /dev/null +++ b/src/main/java/org/archive/io/SafeSeekInputStream.java @@ -0,0 +1,124 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +import java.io.IOException; + + +/** + * Enables multiple concurrent streams based on the same underlying stream. + * + * @author pjack + */ +public class SafeSeekInputStream extends SeekInputStream { + + + /** + * The underlying stream. + */ + private SeekInputStream input; + + + /** + * The expected position of the underlying stream. + */ + private long expected; + + + /** + * Constructor. The given stream will be positioned to 0 so that an + * accurate position can be tracked. + * + * @param input the underlying input stream + * @throws IOException if an IO error occurs + */ + public SafeSeekInputStream(SeekInputStream input) throws IOException { + this.input = input; + this.expected = input.position(); + } + + + /** + * Ensures that the underlying stream's position is what we expect to be. + * + * @throws IOException if an IO error occurs + */ + private void ensure() throws IOException { + if (expected != input.position()) { + input.position(expected); + } + } + + + @Override + public int read() throws IOException { + ensure(); + int c = input.read(); + if (c >= 0) { + expected++; + } + return c; + } + + + @Override + public int read(byte[] buf, int ofs, int len) throws IOException { + ensure(); + int r = input.read(buf, ofs, len); + if (r > 0) { + expected += r; + } + return r; + } + + + @Override + public int read(byte[] buf) throws IOException { + ensure(); + int r = input.read(buf); + if (r > 0) { + expected += r; + } + return r; + } + + + @Override + public long skip(long c) throws IOException { + ensure(); + long r = input.skip(c); + if (r > 0) { + expected += r; + } + return r; + } + + + public void position(long p) throws IOException { + input.position(p); + expected = p; + } + + + public long position() throws IOException { + return expected; + } + +} diff --git a/src/main/java/org/archive/io/SeekInputStream.java 
b/src/main/java/org/archive/io/SeekInputStream.java new file mode 100644 index 00000000..177724ec --- /dev/null +++ b/src/main/java/org/archive/io/SeekInputStream.java @@ -0,0 +1,81 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + + +import it.unimi.dsi.fastutil.io.RepositionableStream; + +import java.io.IOException; +import java.io.InputStream; + + +/** + * Base class for repositionable input streams. + * + * @author pjack + */ +public abstract class SeekInputStream extends InputStream +implements RepositionableStream { + + + /** + * The marked file position. A value less than zero + * indicates that no mark has been set. + */ + private long mark = -1; + + + /** + * Marks the current position of the stream. The limit parameter is + * ignored; the mark will remain valid until reset is called or the + * stream is closed. + * + * @param limit ignored + */ + public void mark(int limit) { + try { + this.mark = position(); + } catch (IOException e) { + mark = -1; + } + } + + + /** + * Resets this stream to its marked position. 
+     *
+     * @throws IOException if there is no mark, or if an IO error occurs
+     */
+    public void reset() throws IOException {
+        final long target = mark;
+        if (target >= 0) {
+            position(target);
+            return;
+        }
+        throw new IOException("No mark.");
+    }
+
+
+    /**
+     * Reports that mark/reset is available on every SeekInputStream.
+     *
+     * @return true
+     */
+    public boolean markSupported() {
+        return true;
+    }
+}
diff --git a/src/main/java/org/archive/io/SeekReader.java b/src/main/java/org/archive/io/SeekReader.java
new file mode 100644
index 00000000..4abf7847
--- /dev/null
+++ b/src/main/java/org/archive/io/SeekReader.java
@@ -0,0 +1,84 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+import java.io.IOException;
+import java.io.Reader;
+
+import it.unimi.dsi.fastutil.io.RepositionableStream;
+
+
+/**
+ * Base class for repositionable readers.
+ *
+ * @author pjack
+ */
+public abstract class SeekReader extends Reader
+implements RepositionableStream {
+
+
+    /**
+     * The marked file position. A value less than zero
+     * indicates that no mark has been set.
+     */
+    private long mark = -1;
+
+
+    /**
+     * Marks the current position of the stream. The limit parameter is
+     * ignored; the mark will remain valid until reset is called or the
+     * stream is closed.
+     *
+     * @param limit ignored
+     */
+    @Override
+    public void mark(int limit) {
+        try {
+            this.mark = position();
+        } catch (IOException e) {
+            // Reader.mark could throw, but this implementation chooses to
+            // clear the mark instead when the position is unavailable.
+            mark = -1;
+        }
+    }
+
+
+    /**
+     * Resets this reader to its marked position.
+     *
+     * @throws IOException if there is no mark, or if an IO error occurs
+     */
+    @Override
+    public void reset() throws IOException {
+        if (mark < 0) {
+            throw new IOException("No mark.");
+        }
+        position(mark);
+    }
+
+
+    /**
+     * Returns true, since SeekReaders support mark/reset by default.
+     *
+     * @return true
+     */
+    @Override
+    public boolean markSupported() {
+        return true;
+    }
+}
diff --git a/src/main/java/org/archive/io/SeekReaderCharSequence.java b/src/main/java/org/archive/io/SeekReaderCharSequence.java
new file mode 100644
index 00000000..a9b4880f
--- /dev/null
+++ b/src/main/java/org/archive/io/SeekReaderCharSequence.java
@@ -0,0 +1,56 @@
+package org.archive.io;
+
+import java.io.IOException;
+
+/**
+ * Presents the first {@code size} characters of a {@link SeekReader} as a
+ * {@link CharSequence}.  Every {@link #charAt(int)} repositions the reader,
+ * so the reader must not be shared with code that relies on its position.
+ */
+public class SeekReaderCharSequence implements CharSequence {
+
+
+    final private SeekReader reader;
+    final private int size;
+
+
+    /**
+     * @param reader  source of characters; repositioned on every access
+     * @param size    value reported by {@link #length()}
+     */
+    public SeekReaderCharSequence(SeekReader reader, int size) {
+        this.reader = reader;
+        this.size = size;
+    }
+
+
+    public int length() {
+        return size;
+    }
+
+
+    /**
+     * Returns the character at the given index by seeking the reader there
+     * and reading one character.
+     *
+     * @throws IndexOutOfBoundsException  if index is outside [0, length())
+     * @throws IllegalStateException  if the reader hits EOF at that index
+     * @throws RuntimeException  wrapping any IOException from the reader
+     */
+    public char charAt(int index) {
+        if ((index < 0) || (index >= length())) {
+            throw new IndexOutOfBoundsException(Integer.toString(index));
+        }
+        try {
+            reader.position(index);
+            int r = reader.read();
+            if (r < 0) {
+                throw new IllegalStateException("EOF");
+            }
+            // Return the character just read.  Calling read() a second
+            // time here would hand back the character at index + 1.
+            return (char)r;
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+
+    public CharSequence subSequence(int start, int end) {
+        return new CharSubSequence(this, start, end);
+    }
+
+    /**
+     * Reads the reader from position 0 to EOF.
+     * NOTE(review): the result is not truncated to {@code size}; confirm
+     * that callers always pass the reader's full length as size.
+     */
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        try {
+            reader.position(0);
+            for (int ch = reader.read(); ch >= 0; ch = reader.read()) {
+                sb.append((char)ch);
+            }
+            return sb.toString();
+        } catch (IOException e) {
+            throw new IllegalStateException(e);
+        }
+    }
+}
diff --git a/src/main/java/org/archive/io/SinkHandlerLogThread.java b/src/main/java/org/archive/io/SinkHandlerLogThread.java
new file mode 100644
index 00000000..0070785e
--- /dev/null
+++ b/src/main/java/org/archive/io/SinkHandlerLogThread.java
@@ -0,0 +1,34 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+
+/**
+ * Implemented by threads that provide extra information.
+ *
+ * TODO: rename class, rename getCurrentProcessorName()
+ */
+public interface SinkHandlerLogThread {
+
+    /** @return a name for this thread (presumably {@code Thread.getName()} -- TODO confirm) */
+    String getName();
+    /** @return name of the processor the thread is currently running; exact semantics are defined by implementors */
+    String getCurrentProcessorName();
+    /** @return a serial number identifying this thread; numbering scheme is defined by implementors */
+    int getSerialNumber();
+
+}
diff --git a/src/main/java/org/archive/io/UTF8Bytes.java b/src/main/java/org/archive/io/UTF8Bytes.java
new file mode 100644
index 00000000..c280b08d
--- /dev/null
+++ b/src/main/java/org/archive/io/UTF8Bytes.java
@@ -0,0 +1,37 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.UnsupportedEncodingException;
+
+/**
+ * Implemented by instances that can be serialized as UTF-8 bytes.
+ * (Historically described as a marker interface, although it declares a
+ * method and a constant.)
+ * TODO: Do we need a UTF8Stream Marker Interface?
+ * @author stack
+ * @version $Date$ $Version$
+ */
+public interface UTF8Bytes {
+    /** Charset name for UTF-8, as accepted by {@code String.getBytes(String)}. */
+    public static final String UTF8 = "UTF-8";
+
+    /**
+     * @return Instance as UTF-8 bytes.
+     * @throws UnsupportedEncodingException if the implementation's charset
+     *         lookup fails
+     */
+    public byte [] getUTF8Bytes() throws UnsupportedEncodingException;
+}
diff --git a/src/main/java/org/archive/io/WriterPool.java b/src/main/java/org/archive/io/WriterPool.java
new file mode 100644
index 00000000..2dc385a1
--- /dev/null
+++ b/src/main/java/org/archive/io/WriterPool.java
@@ -0,0 +1,343 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.archive.io; + +import java.io.File; +import java.io.IOException; +import java.util.Collection; +import java.util.LinkedList; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; + +/** + * Pool of Writers. + * + * Abstract. Override and pass in the Constructor a factory that creates + * {@link WriterPoolMember} implementations. + * + * @author stack + */ +public abstract class WriterPool { + private final Logger logger = Logger.getLogger(this.getClass().getName()); + + /** + * Used to generate unique filename sequences. + */ + final protected AtomicInteger serialNo; + + /** + * Default maximum active number of files in the pool. + */ + public static final int DEFAULT_MAX_ACTIVE = 1; + + /** Assumed largest possible value of maxActive; pool will have this + * maximum capacity, so dynamic changes beyond this number won't work. */ + protected static final int LARGEST_MAX_ACTIVE = 255; + + /** + * Maximum time to wait on a free file before considering + * making a new one (if not already at max) + */ + public static final int DEFAULT_MAX_WAIT_FOR_IDLE = 500; + + /** + * File settings. + * Keep in data structure rather than as individual values. 
+     */
+    protected final WriterPoolSettings settings;
+
+    /** maximum number of writers to create at a time*/
+    protected int maxActive;
+    /** maximum ms to wait before considering creation of a writer */
+    protected int maxWait;
+    /** current count of active writers; only read/mutated in synchronized blocks */
+    protected int currentActive = 0;
+    /**
+     * round-robin queue of available writers.  The element type is spelled
+     * out because borrowFile() assigns poll() results straight to a
+     * WriterPoolMember, which a raw queue would not allow.
+     */
+    protected BlockingQueue<WriterPoolMember> availableWriters;
+
+    /** system time when writer was last wanted (because one was not ready in time) */
+    protected long lastWriterNeededTime;
+    /** system time when writer was last 'rolled over' (imminent creation of new file) */
+    protected long lastWriterRolloverTime;
+
+    /**
+     * Constructor.  Logs the initial configuration and creates the (fair)
+     * queue of idle writers with capacity {@link #LARGEST_MAX_ACTIVE}.
+     *
+     * @param serial Used to generate unique filename sequences
+     * @param settings Settings for this pool.
+     * @param poolMaximumActive initial value of {@link #maxActive}; values
+     *        above {@link #LARGEST_MAX_ACTIVE} exceed the queue's capacity
+     * @param poolMaximumWait initial value of {@link #maxWait}, in ms
+     */
+    public WriterPool(final AtomicInteger serial,
+            final WriterPoolSettings settings,
+            final int poolMaximumActive, final int poolMaximumWait) {
+        logger.info("Initial configuration:" +
+                " prefix=" + settings.getPrefix() +
+                ", template=" + settings.getTemplate() +
+                ", compress=" + settings.getCompress() +
+                ", maxSize=" + settings.getMaxFileSizeBytes() +
+                ", maxActive=" + poolMaximumActive +
+                ", maxWait=" + poolMaximumWait);
+        this.settings = settings;
+        this.maxActive = poolMaximumActive;
+        this.maxWait = poolMaximumWait;
+        // 'true' requests FIFO fairness among threads waiting on the queue
+        availableWriters = new ArrayBlockingQueue<WriterPoolMember>(LARGEST_MAX_ACTIVE, true);
+        this.serialNo = serial;
+    }
+
+    /**
+     * Check out a {@link WriterPoolMember}.
+     *
+     * This method should be followed by a call to
+     * {@link #returnFile(WriterPoolMember)} or
+     * {@link #invalidateFile(WriterPoolMember)} else pool starts leaking.
+     *
+     * Blocks until a writer is available.  If the calling thread is
+     * interrupted while waiting, the wait continues, and the thread's
+     * interrupt status is restored before this method returns.
+     *
+     * @return Writer checked out of a pool of files or created
+     * @throws IOException Problem getting Writer from pool (Converted
+     * from Exception to IOException so this pool can live as a good citizen
+     * down in depths of ARCSocketFactory).
+     */
+    public WriterPoolMember borrowFile()
+    throws IOException {
+        WriterPoolMember writer = null;
+        boolean interrupted = false;
+        try {
+            while(writer == null) {
+                try {
+                    writer = availableWriters.poll(maxWait,TimeUnit.MILLISECONDS);
+                } catch (InterruptedException e) {
+                    // keep waiting, but remember the interrupt so it is not
+                    // lost to the caller; re-asserting it inside the loop
+                    // would make every following poll() fail immediately
+                    interrupted = true;
+                }
+                if(writer==null) {
+                    writer = makeNewWriterIfAppropriate();
+                }
+            }
+        } finally {
+            if (interrupted) {
+                Thread.currentThread().interrupt();
+            }
+        }
+        return writer;
+    }
+
+    /**
+     * Create a new writer instance, if still below maxActive count.
+     * Remember times to help make later decision when writer should
+     * be discarded.
+     *
+     * @return WriterPoolMember or null if already at max
+     */
+    protected synchronized WriterPoolMember makeNewWriterIfAppropriate() {
+        long now = System.currentTimeMillis();
+        lastWriterNeededTime = now;
+        if(currentActive >= maxActive) {
+            return null;
+        }
+        currentActive++;
+        lastWriterRolloverTime = now;
+        try {
+            return makeWriter();
+        } catch (RuntimeException e) {
+            // creation failed: give the slot back so the pool does not
+            // permanently lose capacity
+            currentActive--;
+            throw e;
+        }
+    }
+
+    /**
+     * @return new WriterPoolMember of appropriate type
+     */
+    protected abstract WriterPoolMember makeWriter();
+
+    /**
+     * Discard a previously-used writer, cleanly closing it and leaving it out
+     * of the pool.  The active count is decremented before the close is
+     * attempted, so the slot is freed even when close() throws.
+     * @param writer writer to close
+     * @throws IOException if closing the writer fails
+     */
+    public synchronized void destroyWriter(WriterPoolMember writer) throws IOException {
+        currentActive--;
+        writer.close();
+    }
+    /**
+     * Return a writer, for likely reuse unless (1) writer's current file has
+     * reached its target size; and (2) there's been no demand for additional
+     * writers since the last time a new writer-file was rolled-over. In that
+     * case, the possibly-superfluous writer instance is discarded.
+     * @param writer Writer to return to the pool.
+     * @throws IOException Problem returning File to pool.
+     */
+    public void returnFile(WriterPoolMember writer)
+    throws IOException {
+        synchronized(this) {
+            if(writer.isOversize()) {
+                // maybe retire writer rather than recycle
+                if(lastWriterNeededTime<=lastWriterRolloverTime) {
+                    // no timeouts waiting for recycled writer since last writer rollover
+                    destroyWriter(writer);
+                    return;
+                } else {
+                    // reuse writer instance, causing new file to be created
+                    lastWriterRolloverTime = System.currentTimeMillis();
+                }
+            }
+        }
+        if(!availableWriters.offer(writer)) {
+            logger.log(Level.WARNING, "writer unreturnable to available pool; closing early");
+            destroyWriter(writer);
+        }
+    }
+
+    /**
+     * Close and discard a writer that experienced a potentially-corrupting
+     * error, then rename its file with
+     * {@link WriterPoolMember#INVALID_SUFFIX} so it gets attention.
+     * @param f writer with problem
+     * @throws IOException if the writer cannot be closed; the original
+     *         exception is attached as the cause
+     */
+    public synchronized void invalidateFile(WriterPoolMember f)
+    throws IOException {
+        try {
+            destroyWriter(f);
+        } catch (Exception e) {
+            // Convert exception, keeping the original as the cause so the
+            // stack trace of the real failure is not lost.
+            IOException converted = new IOException(e.getMessage());
+            converted.initCause(e);
+            throw converted;
+        }
+        // It'll have been closed.  Rename with an '.invalid' suffix so it
+        // gets attention.
+        File file = f.getFile();
+        if (file == null) {
+            // writer never had a file; nothing to rename
+            return;
+        }
+        File invalid = new File(file.getAbsoluteFile() +
+                WriterPoolMember.INVALID_SUFFIX);
+        if (!file.renameTo(invalid)) {
+            logger.log(Level.WARNING, "Failed rename of " + file + " to " + invalid);
+        }
+    }
+
+    /**
+     * @return Number of {@link WriterPoolMember}s checked out of pool,
+     *         computed as active count minus idle count.
+     * @throws java.lang.UnsupportedOperationException
+     */
+    public synchronized int getNumActive()
+    throws UnsupportedOperationException {
+        return currentActive - getNumIdle();
+    }
+
+    /**
+     * @return Number of {@link WriterPoolMember} instances still in the pool.
+     * @throws java.lang.UnsupportedOperationException
+     */
+    public int getNumIdle()
+    throws UnsupportedOperationException {
+        return availableWriters.size();
+    }
+
+    /**
+     * Close all {@link WriterPoolMember}s in pool.
+     * Waits (via {@link #drainAllWriters()}) for writers that are currently
+     * checked out.  A failure to close one writer is logged and does not
+     * stop the others from being closed.
+     */
+    public void close() {
+        // typed so the for-each below can bind WriterPoolMember elements
+        Collection<WriterPoolMember> writers = drainAllWriters();
+        for (WriterPoolMember writer: writers) {
+            try {
+                destroyWriter(writer);
+            } catch (IOException e) {
+                logger.log(Level.WARNING,"problem closing writer",e);
+            }
+        }
+    }
+
+    /**
+     * @return Returns settings.
+     */
+    public WriterPoolSettings getSettings() {
+        return this.settings;
+    }
+
+    /**
+     * @return State of the pool string, of the form
+     *         "Active A of max M, idle I"
+     */
+    protected String getPoolState() {
+        // local, single-threaded buffer: no need for StringBuffer's locking
+        StringBuilder buffer = new StringBuilder("Active ");
+        buffer.append(getNumActive());
+        buffer.append(" of max ");
+        buffer.append(maxActive);
+        buffer.append(", idle ");
+        buffer.append(getNumIdle());
+        return buffer.toString();
+    }
+
+    /**
+     * Returns the atomic integer used to generate serial numbers
+     * for files.
+     *
+     * @return the serial number generator
+     */
+    public AtomicInteger getSerialNo() {
+        return serialNo;
+    }
+
+    /**
+     * Drains all the active writers from {@link #availableWriters}, blocking to
+     * wait for any writers currently in use to become available.
+     *
+     *

+     * When finished with writers, call availableWriters.addAll(...) to put them
+     * back into the rotation.
+     *
+     * If the calling thread is interrupted while waiting, the writers
+     * gathered so far are returned and the interrupt status is restored.
+     *
+     * NOTE(review): this method holds the pool monitor while blocked in
+     * take(), and returnFile() enters synchronized(this) before offering a
+     * writer back -- confirm that returning threads cannot stall here.
+     *
+     * @return all the active writers
+     */
+    protected synchronized Collection<WriterPoolMember> drainAllWriters() {
+        LinkedList<WriterPoolMember> writers = new LinkedList<WriterPoolMember>();
+        availableWriters.drainTo(writers);
+
+        while (writers.size() < currentActive) {
+            try {
+                WriterPoolMember w = availableWriters.take();
+                writers.add(w);
+            } catch (InterruptedException e) {
+                logger.severe("caught " + e + " while waiting for writers to free up; returning only "
+                        + writers.size() + " of " + currentActive + " active writers");
+                // do not swallow the interrupt: let the caller observe it
+                Thread.currentThread().interrupt();
+                break;
+            }
+        }
+
+        return writers;
+    }
+
+    /**
+     * Flushes every active writer.  Flush failures are logged per writer;
+     * the writers are put back into the pool in all cases.
+     */
+    public void flush() {
+        Collection<WriterPoolMember> writers = drainAllWriters();
+
+        try {
+            for (WriterPoolMember writer: writers) {
+                try {
+                    writer.flush();
+                } catch (IOException e) {
+                    logger.log(Level.WARNING, "problem flushing writer " + writer, e);
+                }
+            }
+        } finally {
+            // return the writers even if something unexpected was thrown
+            availableWriters.addAll(writers);
+        }
+    }
+
+    /**
+     * Builds a JSON array with one object per active writer, carrying its
+     * "file" and "position".
+     *
+     * @return status of all active writers
+     * @throws JSONException if a JSON object cannot be populated; the
+     *         writers are still returned to the pool
+     */
+    public JSONArray jsonStatus() throws JSONException {
+        Collection<WriterPoolMember> writers = drainAllWriters();
+
+        try {
+            JSONArray ja = new JSONArray();
+            for (WriterPoolMember w: writers) {
+                JSONObject jo = new JSONObject();
+                jo.put("file", w.getFile());
+                jo.put("position", w.getPosition());
+                ja.put(jo);
+            }
+            return ja;
+        } finally {
+            // without this, a JSONException would leave the pool empty and
+            // every later borrowFile() waiting on writers that never return
+            availableWriters.addAll(writers);
+        }
+    }
+}
diff --git a/src/main/java/org/archive/io/WriterPoolMember.java b/src/main/java/org/archive/io/WriterPoolMember.java
new file mode 100644
index 00000000..6ea6b295
--- /dev/null
+++ b/src/main/java/org/archive/io/WriterPoolMember.java
@@ -0,0 +1,487 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.Iterator; +import java.util.List; +import java.util.Properties; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.logging.Logger; +import java.util.zip.GZIPOutputStream; + +import org.archive.util.ArchiveUtils; +import org.archive.util.FileUtils; +import org.archive.util.PropertyUtils; + + + +/** + * Member of {@link WriterPool}. + * Implements rotating off files, file naming with some guarantee of + * uniqueness, and position in file. Subclass to pick up functionality for a + * particular Writer type. + * @author stack + * @version $Date$ $Revision$ + */ +public abstract class WriterPoolMember implements ArchiveFileConstants { + private final Logger logger = Logger.getLogger(this.getClass().getName()); + + public static final String UTF8 = "UTF-8"; + + /** + * Default archival-aggregate filename template. + * + * Under usual assumptions -- hostnames aren't shared among crawling hosts; + * processes have unique PIDs and admin ports; timestamps inside one process + * don't repeat (see UniqueTimestampService); clocks are generally + * accurate -- will generate a unique name. + * + * Stands for Internet Archive Heritrix. 
+     */
+    public static final String DEFAULT_TEMPLATE =
+        "${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}";
+
+    /** File prefix used when none is configured. */
+    public static final String DEFAULT_PREFIX = "WEB";
+
+    /** The file currently being written, if any. */
+    protected File f = null;
+
+    /** Stream that record writes go to. */
+    protected OutputStream out = null;
+    /** Byte-counting stream used to report the position in the file. */
+    protected MiserOutputStream countOut = null;
+
+    /** Write buffer, allocated on first file creation and reused afterwards. */
+    protected byte[] rebuf;
+
+    protected WriterPoolSettings settings;
+    private final String extension;
+
+    /**
+     * Timestamp minted for the current file.
+     * Set by {@link #createFile()}.
+     */
+    protected String currentTimestamp = "UNSET!!!";
+
+    protected String currentBasename;
+
+    /** Shared counter supplying the serial-number part of file names. */
+    private final AtomicInteger serialNo;
+
+    /** Index of the output directory to try next (shared by all members). */
+    protected static int roundRobinIndex = 0;
+
+    /** Formats serial numbers zero-padded to five digits. */
+    protected static NumberFormat serialNoFormatter = new DecimalFormat("00000");
+
+
+    /** Scratch space used when copying from input streams. */
+    protected final byte [] scratchbuffer = new byte[4 * 1024];
+
+
+    /**
+     * Stream-backed constructor.
+     * Use with caution: nothing here bounds how much is written to the
+     * given stream.
+     *
+     * @param serialNo used to create unique filename sequences
+     * @param out Where to write; used as-is when it already is a
+     *        MiserOutputStream, otherwise wrapped in one.
+     * @param file File the {@code out} is connected to.
+     * @param settings pool settings (compression, flush policy, ...)
+     * @throws IOException
+     */
+    protected WriterPoolMember(AtomicInteger serialNo,
+            final OutputStream out, final File file,
+            final WriterPoolSettings settings)
+    throws IOException {
+        this(serialNo, settings, null);
+        final MiserOutputStream counting;
+        if (out instanceof MiserOutputStream) {
+            counting = (MiserOutputStream)out;
+        } else {
+            counting = new MiserOutputStream(out, settings.getFrequentFlushes());
+        }
+        this.countOut = counting;
+        this.out = counting;
+        this.f = file;
+    }
+
+    /**
+     * File-backed constructor; no file is opened until one is needed.
+     *
+     * @param serialNo used to create unique filename sequences
+     * @param settings pool settings (prefix, template, output dirs,
+     *        compression, maximum size, ...)
+     * @param extension Extension to give file.
+     */
+    public WriterPoolMember(AtomicInteger serialNo,
+            final WriterPoolSettings settings, final String extension) {
+        this.serialNo = serialNo;
+        this.extension = extension;
+        this.settings = settings;
+    }
+
+    /**
+     * Call this method just before/after any significant write.
+     *
+     * Call at the end of the writing of a record or just before we start
+     * writing a new record. Will close current file and open a new file
+     * if file size has passed out maxSize.
+     *
+     *

Creates and opens a file if none already open. One use of this method
+     * then is after construction, call this method to add the metadata, then
+     * call {@link #getPosition()} to find offset of first record.
+     *
+     * TODO: perhaps this should be called checkForNewOpen? because it also
+     * handles initial open, even when not rolling oversize
+     *
+     * @exception IOException
+     */
+    public void checkSize() throws IOException {
+        final boolean nothingOpen = (this.out == null);
+        if (nothingOpen || isOversize()) {
+            createFile();
+        }
+    }
+
+    /**
+     * Tells whether the current position is past the configured maximum.
+     * A configured maximum of -1 disables the check.
+     * @return true if the position exceeds the maximum file size
+     */
+    public boolean isOversize() {
+        if (settings.getMaxFileSizeBytes() == -1) {
+            return false;
+        }
+        return this.getPosition() > settings.getMaxFileSizeBytes();
+    }
+
+    /**
+     * Starts a fresh file: mints a new basename, appends the extension
+     * (plus the compressed extension when compressing) and the occupied
+     * suffix, picks the next output directory and opens the file there.
+     * Usually called from {@link #checkSize()}.
+     * @return Name of file created.
+     * @throws IOException
+     */
+    protected String createFile() throws IOException {
+        generateNewBasename();
+        StringBuilder name = new StringBuilder();
+        name.append(currentBasename).append('.').append(this.extension);
+        if (settings.getCompress()) {
+            name.append(DOT_COMPRESSED_FILE_EXTENSION);
+        }
+        name.append(OCCUPIED_SUFFIX);
+        File dir = getNextDirectory(settings.calcOutputDirs());
+        return createFile(new File(dir, name.toString()));
+    }
+
+    /**
+     * Closes whatever is open and starts writing to the given file through
+     * a buffered, byte-counting stream.
+     * @param file file to open
+     * @return simple name of the opened file
+     * @throws IOException
+     */
+    protected String createFile(final File file) throws IOException {
+        close();
+        this.f = file;
+        FileOutputStream raw = new FileOutputStream(this.f);
+        if (rebuf == null) {
+            // allocated once; later files reuse the same buffer
+            rebuf = new byte[settings.getWriteBufferSize()];
+        }
+        RecyclingFastBufferedOutputStream buffered =
+            new RecyclingFastBufferedOutputStream(raw, rebuf);
+        this.countOut = new MiserOutputStream(buffered, settings.getFrequentFlushes());
+        this.out = this.countOut;
+        logger.fine("Opened " + this.f.getAbsolutePath());
+        return this.f.getName();
+    }
+
+    /**
+     * @param dirs List of File objects that point at directories.
+     * @return Find next directory to write an arc too. If more
+     * than one, it tries to round-robin through each in turn.
+     * @throws IOException if no directory in the list is usable
+     */
+    protected File getNextDirectory(List dirs)
+    throws IOException {
+        // wrap the shared index when it has run off the end of the list
+        if (WriterPoolMember.roundRobinIndex >= dirs.size()) {
+            WriterPoolMember.roundRobinIndex = 0;
+        }
+        File chosen = null;
+        try {
+            Object candidate = dirs.get(WriterPoolMember.roundRobinIndex);
+            chosen = checkWriteable((File)candidate);
+        } catch (IndexOutOfBoundsException e) {
+            // Dirs list might be altered underneath us.
+            // If so, we get this exception -- just keep on going.
+        }
+        final boolean scanAll = (chosen == null) && (dirs.size() > 1);
+        if (!scanAll) {
+            WriterPoolMember.roundRobinIndex++;
+        } else {
+            // preferred directory unusable: take the first usable one
+            Iterator it = dirs.iterator();
+            while (chosen == null && it.hasNext()) {
+                chosen = checkWriteable((File)it.next());
+            }
+        }
+        if (chosen != null) {
+            return chosen;
+        }
+        throw new IOException("Directories unusable.");
+    }
+
+    /**
+     * Verifies that the given directory can be written to.
+     * @param d directory to check; may be null
+     * @return d itself when usable, otherwise null (a warning is logged
+     *         when the check fails)
+     */
+    protected File checkWriteable(File d) {
+        if (d == null) {
+            return null;
+        }
+        try {
+            FileUtils.ensureWriteableDirectory(d);
+            return d;
+        } catch(IOException e) {
+            logger.warning("Directory " + d.getPath() + " is not" +
+                " writeable or cannot be created: " + e.getMessage());
+            return null;
+        }
+    }
+
+    /**
+     * Generate a new basename by interpolating values in the configured
+     * template. Values come from local state, other configured values, and
+     * global system properties. The recommended default template will
+     * generate a unique basename under reasonable assumptions.
+     */
+    protected void generateNewBasename() {
+        Properties localProps = new Properties();
+        localProps.setProperty("prefix", settings.getPrefix());
+        synchronized(this.getClass()) {
+            // ensure that serialNo and timestamp are minted together (never inverted sort order)
+            String paddedSerialNumber = WriterPoolMember.serialNoFormatter.format(serialNo.getAndIncrement());
+            String timestamp17 = ArchiveUtils.getUnique17DigitDate();
+            String timestamp14 = ArchiveUtils.getUnique14DigitDate();
+            currentTimestamp = timestamp17;
+            localProps.setProperty("serialno", paddedSerialNumber);
+            localProps.setProperty("timestamp17", timestamp17);
+            localProps.setProperty("timestamp14", timestamp14);
+        }
+        // local properties are consulted before the global system properties
+        // (presumably -- order follows the argument order; TODO confirm)
+        currentBasename = PropertyUtils.interpolateWithProperties(settings.getTemplate(),
+                localProps, System.getProperties());
+    }
+
+
+    /**
+     * Get the file name with the compressed-file extension (and, when
+     * present, the occupied suffix that follows it) removed.  When
+     * compression is off the name is returned unchanged.
+     *
+     * @return the filename, as if uncompressed
+     */
+    protected String getBaseFilename() {
+        String name = this.f.getName();
+        if (!settings.getCompress()) {
+            return name;
+        }
+        // measure the extension instead of assuming it is 3 characters
+        final int extLength = DOT_COMPRESSED_FILE_EXTENSION.length();
+        if (name.endsWith(DOT_COMPRESSED_FILE_EXTENSION)) {
+            return name.substring(0, name.length() - extLength);
+        } else if (name.endsWith(DOT_COMPRESSED_FILE_EXTENSION +
+                    OCCUPIED_SUFFIX)) {
+            return name.substring(0, name.length() -
+                (extLength + OCCUPIED_SUFFIX.length()));
+        } else {
+            return name;
+        }
+    }
+
+    /**
+     * Get this file.
+     *
+     * Used by junit test to test for creation and when {@link WriterPool} wants
+     * to invalidate a file.
+     *
+     * @return The current file.
+     */
+    public File getFile() {
+        return this.f;
+    }
+
+    /**
+     * Pre write tasks.
+     *
+     * Has side effects. Will open new file if we're at the upper bound.
+     * If we're writing compressed files, it will wrap output stream with a
+     * GZIP writer with side effect that GZIP header is written out on the
+     * stream.
+     *
+     * @exception IOException
+     */
+    protected void preWriteRecordTasks()
+    throws IOException {
+        if (this.out == null) {
+            createFile();
+        }
+        if (settings.getCompress()) {
+            // Wrap stream in GZIP Writer.
+            // The below construction immediately writes the GZIP 'default'
+            // header out on the underlying stream.
+            this.out = new CompressedStream(this.out);
+        }
+    }
+
+    /**
+     * Post file write tasks.
+     * If compressed, finishes up compression and flushes stream so any
+     * subsequent checks get good reading.  The compressing wrapper is
+     * removed and its deflater released even when finishing or flushing
+     * fails, so a failed record does not leave the stream double-wrapped
+     * on the next write.
+     *
+     * @exception IOException
+     */
+    protected void postWriteRecordTasks()
+    throws IOException {
+        if (settings.getCompress()) {
+            CompressedStream o = (CompressedStream)this.out;
+            try {
+                o.finish();
+                o.flush();
+            } finally {
+                o.end();
+                this.out = o.getWrappedStream();
+            }
+        }
+    }
+
+    /**
+     * Position in raw output (typically, physical file).
+     * Used making accounting of bytes written.
+     * @return Position in final media (assuming all flushing completes);
+     *         0 when no counting stream has been set up yet
+     */
+    public long getPosition() {
+        return (countOut==null)? 0L : this.countOut.getCount();
+    }
+
+    /** @return whether the settings ask for compressed output */
+    public boolean isCompressed() {
+        return settings.getCompress();
+    }
+
+    /** Writes the whole array to the current output stream. */
+    protected void write(final byte [] b) throws IOException {
+        this.out.write(b);
+    }
+
+    /** Flushes the current output stream. */
+    protected void flush() throws IOException {
+        this.out.flush();
+    }
+
+    /** Writes {@code len} bytes of {@code b} starting at {@code off}. */
+    protected void write(byte[] b, int off, int len) throws IOException {
+        this.out.write(b, off, len);
+    }
+
+    /** Writes a single byte to the current output stream. */
+    protected void write(int b) throws IOException {
+        this.out.write(b);
+    }
+
+    /**
+     * Copy bytes from the provided InputStream to the target file/stream being
+     * written.
+     *
+     * @return number of bytes read from {@code is} (normally equal to
+     *         {@code recordLength}; it can be larger when the final read
+     *         returns more than the record still needs, in which case only
+     *         the needed bytes are written)
+     * @param is
+     *            InputStream to copy bytes from
+     * @param recordLength
+     *            expected number of bytes to copy
+     * @param enforceLength
+     *            whether to throw an exception if too many/too few bytes are
+     *            available from stream
+     * @throws IOException
+     */
+    protected long copyFrom(final InputStream is, final long recordLength,
+            boolean enforceLength) throws IOException {
+        int read = scratchbuffer.length;
+        long tot = 0;
+        while ((tot < recordLength)
+                && (read = is.read(scratchbuffer)) != -1) {
+            int write = read;
+            // never write more than enforced length
+            write = (int) Math.min(write, recordLength - tot);
+            tot += read;
+            write(scratchbuffer, 0, write);
+        }
+        if (enforceLength && tot != recordLength) {
+            // throw exception if desired for read vs. declared mismatches
+            throw new IOException("Read " + tot + " but expected "
+                    + recordLength);
+        }
+
+        return tot;
+    }
+
+    /**
+     * Closes the output stream and, when the file name carries the occupied
+     * suffix, renames the file to its final name (replacing any file already
+     * there).  Does nothing when no stream is open.  If the rename fails the
+     * file reference keeps pointing at the file that actually exists.
+     *
+     * @throws IOException if closing the stream fails
+     */
+    public void close() throws IOException {
+        if (this.out == null) {
+            return;
+        }
+        this.out.close();
+        this.out = null;
+        if (this.f != null && this.f.exists()) {
+            String path = this.f.getAbsolutePath();
+            if (path.endsWith(OCCUPIED_SUFFIX)) {
+                File target = new File(path.substring(0,
+                        path.length() - OCCUPIED_SUFFIX.length()));
+                if (target.exists() && !target.delete()) {
+                    logger.warning("Failed delete of " + target);
+                }
+                if (this.f.renameTo(target)) {
+                    this.f = target;
+                } else {
+                    // keep this.f on the still-suffixed file so getFile()
+                    // and the size logged below refer to a real file
+                    logger.warning("Failed rename of " + path);
+                }
+            }
+
+            logger.fine("Closed " + this.f.getAbsolutePath() +
+                    ", size " + this.f.length());
+        }
+    }
+
+    /** @return the stream currently written to, or null when closed */
+    protected OutputStream getOutputStream() {
+        return this.out;
+    }
+
+    /**
+     * An override so we get access to underlying output stream.
+     * and offer an end() that does not accompany closing underlying
+     * stream.
+     * @author stack
+     */
+    private class CompressedStream extends GZIPOutputStream {
+        /**
+         * Wraps the given stream; the GZIPOutputStream constructor writes
+         * the GZIP header to it.
+         */
+        public CompressedStream(OutputStream out)
+        throws IOException {
+            super(out);
+        }
+
+        /**
+         * @return Reference to stream being compressed (the {@code out}
+         *         field inherited from FilterOutputStream).
+         */
+        OutputStream getWrappedStream() {
+            return this.out;
+        }
+
+        /**
+         * Release the deflater's native process resources,
+         * which otherwise would not occur until either
+         * finalization or DeflaterOutputStream.close()
+         * (which would also close underlying stream).
+         */
+        public void end() {
+            // 'def' is the Deflater field inherited from DeflaterOutputStream
+            def.end();
+        }
+    }
+}
diff --git a/src/main/java/org/archive/io/WriterPoolSettings.java b/src/main/java/org/archive/io/WriterPoolSettings.java
new file mode 100644
index 00000000..d0805cdc
--- /dev/null
+++ b/src/main/java/org/archive/io/WriterPoolSettings.java
@@ -0,0 +1,39 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.File;
+import java.util.List;
+
+/**
+ * Settings object for a {@link WriterPool}.
+ * Used creating {@link WriterPoolMember}s.
+ * @author stack + * @version $Date$, $Revision$ + */ +public interface WriterPoolSettings { + public long getMaxFileSizeBytes(); + public String getPrefix(); + public String getTemplate(); + public List calcOutputDirs(); + public boolean getCompress(); + public List getMetadata(); + public boolean getFrequentFlushes(); + public int getWriteBufferSize(); +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/arc/ARC2WCDX.java b/src/main/java/org/archive/io/arc/ARC2WCDX.java new file mode 100644 index 00000000..19010131 --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARC2WCDX.java @@ -0,0 +1,243 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io.arc; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.Date; +import java.util.Iterator; +import java.util.zip.GZIPOutputStream; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HeaderGroup; +import org.apache.commons.httpclient.util.DateParseException; +import org.apache.commons.httpclient.util.DateUtil; +import org.archive.io.ArchiveRecord; +import org.archive.util.ArchiveUtils; +import org.archive.util.SURT; + +/** + * Create a 'Wide' CDX from an ARC. 
Takes one argument, the path to the ARC. + * Writes .wcdx.gz in same directory. + * + * @author gojomo + */ +public class ARC2WCDX { + final public static String WCDX_VERSION="0.1"; + + public static void main(String[] args) throws IOException { + String arcFilename = args[0]; + createWcdx(arcFilename); + } + + public static Object[] createWcdx(String arcFilename) throws IOException { + ARCReader reader = ARCReaderFactory.get(arcFilename); + Object[] retVal = createWcdx(reader); + reader.close(); + return retVal; + } + + public static Object[] createWcdx(ARCReader reader) { + reader.setDigest(true); + + String wcdxPath = reader.getReaderIdentifier().replaceAll("\\.arc(\\.gz)?$",".wcdx.gz"); + File wcdxFile = new File(wcdxPath+".open"); + PrintStream writer = null; + long count = 0; + try { + writer = new PrintStream(new GZIPOutputStream(new FileOutputStream(wcdxFile))); + + // write header: legend + timestamp + StringBuilder legend = new StringBuilder(); + appendField(legend,"CDX"); + appendField(legend,"surt-uri"); + appendField(legend,"b"); // ARC timestamp + appendField(legend,"http-date"); + appendField(legend,"s"); // status code + appendField(legend,"m"); // media type + appendField(legend,"sha1"); // content sha1 + appendField(legend,"g"); // ARC name + appendField(legend,"V"); // start offset + appendField(legend,"end-offset"); // TODO: implement + appendField(legend,"n"); // ARC record length TODO: verify + appendField(legend,"http-content-length"); + appendField(legend,"http-last-modified"); + appendField(legend,"http-expires"); + appendField(legend,"http-etag"); + appendField(legend,"http-location"); + appendField(legend,"e"); // IP + appendField(legend,"a"); // original URL + // WCDX version+creation time: crude version control + appendField(legend,WCDX_VERSION+"@"+ArchiveUtils.get14DigitDate()); + writer.println(legend.toString()); + + Iterator iter = reader.iterator(); + count = 0; + while(iter.hasNext()) { + ARCRecord record = (ARCRecord) 
iter.next(); + record.close(); + ARCRecordMetaData h = (ARCRecordMetaData) record.getHeader(); + Header[] httpHeaders = record.getHttpHeaders(); + if(httpHeaders==null) { + httpHeaders = new Header[0]; + } + HeaderGroup hg = new HeaderGroup(); + hg.setHeaders(httpHeaders); + StringBuilder builder = new StringBuilder(); + + // SURT-form URI + appendField(builder,SURT.fromURI(h.getUrl())); + // record timestamp ('b') + appendField(builder,h.getDate()); + // http header date + appendTimeField(builder,hg.getFirstHeader("Date")); + // response code ('s') + appendField(builder,h.getStatusCode()); + // media type ('m') + appendField(builder,h.getMimetype()); + // content checksum (like 'c', but here Base32 SHA1) + appendField(builder,record.getDigestStr()); + // arc name ('g') + appendField(builder,reader.getFileName()); + // compressed start offset ('V') + appendField(builder,h.getOffset()); + + // compressed end offset (?) +// appendField(builder, +// reader.getInputStream() instanceof RepositionableStream +// ? ((GzippedInputStream)reader.getInputStream()).vPosition() +// : "-"); + // TODO; leave unavail for now + appendField(builder, "-"); + + // uncompressed (declared in ARC headerline) record length + appendField(builder,h.getLength()); + // http header content-length + appendField(builder,hg.getFirstHeader("Content-Length")); + + // http header mod-date + appendTimeField(builder,hg.getFirstHeader("Last-Modified")); + // http header expires + appendTimeField(builder,hg.getFirstHeader("Expires")); + + // http header etag + appendField(builder,hg.getFirstHeader("ETag")); + // http header redirect ('Location' header?) + appendField(builder,hg.getFirstHeader("Location")); + // ip ('e') + appendField(builder,h.getIp()); + // original URI + appendField(builder,h.getUrl()); + // TODO MAYBE - a title from inside content? 
+ + writer.println(builder.toString()); + count++; + } + wcdxFile.renameTo(new File(wcdxPath)); + } catch (IOException e) { + // soldier on: but leave '.open' wcdx file as indicator of error + if(!wcdxFile.exists()) { + try { + wcdxFile.createNewFile(); + } catch (IOException e1) { + // TODO Auto-generated catch block + throw new RuntimeException(e1); + } + } + } catch (RuntimeException e) { + // soldier on: but leave '.open' wcdx file as indicator of error + if(!wcdxFile.exists()) { + try { + wcdxFile.createNewFile(); + } catch (IOException e1) { + // TODO Auto-generated catch block + throw new RuntimeException(e1); + } + } + } finally { + if(writer!=null) { + writer.close(); + } + } + + return new Object[] {wcdxPath, count}; + } + + protected static void appendField(StringBuilder builder, Object obj) { + if(builder.length()>0) { + // prepend with delimiter + builder.append(' '); + } + if(obj instanceof Header) { + obj = ((Header)obj).getValue().trim(); + } + + builder.append((obj==null||obj.toString().length()==0)?"-":obj); + } + + protected static void appendTimeField(StringBuilder builder, Object obj) { + if(builder.length()>0) { + // prepend with delimiter + builder.append(' '); + } + if(obj==null) { + builder.append("-"); + return; + } + if(obj instanceof Header) { + String s = ((Header)obj).getValue().trim(); + try { + Date date = DateUtil.parseDate(s); + String d = ArchiveUtils.get14DigitDate(date); + if(d.startsWith("209")) { + d = "199"+d.substring(3); + } + obj = d; + } catch (DateParseException e) { + builder.append('e'); + return; + } + + } + builder.append(obj); + } +} + +//'wide' CDX +//a original url +//b timestamp +//s resp code +//m type +//? content md5 (full 'k'? 'c'? +//g arc name +//V compressed start offset +//? compressed length +//n? uncompressed length +//? mod date +//? expires +//? server 'date' hdr +//? etag +//r redirect ('Location'?) +//e ip +//MAYBE: +//? TITLE from HTML or other format? 
+ + diff --git a/src/main/java/org/archive/io/arc/ARCConstants.java b/src/main/java/org/archive/io/arc/ARCConstants.java new file mode 100644 index 00000000..c44cfef7 --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCConstants.java @@ -0,0 +1,29 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io.arc; + + +/** + * Constants used by ARC files and in ARC file processing. + * + * @author stack + * @deprecated + */ +public interface ARCConstants extends org.archive.format.arc.ARCConstants { +} diff --git a/src/main/java/org/archive/io/arc/ARCLocation.java b/src/main/java/org/archive/io/arc/ARCLocation.java new file mode 100644 index 00000000..c6c64437 --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCLocation.java @@ -0,0 +1,37 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io.arc; + +/** + * Datastructure to hold ARC record location. + * Used by wayback machine. + * @author stack + */ +public interface ARCLocation { + /** + * @return Returns the ARC filename. Can be full path to ARC, URL to an + * ARC or just the portion of an ARC name that is unique to a collection. + */ + public String getName(); + + /** + * @return Returns the offset into the ARC. + */ + public long getOffset(); +} diff --git a/src/main/java/org/archive/io/arc/ARCReader.java b/src/main/java/org/archive/io/arc/ARCReader.java new file mode 100644 index 00000000..7f85cc2a --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCReader.java @@ -0,0 +1,553 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.io.arc; + +import java.io.ByteArrayOutputStream; +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.logging.Logger; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.cli.PosixParser; +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveRecord; +import org.archive.io.ArchiveRecordHeader; +import org.archive.io.RecoverableIOException; +import org.archive.io.WriterPoolMember; +import org.archive.util.ArchiveUtils; + + +/** + * Get an iterator on an ARC file or get a record by absolute position. + * + * ARC files are described here: + * Arc + * File Format. + * + *

This class knows how to parse an ARC file. Pass it a file path + * or an URL to an ARC. It can parse ARC Version 1 and 2. + * + *

Iterator returns ARCRecord + * though {@link Iterator#next()} is returning + * java.lang.Object. Cast the return. + * + *

Profiling java.io vs. memory-mapped ByteBufferInputStream shows the + * latter slightly slower -- but not by much. TODO: Test more. Just + * change {@link #getInputStream(File, long)}. + * + * @author stack + * @version $Date$ $Revision$ + */ +public abstract class ARCReader extends ArchiveReader +implements ARCConstants, Closeable { + private final Logger logger = Logger.getLogger(ARCReader.class.getName()); + + /** + * Set to true if we are aligned on first record of Archive file. + * We used to depend on offset. If offset was zero, then we were + * aligned on first record. This is no longer necessarily the case when + * Reader is created at an offset into an Archive file: The offset is zero + * but it's relative to where we started reading. + */ + private boolean alignedOnFirstRecord = true; + + private boolean parseHttpHeaders = true; + + protected ARCReader() { + super(); + } + + /** + * Skip over any trailing new lines at end of the record so we're lined up + * ready to read the next. + * @param record + * @throws IOException + */ + protected void gotoEOR(ArchiveRecord record) throws IOException { + if (getIn().available() <= 0) { + return; + } + + // Remove any trailing LINE_SEPARATOR + int c = -1; + while (getIn().available() > 0) { + if (getIn().markSupported()) { + getIn().mark(1); + } + c = getIn().read(); + if (c != -1) { + if (c == LINE_SEPARATOR) { + continue; + } + if (getIn().markSupported()) { + // We've overread. We're probably in next record. There is + // no way of telling for sure. It may be dross at end of + // current record. Backup. + getIn().reset(); + break; + } + ArchiveRecordHeader h = (getCurrentRecord() != null)? + record.getHeader(): null; + throw new IOException("Read " + (char)c + + " when only " + LINE_SEPARATOR + " expected. " + + getReaderIdentifier() + ((h != null)? + h.getHeaderFields().toString(): "")); + } + } + } + + /** + * Create new arc record. + * + * Encapsulate housekeeping that has to do w/ creating a new record. + * + *

Call this method at end of constructor to read in the + * arcfile header. There will be problems reading subsequent arc records + * if you don't, since the arcfile header has the list of metadata fields for + * all records that follow. + * + *

When parsing through ARCs writing out CDX info, we spend about + * 38% of CPU in here -- about 30% of which is in getTokenizedHeaderLine + * -- of which 16% is reading. + * + * @param is InputStream to use. + * @param offset Absolute offset into arc file. + * @return An arc record. + * @throws IOException + */ + protected ARCRecord createArchiveRecord(InputStream is, long offset) + throws IOException { + try { + String version = super.getVersion(); + ARCRecord record = new ARCRecord(is, getReaderIdentifier(), offset, + isDigest(), isStrict(), isParseHttpHeaders(), + isAlignedOnFirstRecord(), version); + if (version != null && super.getVersion() == null) + super.setVersion(version); + currentRecord(record); + } catch (IOException e) { + if (e instanceof RecoverableIOException) { + // Don't mess with RecoverableIOExceptions. Let them out. + throw e; + } + IOException newE = new IOException(e.getMessage() + " (Offset " + + offset + ")."); + newE.setStackTrace(e.getStackTrace()); + throw newE; + } + return (ARCRecord)getCurrentRecord(); + } + + /** + * Returns version of this ARC file. Usually read from first record of ARC. + * If we're reading without having first read the first record -- e.g. + * random access into middle of an ARC -- then version will not have been + * set. For now, we return a default, version 1.1. Later, if more than + * just one version of ARC, we could look at such as the meta line to see + * what version of ARC this is. + * @return Version of this ARC file. + */ + public String getVersion() { + return (super.getVersion() == null)? "1.1": super.getVersion(); + } + + protected boolean isAlignedOnFirstRecord() { + return alignedOnFirstRecord; + } + + protected void setAlignedOnFirstRecord(boolean alignedOnFirstRecord) { + this.alignedOnFirstRecord = alignedOnFirstRecord; + } + + /** + * @return Returns the parseHttpHeaders. 
+ */ + public boolean isParseHttpHeaders() { + return this.parseHttpHeaders; + } + + /** + * @param parse The parseHttpHeaders to set. + */ + public void setParseHttpHeaders(boolean parse) { + this.parseHttpHeaders = parse; + } + + public String getFileExtension() { + return ARC_FILE_EXTENSION; + } + + public String getDotFileExtension() { + return DOT_ARC_FILE_EXTENSION; + } + + protected boolean output(final String format) + throws IOException, java.text.ParseException { + boolean result = super.output(format); + if(!result && (format.equals(NOHEAD) || format.equals(HEADER))) { + throw new IOException(format + + " format only supported for single Records"); + } + return result; + } + + public boolean outputRecord(final String format) throws IOException { + boolean result = super.outputRecord(format); + if (result) { + return result; + } + if (format.equals(NOHEAD)) { + // No point digesting if dumping content. + setDigest(false); + ARCRecord r = (ARCRecord) get(); + r.skipHttpHeader(); + r.dump(); + result = true; + } else if (format.equals(HEADER)) { + // No point digesting if dumping content. + setDigest(false); + ARCRecord r = (ARCRecord) get(); + r.dumpHttpHeader(); + result = true; + } + + return result; + } + + public void dump(final boolean compress) + throws IOException, java.text.ParseException { + // No point digesting if we're doing a dump. + setDigest(false); + boolean firstRecord = true; + ARCWriter writer = null; + for (Iterator ii = iterator(); ii.hasNext();) { + ARCRecord r = (ARCRecord)ii.next(); + // We're to dump the arc on stdout. + // Get the first record's data if any. + ARCRecordMetaData meta = r.getMetaData(); + if (firstRecord) { + firstRecord = false; + // Get an ARCWriter. + ByteArrayOutputStream baos = + new ByteArrayOutputStream(r.available()); + // This is slow but done only once at top of ARC. 
+ while (r.available() > 0) { + baos.write(r.read()); + } + List listOfMetadata = new ArrayList(); + listOfMetadata.add(baos.toString(WriterPoolMember.UTF8)); + // Assume getArc returns full path to file. ARCWriter + // or new File will complain if it is otherwise. + List outDirs = new ArrayList(); + WriterPoolSettingsData settings = + new WriterPoolSettingsData("","",-1L,compress,outDirs,listOfMetadata); + writer = new ARCWriter(new AtomicInteger(), System.out, + new File(meta.getArc()), settings); + continue; + } + + writer.write(meta.getUrl(), meta.getMimetype(), meta.getIp(), + ArchiveUtils.parse14DigitDate(meta.getDate()).getTime(), + (int)meta.getLength(), r); + } + // System.out.println(System.currentTimeMillis() - start); + } + + /** + * @return an ArchiveReader that will delete a local file on close. Used + * when we bring Archive files local and need to clean up afterward. + */ + public ARCReader getDeleteFileOnCloseReader(final File f) { + final ARCReader d = this; + return new ARCReader() { + private final ARCReader delegate = d; + private File archiveFile = f; + + public void close() throws IOException { + this.delegate.close(); + if (this.archiveFile != null) { + if (archiveFile.exists()) { + archiveFile.delete(); + } + this.archiveFile = null; + } + } + + public ArchiveRecord get(long o) throws IOException { + return this.delegate.get(o); + } + + public boolean isDigest() { + return this.delegate.isDigest(); + } + + public boolean isStrict() { + return this.delegate.isStrict(); + } + + public Iterator iterator() { + return this.delegate.iterator(); + } + + public void setDigest(boolean d) { + this.delegate.setDigest(d); + } + + public void setStrict(boolean s) { + this.delegate.setStrict(s); + } + + public List validate() throws IOException { + return this.delegate.validate(); + } + + @Override + public ArchiveRecord get() throws IOException { + return this.delegate.get(); + } + + @Override + public String getVersion() { + return 
this.delegate.getVersion(); + } + + @Override + public List validate(int noRecords) throws IOException { + return this.delegate.validate(noRecords); + } + + @Override + protected ARCRecord createArchiveRecord(InputStream is, + long offset) + throws IOException { + return this.delegate.createArchiveRecord(is, offset); + } + + @Override + protected void gotoEOR(ArchiveRecord record) throws IOException { + this.delegate.gotoEOR(record); + } + + @Override + public void dump(boolean compress) + throws IOException, java.text.ParseException { + this.delegate.dump(compress); + } + + @Override + public String getDotFileExtension() { + return this.delegate.getDotFileExtension(); + } + + @Override + public String getFileExtension() { + return this.delegate.getFileExtension(); + } + }; + } + + // Static methods follow. + + /** + * + * @param formatter Help formatter instance. + * @param options Usage options. + * @param exitCode Exit code. + */ + private static void usage(HelpFormatter formatter, Options options, + int exitCode) { + formatter.printHelp("java org.archive.io.arc.ARCReader" + + " [--digest=true|false] \\\n" + + " [--format=cdx|cdxfile|dump|gzipdump|header|nohead]" + + " [--offset=#] \\\n[--strict] [--parse] ARC_FILE|ARC_URL", + options); + System.exit(exitCode); + } + + /** + * Write out the arcfile. + * + * @param reader + * @param format Format to use outputting. + * @throws IOException + * @throws java.text.ParseException + */ + protected static void output(ARCReader reader, String format) + throws IOException, java.text.ParseException { + if (!reader.output(format)) { + throw new IOException("Unsupported format: " + format); + } + } + + /** + * Generate a CDX index file for an ARC file. 
+ * + * @param urlOrPath The ARC file to generate a CDX index for + * @throws IOException + * @throws java.text.ParseException + */ + public static void createCDXIndexFile(String urlOrPath) + throws IOException, java.text.ParseException { + ARCReader r = ARCReaderFactory.get(urlOrPath); + r.setStrict(false); + r.setParseHttpHeaders(true); + r.setDigest(true); + output(r, CDX_FILE); + } + + /** + * Command-line interface to ARCReader. + * + * Here is the command-line interface: + *

+     * usage: java org.archive.io.arc.ARCReader [--offset=#] ARCFILE
+     *  -h,--help      Prints this message and exits.
+     *  -o,--offset    Outputs record at this offset into arc file.
+ * + *

See $HERITRIX_HOME/bin/arcreader for a script that'll + * take care of classpaths and the calling of ARCReader. + * + *

Outputs using a pseudo-CDX format as described here: + * CDX + * Legend and here + * Example. + * Legend used below is: 'CDX b e a m s c V (or v if uncompressed) n g'. + * Hash is hard-coded straight SHA-1 hash of content. + * + * @param args Command-line arguments. + * @throws ParseException Failed parse of the command line. + * @throws IOException + * @throws java.text.ParseException + */ + @SuppressWarnings("unchecked") + public static void main(String [] args) + throws ParseException, IOException, java.text.ParseException { + Options options = getOptions(); + options.addOption(new Option("p","parse", false, "Parse headers.")); + PosixParser parser = new PosixParser(); + CommandLine cmdline = parser.parse(options, args, false); + List cmdlineArgs = cmdline.getArgList(); + Option [] cmdlineOptions = cmdline.getOptions(); + HelpFormatter formatter = new HelpFormatter(); + + // If no args, print help. + if (cmdlineArgs.size() <= 0) { + usage(formatter, options, 0); + } + + // Now look at options passed. + long offset = -1; + boolean digest = false; + boolean strict = false; + boolean parse = false; + String format = CDX; + for (int i = 0; i < cmdlineOptions.length; i++) { + switch(cmdlineOptions[i].getId()) { + case 'h': + usage(formatter, options, 0); + break; + + case 'o': + offset = + Long.parseLong(cmdlineOptions[i].getValue()); + break; + + case 's': + strict = true; + break; + + case 'p': + parse = true; + break; + + case 'd': + digest = getTrueOrFalse(cmdlineOptions[i].getValue()); + break; + + case 'f': + format = cmdlineOptions[i].getValue().toLowerCase(); + boolean match = false; + // List of supported formats.
+ final String [] supportedFormats = + {CDX, DUMP, GZIP_DUMP, HEADER, NOHEAD, CDX_FILE}; + for (int ii = 0; ii < supportedFormats.length; ii++) { + if (supportedFormats[ii].equals(format)) { + match = true; + break; + } + } + if (!match) { + usage(formatter, options, 1); + } + break; + + default: + throw new RuntimeException("Unexpected option: " + + + cmdlineOptions[i].getId()); + } + } + + if (offset >= 0) { + if (cmdlineArgs.size() != 1) { + System.out.println("Error: Pass one arcfile only."); + usage(formatter, options, 1); + } + ARCReader arc = ARCReaderFactory.get((String)cmdlineArgs.get(0), + offset); + arc.setStrict(strict); + // We must parse headers if we need to skip them. + if (format.equals(NOHEAD) || format.equals(HEADER)) { + parse = true; + } + arc.setParseHttpHeaders(parse); + outputRecord(arc, format); + } else { + for (String urlOrPath : cmdlineArgs) { + try { + ARCReader r = ARCReaderFactory.get(urlOrPath); + r.setStrict(strict); + r.setParseHttpHeaders(parse); + r.setDigest(digest); + output(r, format); + } catch (RuntimeException e) { + // Write out name of file we failed on to help with + // debugging. Then print stack trace and try to keep + // going. We do this for case where we're being fed + // a bunch of ARCs; just note the bad one and move + // on to the next. + System.err.println("Exception processing " + urlOrPath + + ": " + e.getMessage()); + e.printStackTrace(System.err); + System.exit(1); + } + } + } + } +} diff --git a/src/main/java/org/archive/io/arc/ARCReaderFactory.java b/src/main/java/org/archive/io/arc/ARCReaderFactory.java new file mode 100644 index 00000000..e7dc1625 --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCReaderFactory.java @@ -0,0 +1,454 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. 
+ * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.arc; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Iterator; +import java.util.logging.Level; + +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveReaderFactory; +import org.archive.io.ArchiveRecord; +import org.archive.io.ArchiveRecordHeader; +import org.archive.util.FileUtils; +import org.archive.util.zip.GZIPMembersInputStream; +import org.archive.util.zip.GzipHeader; +import org.archive.util.zip.NoGzipMagicException; + +import com.google.common.io.CountingInputStream; + + +/** + * Factory that returns an ARCReader. + * + * Can handle compressed and uncompressed ARCs. + * + * @author stack + */ +public class ARCReaderFactory extends ArchiveReaderFactory +implements ARCConstants { + /** + * This factory instance. + */ + private static final ARCReaderFactory factory = new ARCReaderFactory(); + + /** + * Shutdown any access to default constructor. + */ + protected ARCReaderFactory() { + super(); + } + + public static ARCReader get(String arcFileOrUrl) + throws MalformedURLException, IOException { + return (ARCReader)ARCReaderFactory.factory. 
+ getArchiveReader(arcFileOrUrl); + } + + public static ARCReader get(String arcFileOrUrl, final long offset) + throws MalformedURLException, IOException { + return (ARCReader)ARCReaderFactory.factory. + getArchiveReader(arcFileOrUrl, offset); + } + + public static ARCReader get(final File f) throws IOException { + return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f); + } + + public static ARCReader get(final File f, final long offset) + throws IOException { + return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f, offset); + } + + protected ArchiveReader getArchiveReader(final File f, final long offset) + throws IOException { + return getArchiveReader(f, true, offset); + } + + /** + * @param f An arcfile to read. + * @param skipSuffixTest Set to true to skip the test that the ARC has the + * proper suffix. Use this method and pass true to open ARCs + * with the .open or other suffix. + * @param offset Have returned ARCReader set to start reading at passed + * offset. + * @return An ARCReader. + * @throws IOException + */ + public static ARCReader get(final File f, + final boolean skipSuffixTest, final long offset) + throws IOException { + return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f, + skipSuffixTest, offset); + } + + protected ArchiveReader getArchiveReader(final File arcFile, + final boolean skipSuffixTest, final long offset) + throws IOException { + boolean compressed = testCompressedARCFile(arcFile, skipSuffixTest); + if (!compressed) { + if (!FileUtils.isReadableWithExtensionAndMagic(arcFile, + ARC_FILE_EXTENSION, ARC_MAGIC_NUMBER)) { + throw new IOException(arcFile.getAbsolutePath() + + " is not an Internet Archive ARC file."); + } + } + return compressed? + (ARCReader)ARCReaderFactory.factory. + new CompressedARCReader(arcFile, offset): + (ARCReader)ARCReaderFactory.factory.
+ new UncompressedARCReader(arcFile, offset); + } + + public static ArchiveReader get(final String s, final InputStream is, + final boolean atFirstRecord) + throws IOException { + return ARCReaderFactory.factory.getArchiveReader(s, is, + atFirstRecord); + } + + protected ArchiveReader getArchiveReader(final String arc, + final InputStream is, final boolean atFirstRecord) + throws IOException { + + // We do this mark() reset() stuff, wrapping in a BufferedInputStream if + // necessary to make it work, because testCompressedARCStream() consumes + // some bytes from the input stream + InputStream possiblyWrapped; + if (is.markSupported()) { + possiblyWrapped = is; + } else { + possiblyWrapped = new BufferedInputStream(is); + } + + possiblyWrapped.mark(100); + boolean compressed = testCompressedARCStream(possiblyWrapped); + possiblyWrapped.reset(); + + if (compressed) { + return new CompressedARCReader(arc, possiblyWrapped, atFirstRecord); + } else { + return new UncompressedARCReader(arc, possiblyWrapped); + } + } + + /** + * Get an ARCReader aligned at offset. This version of get + * will not bring the ARC local but will try to stream across the net making + * an HTTP 1.1 Range request on remote http server (RFC 2616 Section 14.35). + * + * @param arcUrl HTTP URL for an ARC (All ARCs considered remote). + * @param offset Offset into ARC at which to start fetching. + * @return An ARCReader aligned at offset. + * @throws IOException + */ + public static ARCReader get(final URL arcUrl, final long offset) + throws IOException { + return (ARCReader)ARCReaderFactory.factory.getArchiveReader(arcUrl, + offset); + } + + /** + * Get an ARCReader. + * Pulls the ARC local into wherever the System Property + * java.io.tmpdir points. It then hands back an ARCReader that + * points at this local copy. A close on this ARCReader instance will + * remove the local copy. + * @param arcUrl An URL that points at an ARC. + * @return An ARCReader.
+ * @throws IOException + */ + public static ARCReader get(final URL arcUrl) + throws IOException { + return (ARCReader)ARCReaderFactory.factory.getArchiveReader(arcUrl); + } + + /** + * @param arcFile File to test. + * @return True if arcFile is compressed ARC. + * @throws IOException + */ + public boolean isCompressed(File arcFile) throws IOException { + return testCompressedARCFile(arcFile); + } + + /** + * Check file is compressed and in ARC GZIP format. + * + * @param arcFile File to test if its Internet Archive ARC file + * GZIP compressed. + * + * @return True if this is an Internet Archive GZIP'd ARC file (It begins + * w/ the Internet Archive GZIP header and has the + * COMPRESSED_ARC_FILE_EXTENSION suffix). + * + * @exception IOException If file does not exist or is not unreadable. + */ + public static boolean testCompressedARCFile(File arcFile) + throws IOException { + return testCompressedARCFile(arcFile, false); + } + + /** + * Check file is compressed and in ARC GZIP format. + * + * @param arcFile File to test if its Internet Archive ARC file + * GZIP compressed. + * @param skipSuffixCheck Set to true if we're not to test on the + * '.arc.gz' suffix. + * + * @return True if this is an Internet Archive GZIP'd ARC file (It begins + * w/ the Internet Archive GZIP header). + * + * @exception IOException If file does not exist or is not unreadable. + */ + public static boolean testCompressedARCFile(File arcFile, + boolean skipSuffixCheck) + throws IOException { + boolean compressedARCFile = false; + FileUtils.assertReadable(arcFile); + if(!skipSuffixCheck && !arcFile.getName().toLowerCase() + .endsWith(COMPRESSED_ARC_FILE_EXTENSION)) { + return compressedARCFile; + } + + final InputStream is = new FileInputStream(arcFile); + try { + compressedARCFile = testCompressedARCStream(is); + } finally { + is.close(); + } + return compressedARCFile; + } + + public static boolean isARCSuffix(final String arcName) { + return (arcName == null)? 
+ false: + (arcName.toLowerCase().endsWith(DOT_COMPRESSED_ARC_FILE_EXTENSION))? + true: + (arcName.toLowerCase().endsWith(DOT_ARC_FILE_EXTENSION))? + true: false; + } + + /** + * Tests passed stream is gzip stream by reading in the HEAD. + * Does not reposition the stream. That is left up to the caller. + * @param is An InputStream. + * @return True if compressed stream. + * @throws IOException + */ + public static boolean testCompressedARCStream(final InputStream is) + throws IOException { + boolean compressedARCFile = false; + GzipHeader gh = null; + try { + gh = new GzipHeader(is); + } catch (NoGzipMagicException e) { + return false; + } + + byte[] fextra = gh.getFextra(); + // Now make sure following bytes are IA GZIP comment. + // First check length. ARC_GZIP_EXTRA_FIELD includes length + // so subtract two and start compare to ARC_GZIP_EXTRA_FIELD + // at +2. + // some Alexa ARC files gzip extra fields have changed slightly + // after the first two bytes, so we'll just look for the 'LX' + // extension for valid IA ARC files. + if (fextra != null) { + if (fextra.length >= ARC_GZIP_EXTRA_FIELD.length - 2) { + if (fextra[0] == ARC_GZIP_EXTRA_FIELD[2] && + fextra[1] == ARC_GZIP_EXTRA_FIELD[3]) { + compressedARCFile = true; + } + } + } else { + // Some old arcs don't have an extra header at all, but they're still compressed + compressedARCFile = true; + } + + return compressedARCFile; + } + + /** + * Uncompressed arc file reader. + * @author stack + */ + public class UncompressedARCReader extends ARCReader { + /** + * Constructor. + * @param f Uncompressed arcfile to read. + * @throws IOException + */ + public UncompressedARCReader(final File f) + throws IOException { + this(f, 0); + } + + /** + * Constructor. + * + * @param f Uncompressed arcfile to read. + * @param offset Offset at which to position ARCReader. 
+ * @throws IOException + */ + public UncompressedARCReader(final File f, final long offset) + throws IOException { + // Arc file has been tested for existence by time it has come + // to here. + setIn(new CountingInputStream(getInputStream(f, offset))); + getIn().skip(offset); + initialize(f.getAbsolutePath()); + } + + /** + * Constructor. + * + * @param f Uncompressed arc to read. + * @param is InputStream. + */ + public UncompressedARCReader(final String f, final InputStream is) { + // Arc file has been tested for existence by time it has come + // to here. + setIn(new CountingInputStream(is)); + initialize(f); + } + } + + /** + * Compressed arc file reader. + * + * @author stack + */ + public class CompressedARCReader extends ARCReader { + + /** + * Constructor. + * + * @param f + * Compressed arcfile to read. + * @throws IOException + */ + public CompressedARCReader(final File f) throws IOException { + this(f, 0); + } + + /** + * Constructor. + * + * @param f Compressed arcfile to read. + * @param offset Position at where to start reading file. + * @throws IOException + */ + public CompressedARCReader(final File f, final long offset) + throws IOException { + // Arc file has been tested for existence by time it has come + // to here. + setIn(new GZIPMembersInputStream(getInputStream(f, offset))); + ((GZIPMembersInputStream)getIn()).compressedSeek(offset); + setCompressed((offset == 0)); // TODO: does this make sense??? + initialize(f.getAbsolutePath()); + } + + /** + * Constructor. + * + * @param f Compressed arcfile. + * @param is InputStream to use. + * @throws IOException + */ + public CompressedARCReader(final String f, final InputStream is, + final boolean atFirstRecord) + throws IOException { + // Arc file has been tested for existence by time it has come + // to here. + setIn(new GZIPMembersInputStream(is)); + setCompressed(true); + setAlignedOnFirstRecord(atFirstRecord); + initialize(f); + } + + /** + * Get record at passed offset. 
+ * + * @param offset + * Byte index into arcfile at which a record starts. + * @return An ARCRecord reference. + * @throws IOException + */ + public ARCRecord get(long offset) throws IOException { + cleanupCurrentRecord(); + ((GZIPMembersInputStream)getIn()).compressedSeek(offset); + return createArchiveRecord(getIn(), offset); + } + + public Iterator iterator() { + /** + * Override ARCRecordIterator so can base returned iterator on + * GzippedInputStream iterator. + */ + return new ArchiveRecordIterator() { + private GZIPMembersInputStream gis = + (GZIPMembersInputStream)getIn(); + + private Iterator gzipIterator = this.gis.memberIterator(); + + protected boolean innerHasNext() { + return this.gzipIterator.hasNext(); + } + + protected ArchiveRecord innerNext() throws IOException { + InputStream is = this.gzipIterator.next(); + return createArchiveRecord(is, Math.max(gis.getCurrentMemberStart(), gis.getCurrentMemberEnd())); + } + }; + } + + protected void gotoEOR(ArchiveRecord rec) throws IOException { + int c; + while ((c = getIn().read())==LINE_SEPARATOR); + if(c==-1) { + return; + } + long skipped = 1; + while (getIn().read()>-1) { + skipped++; + } + // Report on system error the number of unexpected characters + // at the end of this record. + ArchiveRecordHeader meta = (getCurrentRecord() != null)? + rec.getHeader(): null; + String message = "Record STARTING at " + + ((GZIPMembersInputStream)getIn()).getCurrentMemberStart() + + " has " + skipped + " trailing byte(s): " + + ((meta != null)? meta.toString(): ""); + if (isStrict()) { + throw new IOException(message); + } + logStdErr(Level.WARNING, message); + } + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java new file mode 100644 index 00000000..21bea07c --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCRecord.java @@ -0,0 +1,835 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). 
+ * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.arc; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.regex.Matcher; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.StatusLine; +import org.apache.commons.httpclient.util.EncodingUtil; +import org.apache.commons.lang.StringUtils; +import org.archive.io.ArchiveRecord; +import org.archive.io.ArchiveRecordHeader; +import org.archive.io.RecoverableIOException; +import org.archive.util.InetAddressUtil; +import org.archive.util.LaxHttpParser; +import org.archive.util.TextUtils; + +/** + * An ARC file record. + * Does not compass the ARCRecord metadata line, just the record content. + * @author stack + */ +public class ARCRecord extends ArchiveRecord implements ARCConstants { + /** + * Http status line object. + * + * May be null if record is not http. + */ + private StatusLine httpStatus = null; + + /** + * Http header bytes. + * + * If non-null and bytes available, give out its contents before we + * go back to the underlying stream. 
+ */ + private InputStream httpHeaderStream = null; + + /** + * Http headers. + * + * Only populated after reading of headers. + */ + private Header [] httpHeaders = null; + + /** + * Array of field names. + * + * Used to initialize headerFieldNameKeys. + */ + private final String [] headerFieldNameKeysArray = { + URL_FIELD_KEY, + IP_HEADER_FIELD_KEY, + DATE_FIELD_KEY, + MIMETYPE_FIELD_KEY, + LENGTH_FIELD_KEY + }; + + /** + * An array of the header field names found in the ARC file header on + * the 3rd line. + * + * We used to read these in from the arc file first record 3rd line but + * now we hardcode them for sake of improved performance. + */ + private final List headerFieldNameKeys = + Arrays.asList(this.headerFieldNameKeysArray); + + /** + * Http header bytes read while trying to read http header + */ + public long httpHeaderBytesRead = -1; + + /** + * record length from metadata line + */ + public long recordDeclaredLength; + + /** + * null if source was not compressed + */ + public long compressedBytes; + + /** + * actual payload data (not including trailing newline), + * should match record-declared-length + */ + public long uncompressedBytes; + + /** + * content-length header, iff HTTP and present, null otherwise + */ + public long httpPayloadDeclaredLength; + + /** + * actual http payload length, should match http-payload-declared-length + */ + public long httpPayloadActualLength; + + /** + * errors encountered reading record + */ + public List errors = new ArrayList(); + + /** + * verbatim ARC record header string + */ + private String headerString; + public String getHeaderString() { + return this.headerString; + } + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the record this instance + * is to represent. + * @param metaData Meta data. 
+ * @throws IOException + */ + public ARCRecord(InputStream in, ArchiveRecordHeader metaData) + throws IOException { + this(in, metaData, 0, true, false, true); + } + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the record this instance + * is to represent. + * @param metaData Meta data. + * @param bodyOffset Offset into the body. Usually 0. + * @param digest True if we're to calculate digest for this record. Not + * digesting saves about ~15% of cpu during an ARC parse. + * @param strict Be strict parsing (Parsing stops if ARC inproperly + * formatted). + * @param parseHttpHeaders True if we are to parse HTTP headers. Costs + * about ~20% of CPU during an ARC parse. + * @throws IOException + */ + public ARCRecord(InputStream in, ArchiveRecordHeader metaData, + int bodyOffset, boolean digest, boolean strict, + final boolean parseHttpHeaders) + throws IOException { + super(in, metaData, bodyOffset, digest, strict); + if (parseHttpHeaders) { + this.httpHeaderStream = readHttpHeader(); + } + } + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the records metadata + * this instance is to represent. + * @param identifier Identifier for this the hosting Reader. + * @param offset Current offset into in (Used to keep + * position properly aligned). Usually 0. + * @param digest True if we're to calculate digest for this record. Not + * digesting saves about ~15% of cpu during an ARC parse. + * @param strict Be strict parsing (Parsing stops if ARC inproperly + * formatted). + * @param parseHttpHeaders True if we are to parse HTTP headers. Costs + * about ~20% of CPU during an ARC parse. 
+ * @param isAllignedOnFirstRecord True if this is the first record to be + * read from an archive + * @param String version Version information to be returned to the + * ARCReader constructing this record + * + * @throws IOException + */ + public ARCRecord(InputStream in, final String identifier, + final long offset, boolean digest, boolean strict, + final boolean parseHttpHeaders, + final boolean isAlignedOnFirstRecord, String version) + throws IOException { + super(in, null, 0, digest, strict); + setHeader(parseHeaders(in, identifier, offset, strict, isAlignedOnFirstRecord, version)); + if (parseHttpHeaders) { + this.httpHeaderStream = readHttpHeader(); + } + } + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the records metadata + * this instance is to represent. + * @param identifier Identifier for this the hosting Reader. + * @param offset Current offset into in (Used to keep + * position properly aligned). Usually 0. + * @param digest True if we're to calculate digest for this record. Not + * digesting saves about ~15% of cpu during an ARC parse. + * @param strict Be strict parsing (Parsing stops if ARC inproperly + * formatted). + * @param parseHttpHeaders True if we are to parse HTTP headers. Costs + * about ~20% of CPU during an ARC parse. 
+ * + * @throws IOException + */ + public ARCRecord(InputStream in, final String identifier, + final long offset, boolean digest, boolean strict, + final boolean parseHttpHeaders) + throws IOException { + this(in, identifier, offset, digest, strict, parseHttpHeaders, + false, null); + } + + private ArchiveRecordHeader parseHeaders(final InputStream in, + final String identifier, final long offset, final boolean strict, + final boolean isAlignedOnFirstRecord, String version) + throws IOException { + + ArrayList firstLineValues = new ArrayList(20); + getTokenizedHeaderLine(in, firstLineValues); + + int bodyOffset = 0; + if (offset == 0 && isAlignedOnFirstRecord) { + // If offset is zero and we were aligned at first record on + // creation (See #alignedOnFirstRecord for more on this), then no + // records have been read yet and we're reading our first one, the + // record of ARC file meta info. Its special. In ARC versions + // 1.x, first record has three lines of meta info. We've just read + // the first line. There are two more. The second line has misc. + // info. We're only interested in the first field, the version + // number. The third line is the list of field names. Here's what + // ARC file version 1.x meta content looks like: + // + // filedesc://testIsBoundary-JunitIAH200401070157520.arc 0.0.0.0 \\ + // 20040107015752 text/plain 77 + // 1 0 InternetArchive + // URL IP-address Archive-date Content-type Archive-length + // + ArrayList secondLineValues = new ArrayList(20); + bodyOffset += getTokenizedHeaderLine(in, secondLineValues); + version = ((String)secondLineValues.get(0) + + "." + (String)secondLineValues.get(1)); + // Just read over the 3rd line. We used to parse it and use + // values found here but now we just hardcode them to avoid + // having to read this 3rd line even for random arc file accesses. 
+ bodyOffset += getTokenizedHeaderLine(in, null); + // this.position = bodyOffset; + } + setBodyOffset(bodyOffset); + + return computeMetaData(this.headerFieldNameKeys, firstLineValues, version, offset, identifier); + } + + /** + * Get a record header line as list of tokens. + * + * We keep reading till we find a LINE_SEPARATOR or we reach the end + * of file w/o finding a LINE_SEPARATOR or the line length is crazy. + * + * @param stream InputStream to read from. + * @param list Empty list that gets filled w/ string tokens. + * @return Count of characters read. + * @exception IOException If problem reading stream or no line separator + * found or EOF before EOL or we didn't get minimum header fields. + */ + private int getTokenizedHeaderLine(final InputStream stream, + List list) throws IOException { + // Preallocate usual line size. + StringBuilder buffer = new StringBuilder(2048 + 20); + int read = 0; + int previous = -1; + for (int c = -1; true;) { + previous = c; + c = stream.read(); + if (c == -1) { + throw new RecoverableIOException("Hit EOF before header EOL."); + } + c &= 0xff; + read++; + if (read > MAX_HEADER_LINE_LENGTH) { + throw new IOException("Header line longer than max allowed " + + " -- " + String.valueOf(MAX_HEADER_LINE_LENGTH) + + " -- or passed buffer doesn't contain a line (Read: " + + buffer.length() + "). Here's" + + " some of what was read: " + + buffer.substring(0, Math.min(buffer.length(), 256))); + } + + if (c == LINE_SEPARATOR) { + if (buffer.length() == 0) { + // Empty line at start of buffer. Skip it and try again. + continue; + } + + if (list != null) { + list.add(buffer.toString()); + } + // LOOP TERMINATION. + break; + } else if (c == HEADER_FIELD_SEPARATOR) { + if (!isStrict() && previous == HEADER_FIELD_SEPARATOR) { + // Early ARCs sometimes had multiple spaces between fields. 
+ continue; + } + if (list != null) { + list.add(buffer.toString()); + } + // reset to empty + buffer.setLength(0); + } else { + buffer.append((char)c); + } + } + + // List must have at least 3 elements in it and no more than 10. If + // it has other than this, then bogus parse. + if (list != null && (list.size() < 3 || list.size() > 100)) { + throw new IOException("Unparseable header line: " + list); + } + + // save verbatim header String + this.headerString = StringUtils.join(list," "); + + return read; + } + + /** + * Compute metadata fields. + * + * Here we check the meta field has right number of items in it. + * + * @param keys Keys to use composing headerFields map. + * @param values Values to set into the headerFields map. + * @param v The version of this ARC file. + * @param offset Offset into arc file. + * + * @return Metadata structure for this record. + * + * @exception IOException If no. of keys doesn't match no. of values. + */ + private ARCRecordMetaData computeMetaData(List keys, + List values, String v, long offset, final String identifier) + throws IOException { + if (keys.size() != values.size()) { + List originalValues = values; + if (!isStrict()) { + values = fixSpaceInURL(values, keys.size()); + // If values still doesn't match key size, try and do + // further repair. + if (keys.size() != values.size()) { + // Early ARCs had a space in mimetype. + if (values.size() == (keys.size() + 1) && + values.get(4).toLowerCase().startsWith("charset=")) { + List nuvalues = + new ArrayList(keys.size()); + nuvalues.add(0, values.get(0)); + nuvalues.add(1, values.get(1)); + nuvalues.add(2, values.get(2)); + nuvalues.add(3, values.get(3) + values.get(4)); + nuvalues.add(4, values.get(5)); + values = nuvalues; + } else if((values.size() + 1) == keys.size() && + isLegitimateIPValue(values.get(1)) && + isDate(values.get(2)) && isNumber(values.get(3))) { + // Mimetype is empty. 
+ List nuvalues = + new ArrayList(keys.size()); + nuvalues.add(0, values.get(0)); + nuvalues.add(1, values.get(1)); + nuvalues.add(2, values.get(2)); + nuvalues.add(3, "-"); + nuvalues.add(4, values.get(3)); + values = nuvalues; + } + } + } + if (keys.size() != values.size()) { + throw new IOException("Size of field name keys does" + + " not match count of field values: " + values); + } + // Note that field was fixed on stderr. + System.err.println(Level.WARNING.toString() + "Fixed spaces in metadata line at " + + "offset " + offset + + " Original: " + originalValues + ", New: " + values); + } + + Map headerFields = + new HashMap(keys.size() + 2); + for (int i = 0; i < keys.size(); i++) { + headerFields.put(keys.get(i), values.get(i)); + } + + // Add a check for tabs in URLs. If any, replace with '%09'. + // See https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966, + // [ 1010966 ] crawl.log has URIs with spaces in them. + String url = (String)headerFields.get(URL_FIELD_KEY); + if (url != null && url.indexOf('\t') >= 0) { + headerFields.put(URL_FIELD_KEY, + TextUtils.replaceAll("\t", url, "%09")); + } + + headerFields.put(VERSION_FIELD_KEY, v); + headerFields.put(ABSOLUTE_OFFSET_KEY, new Long(offset)); + + return new ARCRecordMetaData(identifier, headerFields); + } + + /** + * Fix space in URLs. + * The ARCWriter used to write into the ARC URLs with spaces in them. + * See [ 1010966 ] + * crawl.log has URIs with spaces in them. + * This method does fix up on such headers converting all spaces found + * to '%20'. + * @param values List of metadata values. + * @param requiredSize Expected size of resultant values list. + * @return New list if we successfully fixed up values or original if + * fixup failed. + */ + private List fixSpaceInURL(List values, int requiredSize) { + // Do validity check. 3rd from last is a date of 14 numeric + // characters. 
The 4th from last is IP, all before the IP + // should be concatenated together with a '%20' joiner. + // In the below, '4' is 4th field from end which has the IP. + if (!(values.size() > requiredSize) || values.size() < 4) { + return values; + } + // Test 3rd field is valid date. + if (!isDate((String) values.get(values.size() - 3))) { + return values; + } + + // Test 4th field is valid IP. + if (!isLegitimateIPValue((String) values.get(values.size() - 4))) { + return values; + } + + List newValues = new ArrayList(requiredSize); + StringBuffer url = new StringBuffer(); + for (int i = 0; i < (values.size() - 4); i++) { + if (i > 0) { + url.append("%20"); + } + url.append(values.get(i)); + } + newValues.add(url.toString()); + for (int i = values.size() - 4; i < values.size(); i++) { + newValues.add(values.get(i)); + } + return newValues; + } + + private boolean isDate(final String date) { + if (date.length() != 14) { + return false; + } + return isNumber(date); + } + + private boolean isNumber(final String n) { + for (int i = 0; i < n.length(); i++) { + if (!Character.isDigit(n.charAt(i))) { + return false; + } + } + return true; + } + + private boolean isLegitimateIPValue(final String ip) { + if ("-".equals(ip)) { + return true; + } + Matcher m = InetAddressUtil.IPV4_QUADS.matcher(ip); + return m != null && m.matches(); + } + + /** + * Skip over the the http header if one present. + * + * Subsequent reads will get the body. + * + *

Calling this method in the midst of reading the header + * will make for strange results. Otherwise it is safe to call + * at any time, though before reading any of the arc record + * content is the only time that it makes sense. + * + *

After calling this method, you can call + * {@link #getHttpHeaders()} to get the read http header. + * + * @throws IOException + */ + public void skipHttpHeader() throws IOException { + if (this.httpHeaderStream != null) { + // Empty the httpHeaderStream + for (int available = this.httpHeaderStream.available(); + this.httpHeaderStream != null && + (available = this.httpHeaderStream.available()) > 0;) { + // We should be in this loop once only we should only do this + // buffer allocation once. + byte [] buffer = new byte[available]; + // The read nulls out httpHeaderStream when done with it so + // need check for null in the loop control line. + read(buffer, 0, available); + } + } + } + + public void dumpHttpHeader() throws IOException { + if (this.httpHeaderStream == null) { + return; + } + // Dump the httpHeaderStream to STDOUT + for (int available = this.httpHeaderStream.available(); + this.httpHeaderStream != null + && (available = this.httpHeaderStream.available()) > 0;) { + // We should be in this loop only once and should do this + // buffer allocation once. + byte[] buffer = new byte[available]; + // The read nulls out httpHeaderStream when done with it so + // need check for null in the loop control line. + int read = read(buffer, 0, available); + System.out.write(buffer, 0, read); + } + } + + /** + * Read http header if present. Technique borrowed from HttpClient HttpParse + * class. set errors when found. + * + * @return ByteArrayInputStream with the http header in it or null if no + * http header. + * @throws IOException + */ + private InputStream readHttpHeader() throws IOException { + + // this can be helpful when simply iterating over records, + // looking for problems. + Logger logger = Logger.getLogger(this.getClass().getName()); + ArchiveRecordHeader h = this.getHeader(); + + // If judged a record that doesn't have an http header, return + // immediately. 
+ String url = getHeader().getUrl(); + if(!url.startsWith("http") || + getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) { + return null; + } + + String statusLine; + byte[] statusBytes; + int eolCharCount = 0; + int errOffset = 0; + + // Read status line, skipping any errant http headers found before it + // This allows a larger number of 'corrupt' arcs -- where headers were accidentally + // inserted before the status line to be readable + while (true) { + statusBytes = LaxHttpParser.readRawLine(getIn()); + eolCharCount = getEolCharsCount(statusBytes); + if (eolCharCount <= 0) { + throw new RecoverableIOException( + "Failed to read http status where one was expected: " + + ((statusBytes == null) ? "" : new String(statusBytes))); + } + + statusLine = EncodingUtil.getString(statusBytes, 0, + statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); + + // If a null or DELETED break immediately + if ((statusLine == null) || statusLine.startsWith("DELETED")) { + break; + } + + // If it's actually the status line, break, otherwise continue skipping any + // previous header values + if (!statusLine.contains(":") && StatusLine.startsWithHTTP(statusLine)) { + break; + } + + // Add bytes read to error "offset" to add to position + errOffset += statusBytes.length; + } + + if (errOffset > 0) { + this.incrementPosition(errOffset); + } + + if ((statusLine == null) || + !StatusLine.startsWithHTTP(statusLine)) { + if (statusLine.startsWith("DELETED")) { + // Some old ARCs have deleted records like following: + // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202 + // DELETED_TIME=20000425001133_DELETER=Kurt_REASON=alexalist + // (follows ~29K spaces) + // For now, throw a RecoverableIOException so if iterating over + // records, we keep going. TODO: Later make a legitimate + // ARCRecord from the deleted record rather than throw + // exception. 
+ throw new DeletedARCRecordIOException(statusLine); + } else { + this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_INVALID); + } + } + + try { + this.httpStatus = new StatusLine(statusLine); + } catch(IOException e) { + logger.warning(e.getMessage() + " at offset: " + h.getOffset()); + this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_EXCEPTION); + } + + // Save off all bytes read. Keep them as bytes rather than + // convert to strings so we don't have to worry about encodings + // though this should never be a problem doing http headers since + // its all supposed to be ascii. + ByteArrayOutputStream baos = + new ByteArrayOutputStream(statusBytes.length + 4 * 1024); + baos.write(statusBytes); + + // Now read rest of the header lines looking for the separation + // between header and body. + for (byte [] lineBytes = null; true;) { + lineBytes = LaxHttpParser.readRawLine(getIn()); + eolCharCount = getEolCharsCount(lineBytes); + if (eolCharCount <= 0) { + if (getIn().available() == 0) { + httpHeaderBytesRead += statusBytes.length; + logger.warning("HTTP header truncated at offset: " + h.getOffset()); + this.errors.add(ArcRecordErrors.HTTP_HEADER_TRUNCATED); + this.setEor(true); + break; + } else { + throw new IOException("Failed reading http headers: " + + ((lineBytes != null)? new String(lineBytes): null)); + } + } else { + httpHeaderBytesRead += lineBytes.length; + } + // Save the bytes read. + baos.write(lineBytes); + if ((lineBytes.length - eolCharCount) <= 0) { + // We've finished reading the http header. + break; + } + } + + byte [] headerBytes = baos.toByteArray(); + // Save off where body starts. + this.getMetaData().setContentBegin(headerBytes.length); + ByteArrayInputStream bais = + new ByteArrayInputStream(headerBytes); + if (!bais.markSupported()) { + throw new IOException("ByteArrayInputStream does not support mark"); + } + bais.mark(headerBytes.length); + // Read the status line. Don't let it into the parseHeaders function. 
+ // It doesn't know what to do with it. + bais.read(statusBytes, 0, statusBytes.length); + this.httpHeaders = LaxHttpParser.parseHeaders(bais, + ARCConstants.DEFAULT_ENCODING); + this.getMetaData().setStatusCode(Integer.toString(getStatusCode())); + bais.reset(); + return bais; + } + + private static class DeletedARCRecordIOException + extends RecoverableIOException { + private static final long serialVersionUID = 1L; + + public DeletedARCRecordIOException(final String reason) { + super(reason); + } + } + + /** + * Return status code for this record. + * + * This method will return -1 until the http header has been read. + * @return Status code. + */ + public int getStatusCode() { + return (this.httpStatus == null)? -1: this.httpStatus.getStatusCode(); + } + + /** + * @param bytes Array of bytes to examine for an EOL. + * @return Count of end-of-line characters or zero if none. + */ + private int getEolCharsCount(byte [] bytes) { + int count = 0; + if (bytes != null && bytes.length >=1 && + bytes[bytes.length - 1] == '\n') { + count++; + if (bytes.length >=2 && bytes[bytes.length -2] == '\r') { + count++; + } + } + return count; + } + + /** + * @return Meta data for this record. + */ + public ARCRecordMetaData getMetaData() { + return (ARCRecordMetaData)getHeader(); + } + + /** + * @return http headers (Only available after header has been read). + */ + public Header [] getHttpHeaders() { + return this.httpHeaders; + } + + /** + * @return ArcRecordErrors encountered when reading + */ + public List getErrors() { + return this.errors; + } + + /** + * @return true if ARC record errors found + */ + public boolean hasErrors() { + return !this.errors.isEmpty(); + } + + /** + * @return Next character in this ARCRecord's content else -1 if at end of + * this record. 
+ * @throws IOException + */ + public int read() throws IOException { + int c = -1; + if (this.httpHeaderStream != null && + (this.httpHeaderStream.available() > 0)) { + // If http header, return bytes from it before we go to underlying + // stream. + c = this.httpHeaderStream.read(); + // If done with the header stream, null it out. + if (this.httpHeaderStream.available() <= 0) { + this.httpHeaderStream = null; + } + incrementPosition(); + } else { + c = super.read(); + } + return c; + } + + public int read(byte [] b, int offset, int length) throws IOException { + int read = -1; + if (this.httpHeaderStream != null && + (this.httpHeaderStream.available() > 0)) { + // If http header, return bytes from it before we go to underlying + // stream. + read = Math.min(length, this.httpHeaderStream.available()); + if (read == 0) { + read = -1; + } else { + read = this.httpHeaderStream.read(b, offset, read); + } + // If done with the header stream, null it out. + if (this.httpHeaderStream.available() <= 0) { + this.httpHeaderStream = null; + } + incrementPosition(read); + } else { + read = super.read(b, offset, length); + } + return read; + } + + /** + * @return Offset at which the body begins (Only known after + * header has been read) or -1 if none or if we haven't read + * headers yet. Usually length of HTTP headers (does not include ARC + * metadata line length). + */ + public int getBodyOffset() { + return this.getMetaData().getContentBegin(); + } + + @Override + protected String getIp4Cdx(ArchiveRecordHeader h) { + String result = null; + if (h instanceof ARCRecordMetaData) { + result = ((ARCRecordMetaData)h).getIp(); + } + return (result != null)? result: super.getIp4Cdx(h); + } + + @Override + protected String getStatusCode4Cdx(ArchiveRecordHeader h) { + String result = null; + if (h instanceof ARCRecordMetaData) { + result = ((ARCRecordMetaData) h).getStatusCode(); + } + return (result != null) ? 
result: super.getStatusCode4Cdx(h); + } + + @Override + protected String getDigest4Cdx(ArchiveRecordHeader h) { + String result = null; + if (h instanceof ARCRecordMetaData) { + result = ((ARCRecordMetaData) h).getDigest(); + } + return (result != null) ? result: super.getDigest4Cdx(h); + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/arc/ARCRecordMetaData.java b/src/main/java/org/archive/io/arc/ARCRecordMetaData.java new file mode 100644 index 00000000..3f617041 --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCRecordMetaData.java @@ -0,0 +1,267 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.arc; + +import java.io.File; +import java.io.IOException; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; + +import org.archive.io.ArchiveRecordHeader; + + +/** + * An immutable class to hold an ARC record meta data. + * + * @author stack + */ +public class ARCRecordMetaData implements ArchiveRecordHeader, ARCConstants { + /** + * Map of record header fields. + * + * We store all in a hashmap. This way we can hold version 1 or + * version 2 record meta data. + * + *

Keys are lowercase. + */ + protected Map headerFields = null; + + /** + * Digest for the record. + * + * Only available after the record has been read in totality. + */ + private String digest = null; + + /** + * Status for this request. + * + * There may be no status. + */ + private String statusCode = null; + + /** + * The arc this metadata came out. + * Descriptive String, either path or URL. + */ + private String arc = null; + + private int contentBegin = 0; + + /** + * Shut down the default constructor. + */ + protected ARCRecordMetaData() { + super(); + } + + /** + * Constructor. + * + * @param arc The arc file this metadata came out of. + * @param headerFields Hash of meta fields. + * + * @throws IOException + */ + public ARCRecordMetaData(final String arc, Map headerFields) + throws IOException { + // Make sure the minimum required fields are present, + for (Iterator i = REQUIRED_VERSION_1_HEADER_FIELDS.iterator(); + i.hasNext(); ) { + testRequiredField(headerFields, (String)i.next()); + } + this.headerFields = headerFields; + this.arc = arc; + } + + /** + * Test required field is present in hash. + * + * @param fields Map of fields. + * @param requiredField Field to test for. + * + * @exception IOException If required field is not present. + */ + protected void testRequiredField(Map fields, String requiredField) + throws IOException { + if (!fields.containsKey(requiredField)) { + throw new IOException("Required field " + requiredField + + " not in meta data."); + } + } + + /** + * Get the time when the record was harvested. + *

+ * Returns the date in Heritrix 14 digit time format (UTC). See the + * {@link org.archive.util.ArchiveUtils} class for converting to Java + * dates. + * + * @return Header date in Heritrix 14 digit format. + * @see org.archive.util.ArchiveUtils#parse14DigitDate(String) + */ + public String getDate() { + return (String) this.headerFields.get(DATE_FIELD_KEY); + } + + /** + * @return Return length of the record. + */ + public long getLength() { + return Long.parseLong((String)this.headerFields. + get(LENGTH_FIELD_KEY)); + } + + /** + * @return Return Content-Length of the contents of the record + * Same as record length for arcs? TODO + */ + public long getContentLength() { + return getLength(); + } + + /** + * @return Header url. + */ + public String getUrl() { + return (String)this.headerFields.get(URL_FIELD_KEY); + } + + /** + * @return IP. + */ + public String getIp() + { + return (String)this.headerFields.get(IP_HEADER_FIELD_KEY); + } + + /** + * @return mimetype The mimetype that is in the ARC metaline -- NOT the http + * content-type content. + */ + public String getMimetype() { + return (String)this.headerFields.get(MIMETYPE_FIELD_KEY); + } + + /** + * @return Arcfile version. + */ + public String getVersion() { + return (String)this.headerFields.get(VERSION_FIELD_KEY); + } + + /** + * @return Offset into arcfile at which this record begins. + */ + public long getOffset() { + return ((Long)this.headerFields.get(ABSOLUTE_OFFSET_KEY)).longValue(); + } + + /** + * @param key Key to use looking up field value. + * @return value for passed key of null if no such entry. + */ + public Object getHeaderValue(String key) { + return this.headerFields.get(key); + } + + /** + * @return Header field name keys. + */ + public Set getHeaderFieldKeys() + { + return this.headerFields.keySet(); + } + + /** + * @return Map of header fields. + */ + public Map getHeaderFields() { + return this.headerFields; + } + + /** + * @return Returns identifier for ARC. 
+ */ + public String getArc() { + return this.arc; + } + + /** + * @return Convenience method that does a + * return new File(this.arc) (Be aware this.arc is not always + * full path to an ARC file -- may be an URL). Test + * returned file for existence. + */ + public File getArcFile() { + return new File(this.arc); + } + + /** + * @return Returns the digest. + */ + public String getDigest() { + return this.digest; + } + + /** + * @param d The digest to set. + */ + public void setDigest(String d) { + this.digest = d; + } + + /** + * @return Returns the statusCode. May be null. + */ + public String getStatusCode() { + return this.statusCode; + } + + /** + * @param statusCode The statusCode to set. + */ + public void setStatusCode(String statusCode) { + this.statusCode = statusCode; + } + + public String toString() { + return ((this.arc != null)? this.arc: "") + + ": " + + ((this.headerFields != null)? this.headerFields.toString(): ""); + } + + public String getReaderIdentifier() { + return this.getArc(); + } + + public String getRecordIdentifier() { + return getDate() + "/" + getUrl(); + } + + public int getContentBegin() { + return this.contentBegin; + } + + protected void setContentBegin(final int offset) { + this.contentBegin = offset; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/arc/ARCUtils.java b/src/main/java/org/archive/io/arc/ARCUtils.java new file mode 100644 index 00000000..985457e2 --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCUtils.java @@ -0,0 +1,240 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io.arc; + +import it.unimi.dsi.fastutil.io.RepositionableStream; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; +import java.net.URISyntaxException; + +import org.archive.url.UsableURI; +import org.archive.util.zip.GzipHeader; +import org.archive.util.zip.NoGzipMagicException; + +public class ARCUtils implements ARCConstants { + /** + * @param pathOrUri Path or URI to extract arc filename from. + * @return Extracted arc file name. + * @throws URISyntaxException + */ + public static String parseArcFilename(final String pathOrUri) + throws URISyntaxException { + String path = pathOrUri; + if (UsableURI.hasScheme(pathOrUri)) { + URI url = new URI(pathOrUri); + path = url.getPath(); + } + return (new File(path)).getName(); + } + + /** + * @param arcFile File to test. + * @return True if arcFile is compressed ARC. + * @throws IOException + */ + public static boolean isCompressed(File arcFile) throws IOException { + return testCompressedARCFile(arcFile); + } + + /** + * Check file is compressed and in ARC GZIP format. + * + * @param arcFile File to test if its Internet Archive ARC file + * GZIP compressed. + * + * @return True if this is an Internet Archive GZIP'd ARC file (It begins + * w/ the Internet Archive GZIP header and has the + * COMPRESSED_ARC_FILE_EXTENSION suffix). + * + * @exception IOException If file does not exist or is not unreadable. 
+ */ + public static boolean testCompressedARCFile(File arcFile) + throws IOException { + return testCompressedARCFile(arcFile, false); + } + + /** + * Check file is compressed and in ARC GZIP format. + * + * @param arcFile File to test if its Internet Archive ARC file + * GZIP compressed. + * @param skipSuffixCheck Set to true if we're not to test on the + * '.arc.gz' suffix. + * + * @return True if this is an Internet Archive GZIP'd ARC file (It begins + * w/ the Internet Archive GZIP header). + * + * @exception IOException If file does not exist or is not unreadable. + */ + public static boolean testCompressedARCFile(File arcFile, + boolean skipSuffixCheck) + throws IOException { + boolean compressedARCFile = false; + isReadable(arcFile); + if(!skipSuffixCheck && !arcFile.getName().toLowerCase() + .endsWith(COMPRESSED_ARC_FILE_EXTENSION)) { + return compressedARCFile; + } + + final InputStream is = new FileInputStream(arcFile); + try { + compressedARCFile = testCompressedARCStream(is); + } finally { + is.close(); + } + return compressedARCFile; + } + + /** + * Tests passed stream is gzip stream by reading in the HEAD. + * Does not reposition the stream. That is left up to the caller. + * @param is An InputStream. + * @return True if compressed stream. + * @throws IOException + */ + public static boolean testCompressedARCStream(final InputStream is) + throws IOException { + boolean compressedARCFile = false; + GzipHeader gh = null; + try { + gh = new GzipHeader(is); + } catch (NoGzipMagicException e ) { + return compressedARCFile; + } + + byte[] fextra = gh.getFextra(); + // Now make sure following bytes are IA GZIP comment. + // First check length. ARC_GZIP_EXTRA_FIELD includes length + // so subtract two and start compare to ARC_GZIP_EXTRA_FIELD + // at +2. 
+ if (fextra != null && + ARC_GZIP_EXTRA_FIELD.length - 2 == fextra.length) { + compressedARCFile = true; + for (int i = 0; i < fextra.length; i++) { + if (fextra[i] != ARC_GZIP_EXTRA_FIELD[i + 2]) { + compressedARCFile = false; + break; + } + } + } + return compressedARCFile; + } + + /** + * Tests passed stream is gzip stream by reading in the HEAD. + * Does reposition of stream when done. + * @param rs An InputStream that is Repositionable. + * @return True if compressed stream. + * @throws IOException + */ + public static boolean testCompressedRepositionalStream( + final RepositionableStream rs) + throws IOException { + boolean compressedARCFile = false; + long p = rs.position(); + try { + compressedARCFile = testCompressedStream((InputStream)rs); + } finally { + rs.position(p); + } + return compressedARCFile; + } + + /** + * Tests passed stream is gzip stream by reading in the HEAD. + * Does reposition of stream when done. + * @param is An InputStream. + * @return True if compressed stream. + * @throws IOException + */ + public static boolean testCompressedStream(final InputStream is) + throws IOException { + boolean compressedARCFile = false; + try { + new GzipHeader(is); + compressedARCFile = true; + } catch (NoGzipMagicException e) { + return compressedARCFile; + } + return compressedARCFile; + } + + /** + * Check file is uncompressed ARC file. + * + * @param arcFile + * File to test if its Internet Archive ARC file uncompressed. + * + * @return True if this is an Internet Archive ARC file. + * + * @exception IOException + * If file does not exist or is not unreadable. 
+ */ + public static boolean testUncompressedARCFile(File arcFile) + throws IOException { + boolean uncompressedARCFile = false; + isReadable(arcFile); + if(arcFile.getName().toLowerCase().endsWith(ARC_FILE_EXTENSION)) { + FileInputStream fis = new FileInputStream(arcFile); + try { + byte [] b = new byte[ARC_MAGIC_NUMBER.length()]; + int read = fis.read(b, 0, ARC_MAGIC_NUMBER.length()); + fis.close(); + if (read == ARC_MAGIC_NUMBER.length()) { + StringBuffer beginStr + = new StringBuffer(ARC_MAGIC_NUMBER.length()); + for (int i = 0; i < ARC_MAGIC_NUMBER.length(); i++) { + beginStr.append((char)b[i]); + } + + if (beginStr.toString(). + equalsIgnoreCase(ARC_MAGIC_NUMBER)) { + uncompressedARCFile = true; + } + } + } finally { + fis.close(); + } + } + + return uncompressedARCFile; + } + + + /** + * @param arcFile File to test. + * @exception IOException If file does not exist or is not unreadable. + */ + private static void isReadable(File arcFile) throws IOException { + if (!arcFile.exists()) { + throw new FileNotFoundException(arcFile.getAbsolutePath() + + " does not exist."); + } + + if (!arcFile.canRead()) { + throw new FileNotFoundException(arcFile.getAbsolutePath() + + " is not readable."); + } + } +} diff --git a/src/main/java/org/archive/io/arc/ARCWriter.java b/src/main/java/org/archive/io/arc/ARCWriter.java new file mode 100644 index 00000000..b5825d50 --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCWriter.java @@ -0,0 +1,459 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.arc; + +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.Closeable; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.PrintStream; +import java.io.UnsupportedEncodingException; +import java.util.Iterator; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.archive.io.ReplayInputStream; +import org.archive.io.WriterPoolMember; +import org.archive.io.WriterPoolSettings; +import org.archive.util.ArchiveUtils; +import org.archive.util.DevUtils; +import org.archive.util.MimetypeUtils; + + +/** + * Write ARC files. + * + * Assumption is that the caller is managing access to this ARCWriter ensuring + * only one thread of control accessing this ARC file instance at any one time. + * + *

ARC files are described here: + * Arc + * File Format. This class does version 1 of the ARC file format. It also + * writes version 1.1 which is version 1 with data stuffed into the body of the + * first arc record in the file, the arc file meta record itself. + * + *

An ARC file is three lines of meta data followed by an optional 'body' and + * then a couple of '\n' and then: record, '\n', record, '\n', record, etc. + * If we are writing compressed ARC files, then each of the ARC file records is + * individually gzipped and concatenated together to make up a single ARC file. + * In GZIP terms, each ARC record is a GZIP member of a total gzip'd + * file. + * + *

The GZIPping of the ARC file meta data is exceptional. It is GZIPped + * w/ an extra GZIP header, a special Internet Archive (IA) extra header field + * (e.g. FEXTRA is set in the GZIP header FLG field and an extra field is + * appended to the GZIP header). The extra field has little in it but its + * presence denotes this GZIP as an Internet Archive gzipped ARC. See RFC1952 + * to learn about the GZIP header structure. + * + *

This class then does its GZIPping in the following fashion. Each GZIP + * member is written w/ a new instance of GZIPOutputStream -- actually + * ARCWriterGZIPOutputStream so we can get access to the underlying stream. + * The underlying stream stays open across GZIPOutputStream instantiations. + * For the 'special' GZIPping of the ARC file meta data, we cheat by catching the + * GZIPOutputStream output into a byte array, manipulating it to add the + * IA GZIP header, before writing to the stream. + * + *

I tried writing a resettable GZIPOutputStream and could make it work w/ + * the SUN JDK but the IBM JDK threw an NPE inside deflate.reset -- its zlib + * native call doesn't seem to like the notion of resetting -- so I gave up on + * it. + * + *

Because of issues such as the above and troubles with GZIPInputStream, we should + * write our own GZIP*Streams, ones that are resettable and conscious of gzip + * members. + * + *

This class will write until we hit >= maxSize. The check is done at + * record boundary. Records do not span ARC files. We will then close current + * file and open another and then continue writing. + * + *

TESTING: Here is how to test that produced ARC files are good + * using the + * alexa + * ARC c-tools: + *

+ * % av_procarc hx20040109230030-0.arc.gz | av_ziparc > \
+ *     /tmp/hx20040109230030-0.dat.gz
+ * % av_ripdat /tmp/hx20040109230030-0.dat.gz > /tmp/hx20040109230030-0.cdx
+ * 
+ * Examine the produced cdx file to make sure it makes sense. Search + * for 'no-type 0'. If found, then we're opening a gzip record w/o data to + * write. This is bad. + * + *

You can also do gzip -t FILENAME and it will tell you if the + * ARC makes sense to GZIP. + * + *

While being written, ARCs have a '.open' suffix appended. + * + * @author stack + */ +public class ARCWriter extends WriterPoolMember implements ARCConstants, Closeable { + private static final Logger logger = + Logger.getLogger(ARCWriter.class.getName()); + + /** + * Metadata line pattern. + */ + private static final Pattern METADATA_LINE_PATTERN = + Pattern.compile("^\\S+ \\S+ \\S+ \\S+ \\S+(" + LINE_SEPARATOR + "?)$"); + + + /** + * Constructor. + * Takes a stream. Use with caution. There is no upperbound check on size. + * Will just keep writing. + * + * @param serialNo used to generate unique file name sequences + * @param out Where to write. + * @param arc File the out is connected to. + * @param cmprs Compress the content written. + * @param metadata File meta data. Can be null. Is list of File and/or + * String objects. + * @param a14DigitDate If null, we'll write current time. + * @throws IOException + */ + public ARCWriter(final AtomicInteger serialNo, final PrintStream out, + final File arc, final WriterPoolSettings settings) + throws IOException { + super(serialNo, out, arc, settings); + writeFirstRecord(ArchiveUtils.get14DigitDate()); + } + + /** + * Constructor. + * + * @param serialNo used to generate unique file name sequences + * @param settings all creation parameters + */ + public ARCWriter(final AtomicInteger serialNo, final WriterPoolSettings settings) { + super(serialNo, settings, ARC_FILE_EXTENSION); + + } + + protected String createFile() + throws IOException { + String name = super.createFile(); + writeFirstRecord(currentTimestamp); + return name; + } + + private void writeFirstRecord(final String ts) + throws IOException { + write(generateARCFileMetaData(ts)); + } + + /** + * Write out the ARCMetaData. + * + *

Generate ARC file meta data. Currently we only do version 1 of the + * ARC file formats or version 1.1 when metadata has been supplied (We + * write it into the body of the first record in the arc file). + * + *

Version 1 metadata looks roughly like this: + * + *

filedesc://testWriteRecord-JunitIAH20040110013326-2.arc 0.0.0.0 \\
+     *  20040110013326 text/plain 77
+     * 1 0 InternetArchive
+     * URL IP-address Archive-date Content-type Archive-length
+     * 
+ * + *

If compress is set, then we generate a header that has been gzipped + * in the Internet Archive manner. Such a gzipping enables the FEXTRA + * flag in the FLG field of the gzip header. It then appends an extra + * header field: '8', '0', 'L', 'X', '0', '0', '0', '0'. The first two + * bytes are the length of the field and the last 6 bytes the Internet + * Archive header. To learn about GZIP format, see RFC1952. To learn + * about the Internet Archive extra header field, read the source for + * av_ziparc which can be found at + * alexa/vista/alexa-tools-1.2/src/av_ziparc.cc. + * + *

We do things in this roundabout manner because the java + * GZIPOutputStream does not give access to GZIP header fields. + * + * @param date Date to put into the ARC metadata; if 17-digit will be + * truncated to traditional 14-digits + * + * @return Byte array filled w/ the arc header. + * @throws IOException + */ + private byte [] generateARCFileMetaData(String date) + throws IOException { + if(date!=null && date.length()>14) { + date = date.substring(0,14); + } + int metadataBodyLength = getMetadataLength(); + // If metadata body, then the minor part of the version is '1' rather + // than '0'. + String metadataHeaderLinesTwoAndThree = + getMetadataHeaderLinesTwoAndThree("1 " + + ((metadataBodyLength > 0)? "1": "0")); + int recordLength = metadataBodyLength + + metadataHeaderLinesTwoAndThree.getBytes(DEFAULT_ENCODING).length; + String metadataHeaderStr = ARC_MAGIC_NUMBER + getBaseFilename() + + " 0.0.0.0 " + date + " text/plain " + recordLength + + metadataHeaderLinesTwoAndThree; + ByteArrayOutputStream metabaos = + new ByteArrayOutputStream(recordLength); + // Write the metadata header. + metabaos.write(metadataHeaderStr.getBytes(DEFAULT_ENCODING)); + // Write the metadata body, if anything to write. + if (metadataBodyLength > 0) { + writeMetaData(metabaos); + } + + // Write out a LINE_SEPARATORs to end this record. + metabaos.write(LINE_SEPARATOR); + + // Now get bytes of all just written and compress if flag set. + byte [] bytes = metabaos.toByteArray(); + + if(isCompressed()) { + // GZIP the header but catch the gzipping into a byte array so we + // can add the special IA GZIP header to the product. After + // manipulations, write to the output stream (The JAVA GZIP + // implementation does not give access to GZIP header. It + // produces a 'default' header only). We can get away w/ these + // maniupulations because the GZIP 'default' header doesn't + // do the 'optional' CRC'ing of the header. 
+ byte [] gzippedMetaData = ArchiveUtils.gzip(bytes); + if (gzippedMetaData[3] != 0) { + throw new IOException("The GZIP FLG header is unexpectedly " + + " non-zero. Need to add smarter code that can deal " + + " when already extant extra GZIP header fields."); + } + // Set the GZIP FLG header to '4' which says that the GZIP header + // has extra fields. Then insert the alex {'L', 'X', '0', '0', '0, + // '0'} 'extra' field. The IA GZIP header will also set byte + // 9 (zero-based), the OS byte, to 3 (Unix). We'll do the same. + gzippedMetaData[3] = 4; + gzippedMetaData[9] = 3; + byte [] assemblyBuffer = new byte[gzippedMetaData.length + + ARC_GZIP_EXTRA_FIELD.length]; + // '10' in the below is a pointer past the following bytes of the + // GZIP header: ID1 ID2 CM FLG + MTIME(4-bytes) XFL OS. See + // RFC1952 for explaination of the abbreviations just used. + System.arraycopy(gzippedMetaData, 0, assemblyBuffer, 0, 10); + System.arraycopy(ARC_GZIP_EXTRA_FIELD, 0, assemblyBuffer, 10, + ARC_GZIP_EXTRA_FIELD.length); + System.arraycopy(gzippedMetaData, 10, assemblyBuffer, + 10 + ARC_GZIP_EXTRA_FIELD.length, gzippedMetaData.length - 10); + bytes = assemblyBuffer; + } + return bytes; + } + + public String getMetadataHeaderLinesTwoAndThree(String version) { + StringBuffer buffer = new StringBuffer(); + buffer.append(LINE_SEPARATOR); + buffer.append(version); + buffer.append(" InternetArchive"); + buffer.append(LINE_SEPARATOR); + buffer.append("URL IP-address Archive-date Content-type Archive-length"); + buffer.append(LINE_SEPARATOR); + return buffer.toString(); + } + + /** + * Write all metadata to passed baos. + * + * @param baos Byte array to write to. 
+ * @throws UnsupportedEncodingException + * @throws IOException + */ + private void writeMetaData(ByteArrayOutputStream baos) + throws UnsupportedEncodingException, IOException { + if (settings.getMetadata() == null) { + return; + } + + for (Iterator i = settings.getMetadata().iterator(); + i.hasNext();) { + Object obj = i.next(); + if (obj instanceof String) { + baos.write(((String)obj).getBytes(DEFAULT_ENCODING)); + } else if (obj instanceof File) { + InputStream is = null; + try { + is = new BufferedInputStream( + new FileInputStream((File)obj)); + byte [] buffer = new byte[4096]; + for (int read = -1; (read = is.read(buffer)) != -1;) { + baos.write(buffer, 0, read); + } + } finally { + if (is != null) { + is.close(); + } + } + } else if (obj != null) { + logger.severe("Unsupported metadata type: " + obj); + } + } + return; + } + + /** + * @return Total length of metadata. + * @throws UnsupportedEncodingException + */ + private int getMetadataLength() + throws UnsupportedEncodingException { + int result = -1; + if (settings.getMetadata() == null) { + result = 0; + } else { + for (Iterator i = settings.getMetadata().iterator(); + i.hasNext();) { + Object obj = i.next(); + if (obj instanceof String) { + result += ((String)obj).getBytes(DEFAULT_ENCODING).length; + } else if (obj instanceof File) { + result += ((File)obj).length(); + } else { + logger.severe("Unsupported metadata type: " + obj); + } + } + } + return result; + } + + /** + * @deprecated use input-stream version directly instead + */ + public void write(String uri, String contentType, String hostIP, + long fetchBeginTimeStamp, long recordLength, + ByteArrayOutputStream baos) + throws IOException { + write(uri, contentType, hostIP, fetchBeginTimeStamp, recordLength, + new ByteArrayInputStream(baos.toByteArray()), false); + } + + public void write(String uri, String contentType, String hostIP, + long fetchBeginTimeStamp, long recordLength, InputStream in) + throws IOException { + 
write(uri,contentType,hostIP,fetchBeginTimeStamp,recordLength,in,true); + } + + /** + * Write a record with the given metadata/content. + * + * @param uri + * URI for metadata-line + * @param contentType + * MIME content-type for metadata-line + * @param hostIP + * IP for metadata-line + * @param fetchBeginTimeStamp + * timestamp for metadata-line + * @param recordLength + * length for metadata-line; also may be enforced + * @param in + * source InputStream for record content + * @param enforceLength + * whether to enforce the declared length; should be true + * unless intentionally writing bad records for testing + * @throws IOException + */ + public void write(String uri, String contentType, String hostIP, + long fetchBeginTimeStamp, long recordLength, InputStream in, + boolean enforceLength) throws IOException { + preWriteRecordTasks(); + try { + write(getMetaLine(uri, contentType, hostIP, fetchBeginTimeStamp, + recordLength).getBytes(UTF8)); + copyFrom(in, recordLength, enforceLength); + if (in instanceof ReplayInputStream) { + // check for consumption of entire recorded material + long remaining = ((ReplayInputStream) in).remaining(); + // Should be zero at this stage. If not, something is + // wrong. + if (remaining != 0) { + String message = "Gap between expected and actual: " + + remaining + LINE_SEPARATOR + DevUtils.extraInfo() + + " writing arc " + + this.getFile().getAbsolutePath(); + DevUtils.warnHandle(new Throwable(message), message); + throw new IOException(message); + } + } + write(LINE_SEPARATOR); + } finally { + postWriteRecordTasks(); + } + } + + /** + * @param uri + * @param contentType + * @param hostIP + * @param fetchBeginTimeStamp + * @param recordLength + * @return Metadata line for an ARCRecord made of passed components. 
+ * @exception IOException + */ + protected String getMetaLine(String uri, String contentType, String hostIP, + long fetchBeginTimeStamp, long recordLength) + throws IOException { + if (fetchBeginTimeStamp <= 0) { + throw new IOException("Bogus fetchBeginTimestamp: " + + Long.toString(fetchBeginTimeStamp)); + } + + return validateMetaLine(createMetaline(uri, hostIP, + ArchiveUtils.get14DigitDate(fetchBeginTimeStamp), + MimetypeUtils.truncate(contentType), + Long.toString(recordLength))); + } + + public String createMetaline(String uri, String hostIP, + String timeStamp, String mimetype, String recordLength) { + return uri + HEADER_FIELD_SEPARATOR + hostIP + + HEADER_FIELD_SEPARATOR + timeStamp + + HEADER_FIELD_SEPARATOR + mimetype + + HEADER_FIELD_SEPARATOR + recordLength + LINE_SEPARATOR; + } + + /** + * Test that the metadata line is valid before writing. + * @param metaLineStr + * @throws IOException + * @return The passed in metaline. + */ + protected String validateMetaLine(String metaLineStr) + throws IOException { + if (metaLineStr.length() > MAX_METADATA_LINE_LENGTH) { + throw new IOException("Metadata line too long (" + + metaLineStr.length() + ">" + MAX_METADATA_LINE_LENGTH + + "): " + metaLineStr); + } + Matcher m = METADATA_LINE_PATTERN.matcher(metaLineStr); + if (!m.matches()) { + throw new IOException("Metadata line doesn't match expected" + + " pattern: " + metaLineStr); + } + return metaLineStr; + } +} diff --git a/src/main/java/org/archive/io/arc/ARCWriterPool.java b/src/main/java/org/archive/io/arc/ARCWriterPool.java new file mode 100644 index 00000000..b55b3ed4 --- /dev/null +++ b/src/main/java/org/archive/io/arc/ARCWriterPool.java @@ -0,0 +1,69 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. 
+ * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io.arc; + +import java.util.concurrent.atomic.AtomicInteger; + +import org.archive.io.WriterPool; +import org.archive.io.WriterPoolMember; +import org.archive.io.WriterPoolSettings; + + +/** + * A pool of ARCWriters. + * + * @author stack + */ +public class ARCWriterPool extends WriterPool { + /** + * Constructor + * + * @param settings Settings for this pool. + * @param poolMaximumActive + * @param poolMaximumWait + */ + public ARCWriterPool(final WriterPoolSettings settings, + final int poolMaximumActive, final int poolMaximumWait) { + this(new AtomicInteger(), settings, poolMaximumActive, poolMaximumWait); + } + + /** + * Constructor + * + * @param serial Used to generate unique filename sequences + * @param settings Settings for this pool. 
+ * @param poolMaximumActive + * @param poolMaximumWait + */ + public ARCWriterPool(final AtomicInteger serial, + final WriterPoolSettings settings, + final int poolMaximumActive, final int poolMaximumWait) { + super(serial, settings, poolMaximumActive, poolMaximumWait); + } + + /* (non-Javadoc) + * @see org.archive.io.WriterPool#makeWriter() + */ + protected WriterPoolMember makeWriter() { + return new ARCWriter(serialNo, settings); + } + + + +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/arc/WriterPoolSettingsData.java b/src/main/java/org/archive/io/arc/WriterPoolSettingsData.java new file mode 100644 index 00000000..7396f2d8 --- /dev/null +++ b/src/main/java/org/archive/io/arc/WriterPoolSettingsData.java @@ -0,0 +1,80 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.io.arc; + +import java.io.File; +import java.util.List; + +import org.archive.io.WriterPoolSettings; + +public class WriterPoolSettingsData implements WriterPoolSettings { + protected long maxFileSizeBytes; + protected String prefix; + protected String template; + protected List outputDirs; + protected boolean compress; + protected List metadata; + protected boolean frequentFlushes = true; + protected int writeBufferSize = 16*1024; + + public WriterPoolSettingsData(String prefix, String template, + long maxFileSizeBytes, boolean compress, List outputDirs, + List metadata) { + super(); + this.maxFileSizeBytes = maxFileSizeBytes; + this.prefix = prefix; + this.template = template; + this.outputDirs = outputDirs; + this.compress = compress; + this.metadata = metadata; + } + + @Override + public boolean getCompress() { + return compress; + } + @Override + public long getMaxFileSizeBytes() { + return maxFileSizeBytes; + } + @Override + public List getMetadata() { + return metadata; + } + @Override + public List calcOutputDirs() { + return outputDirs; + } + @Override + public String getPrefix() { + return prefix; + } + @Override + public String getTemplate() { + return template; + } + @Override + public boolean getFrequentFlushes() { + return frequentFlushes; + } + @Override + public int getWriteBufferSize() { + return writeBufferSize; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/package.html b/src/main/java/org/archive/io/package.html new file mode 100644 index 00000000..d1798b80 --- /dev/null +++ b/src/main/java/org/archive/io/package.html @@ -0,0 +1,9 @@ + + + +org.archive.io.arc package + + +ARC file reading and writing. 
+ + diff --git a/src/main/java/org/archive/io/warc/WARCConstants.java b/src/main/java/org/archive/io/warc/WARCConstants.java new file mode 100644 index 00000000..83cc8a6d --- /dev/null +++ b/src/main/java/org/archive/io/warc/WARCConstants.java @@ -0,0 +1,24 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.warc; + +@Deprecated +public interface WARCConstants extends org.archive.format.warc.WARCConstants { +} diff --git a/src/main/java/org/archive/io/warc/WARCReader.java b/src/main/java/org/archive/io/warc/WARCReader.java new file mode 100644 index 00000000..a34854ef --- /dev/null +++ b/src/main/java/org/archive/io/warc/WARCReader.java @@ -0,0 +1,287 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.warc; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.Iterator; +import java.util.List; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.cli.PosixParser; +import org.apache.commons.lang.NotImplementedException; +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveRecord; + +/** + * WARCReader. + * Go via {@link WARCReaderFactory} to get instance. + * @author stack + * @version $Date: 2006-11-27 18:03:03 -0800 (Mon, 27 Nov 2006) $ $Version$ + */ +public class WARCReader extends ArchiveReader implements WARCConstants { + protected WARCReader() { + super(); + } + + @Override + protected void initialize(String i) { + super.initialize(i); + setVersion(WARC_VERSION); + } + + /** + * Skip over any trailing new lines at end of the record so we're lined up + * ready to read the next. + * @param record + * @throws IOException + */ + protected void gotoEOR(ArchiveRecord record) throws IOException { + if (record.available() != 0) { + throw new IOException("Record should be exhausted before coming " + + "in here"); + } + + // Records end in 2*CRLF. Suck it up. 
+ readExpectedChar(getIn(), CRLF.charAt(0)); + readExpectedChar(getIn(), CRLF.charAt(1)); + readExpectedChar(getIn(), CRLF.charAt(0)); + readExpectedChar(getIn(), CRLF.charAt(1)); + } + + protected void readExpectedChar(final InputStream is, final int expected) + throws IOException { + int c = is.read(); + if (c != expected) { + throw new IOException("Unexpected character " + + Integer.toHexString(c) + "(Expecting " + + Integer.toHexString(expected) + ")"); + } + } + + /** + * Create new WARC record. + * Encapsulate housekeeping that has to do w/ creating new Record. + * @param is InputStream to use. + * @param offset Absolute offset into WARC file. + * @return A WARCRecord. + * @throws IOException + */ + protected WARCRecord createArchiveRecord(InputStream is, long offset) + throws IOException { + return (WARCRecord)currentRecord(new WARCRecord(is, + getReaderIdentifier(), offset, isDigest(), isStrict())); + } + + @Override + public void dump(boolean compress) + throws IOException, java.text.ParseException { + for (final Iterator i = iterator(); i.hasNext();) { + ArchiveRecord r = i.next(); + System.out.println(r.getHeader().toString()); + r.dump(); + System.out.println(); + } + } + + + @Override + public ArchiveReader getDeleteFileOnCloseReader(final File f) { + throw new NotImplementedException("TODO"); + } + + @Override + public String getDotFileExtension() { + return DOT_WARC_FILE_EXTENSION; + } + + @Override + public String getFileExtension() { + return WARC_FILE_EXTENSION; + } + + // Static methods follow. Mostly for command-line processing. + + /** + * + * @param formatter Help formatter instance. + * @param options Usage options. + * @param exitCode Exit code. 
+ */ + private static void usage(HelpFormatter formatter, Options options, + int exitCode) { + formatter.printHelp("java org.archive.io.arc.WARCReader" + + " [--digest=true|false] \\\n" + + " [--format=cdx|cdxfile|dump|gzipdump]" + + " [--offset=#] \\\n[--strict] [--parse] WARC_FILE|WARC_URL", + options); + System.exit(exitCode); + } + + /** + * Write out the arcfile. + * + * @param reader + * @param format Format to use outputting. + * @throws IOException + * @throws java.text.ParseException + */ + protected static void output(WARCReader reader, String format) + throws IOException, java.text.ParseException { + if (!reader.output(format)) { + throw new IOException("Unsupported format: " + format); + } + } + + /** + * Generate a CDX index file for an ARC file. + * + * @param urlOrPath The ARC file to generate a CDX index for + * @throws IOException + * @throws java.text.ParseException + */ + public static void createCDXIndexFile(String urlOrPath) + throws IOException, java.text.ParseException { + WARCReader r = WARCReaderFactory.get(urlOrPath); + r.setStrict(false); + r.setDigest(true); + output(r, CDX_FILE); + } + + /** + * Command-line interface to WARCReader. + * + * Here is the command-line interface: + *

+     * usage: java org.archive.io.warc.WARCReader [--offset=#] WARC_FILE
+     *  -h,--help      Prints this message and exits.
+     *  -o,--offset    Outputs record at this offset into the WARC file.
+ * + *

Outputs using a pseudo-CDX format as described here: + * CDX + * Legend and here + * Example. + * Legend used below is: 'CDX b e a m s c V (or v if uncompressed) n g'. + * Hash is hard-coded straight SHA-1 hash of content. + * + * @param args Command-line arguments. + * @throws ParseException Failed parse of the command line. + * @throws IOException + * @throws java.text.ParseException + */ + public static void main(String [] args) + throws ParseException, IOException, java.text.ParseException { + Options options = getOptions(); + PosixParser parser = new PosixParser(); + CommandLine cmdline = parser.parse(options, args, false); + @SuppressWarnings("unchecked") + List cmdlineArgs = cmdline.getArgList(); + Option [] cmdlineOptions = cmdline.getOptions(); + HelpFormatter formatter = new HelpFormatter(); + + // If no args, print help. + if (cmdlineArgs.size() <= 0) { + usage(formatter, options, 0); + } + + // Now look at options passed. + long offset = -1; + boolean digest = false; + boolean strict = false; + String format = CDX; + for (int i = 0; i < cmdlineOptions.length; i++) { + switch(cmdlineOptions[i].getId()) { + case 'h': + usage(formatter, options, 0); + break; + + case 'o': + offset = + Long.parseLong(cmdlineOptions[i].getValue()); + break; + + case 's': + strict = true; + break; + + case 'd': + digest = getTrueOrFalse(cmdlineOptions[i].getValue()); + break; + + case 'f': + format = cmdlineOptions[i].getValue().toLowerCase(); + boolean match = false; + // List of supported formats. 
+ final String [] supportedFormats = + {CDX, DUMP, GZIP_DUMP, CDX_FILE}; + for (int ii = 0; ii < supportedFormats.length; ii++) { + if (supportedFormats[ii].equals(format)) { + match = true; + break; + } + } + if (!match) { + usage(formatter, options, 1); + } + break; + + default: + throw new RuntimeException("Unexpected option: " + + + cmdlineOptions[i].getId()); + } + } + + if (offset >= 0) { + if (cmdlineArgs.size() != 1) { + System.out.println("Error: Pass one arcfile only."); + usage(formatter, options, 1); + } + WARCReader r = WARCReaderFactory.get( + new File((String)cmdlineArgs.get(0)), offset); + r.setStrict(strict); + outputRecord(r, format); + } else { + for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) { + String urlOrPath = (String)i.next(); + try { + WARCReader r = WARCReaderFactory.get(urlOrPath); + r.setStrict(strict); + r.setDigest(digest); + output(r, format); + } catch (RuntimeException e) { + // Write out name of file we failed on to help with + // debugging. Then print stack trace and try to keep + // going. We do this for case where we're being fed + // a bunch of ARCs; just note the bad one and move + // on to the next. + System.err.println("Exception processing " + urlOrPath + + ": " + e.getMessage()); + e.printStackTrace(System.err); + System.exit(1); + } + } + } + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/warc/WARCReaderFactory.java b/src/main/java/org/archive/io/warc/WARCReaderFactory.java new file mode 100644 index 00000000..9c6c7e77 --- /dev/null +++ b/src/main/java/org/archive/io/warc/WARCReaderFactory.java @@ -0,0 +1,307 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.warc; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Iterator; + +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveReaderFactory; +import org.archive.io.ArchiveRecord; +import org.archive.io.warc.WARCConstants; +import org.archive.util.ArchiveUtils; +import org.archive.util.FileUtils; +import org.archive.util.zip.GZIPMembersInputStream; + +import com.google.common.io.CountingInputStream; + +/** + * Factory for WARC Readers. + * Figures whether to give out a compressed file Reader or an uncompressed + * Reader. + * @author stack + * @version $Date: 2006-08-23 17:59:04 -0700 (Wed, 23 Aug 2006) $ $Version$ + */ +public class WARCReaderFactory extends ArchiveReaderFactory +implements WARCConstants { + private static final WARCReaderFactory factory = new WARCReaderFactory(); + + /** + * Shutdown any access to default constructor. + * This factory is Singleton. + */ + private WARCReaderFactory() { + super(); + } + + public static WARCReader get(String arcFileOrUrl) + throws MalformedURLException, IOException { + return (WARCReader)WARCReaderFactory.factory. + getArchiveReader(arcFileOrUrl); + } + + public static WARCReader get(final File f) throws IOException { + return (WARCReader)WARCReaderFactory.factory.getArchiveReader(f); + } + + /** + * @param f An arcfile to read. + * @param offset Have returned Reader set to start reading at this offset. 
+ * @return A WARCReader. + * @throws IOException + */ + public static WARCReader get(final File f, final long offset) + throws IOException { + return (WARCReader)WARCReaderFactory.factory. + getArchiveReader(f, offset); + } + + protected ArchiveReader getArchiveReader(final File f, final long offset) + throws IOException { + boolean compressed = testCompressedWARCFile(f); + if (!compressed) { + if (!FileUtils.isReadableWithExtensionAndMagic(f, + DOT_WARC_FILE_EXTENSION, WARC_MAGIC)) { + throw new IOException(f.getAbsolutePath() + + " is not a WARC file."); + } + } + return (WARCReader)(compressed? + WARCReaderFactory.factory.new CompressedWARCReader(f, offset): + WARCReaderFactory.factory.new UncompressedWARCReader(f, offset)); + } + + public static ArchiveReader get(final String s, final InputStream is, + final boolean atFirstRecord) + throws IOException { + return WARCReaderFactory.factory.getArchiveReader(s, is, + atFirstRecord); + } + + protected ArchiveReader getArchiveReader(final String f, + final InputStream is, final boolean atFirstRecord) + throws IOException { + // For now, assume stream is compressed. Later add test of input + // stream or handle exception thrown when figure not compressed stream. + return new CompressedWARCReader(f, is, atFirstRecord); + } + + public static WARCReader get(final URL arcUrl, final long offset) + throws IOException { + return (WARCReader)WARCReaderFactory.factory.getArchiveReader(arcUrl, + offset); + } + + /** + * Get an ARCReader. + * Pulls the ARC local into whereever the System Property + * java.io.tmpdir points. It then hands back an ARCReader that + * points at this local copy. A close on this ARCReader instance will + * remove the local copy. + * @param arcUrl An URL that points at an ARC. + * @return An ARCReader. 
+ * @throws IOException + */ + public static WARCReader get(final URL arcUrl) + throws IOException { + return (WARCReader)WARCReaderFactory.factory.getArchiveReader(arcUrl); + } + + /** + * Check file is compressed WARC. + * + * @param f File to test. + * + * @return True if this is compressed WARC (TODO: Just tests if file is + * GZIP'd file (It begins w/ GZIP MAGIC)). + * + * @exception IOException If file does not exist or is not unreadable. + */ + public static boolean testCompressedWARCFile(final File f) + throws IOException { + FileUtils.assertReadable(f); + boolean compressed = false; + final InputStream is = new FileInputStream(f); + try { + compressed = ArchiveUtils.isGzipped(is); + } finally { + is.close(); + } + return compressed; + } + + /** + * Uncompressed WARC file reader. + * @author stack + */ + public class UncompressedWARCReader extends WARCReader { + /** + * Constructor. + * @param f Uncompressed arcfile to read. + * @throws IOException + */ + public UncompressedWARCReader(final File f) + throws IOException { + this(f, 0); + } + + /** + * Constructor. + * + * @param f Uncompressed file to read. + * @param offset Offset at which to position Reader. + * @throws IOException + */ + public UncompressedWARCReader(final File f, final long offset) + throws IOException { + // File has been tested for existence by time it has come to here. + setIn(new CountingInputStream(getInputStream(f, offset))); + getIn().skip(offset); + initialize(f.getAbsolutePath()); + } + + /** + * Constructor. + * + * @param f Uncompressed file to read. + * @param is InputStream. + */ + public UncompressedWARCReader(final String f, final InputStream is) { + // Arc file has been tested for existence by time it has come + // to here. + setIn(new CountingInputStream(is)); + initialize(f); + } + } + + /** + * Compressed WARC file reader. + * + * @author stack + */ + public class CompressedWARCReader extends WARCReader { + /** + * Constructor. + * + * @param f Compressed file to read. 
+ * @throws IOException + */ + public CompressedWARCReader(final File f) throws IOException { + this(f, 0); + } + + /** + * Constructor. + * + * @param f Compressed arcfile to read. + * @param offset Position at where to start reading file. + * @throws IOException + */ + public CompressedWARCReader(final File f, final long offset) + throws IOException { + // File has been tested for existence by time it has come to here. + setIn(new GZIPMembersInputStream(getInputStream(f, offset))); + ((GZIPMembersInputStream)getIn()).compressedSeek(offset); + setCompressed((offset == 0)); // TODO: does this make sense?!?! + initialize(f.getAbsolutePath()); + } + + /** + * Constructor. + * + * @param f Compressed arcfile. + * @param is InputStream to use. + * @param atFirstRecord + * @throws IOException + */ + public CompressedWARCReader(final String f, final InputStream is, + final boolean atFirstRecord) + throws IOException { + // Arc file has been tested for existence by time it has come + // to here. + setIn(new GZIPMembersInputStream(is)); + setCompressed(true); + initialize(f); + // TODO: Ignore atFirstRecord. Probably doesn't apply in WARC world. + } + + /** + * Get record at passed offset. + * + * @param offset Byte index into file at which a record starts. + * @return A WARCRecord reference. + * @throws IOException + */ + public WARCRecord get(long offset) throws IOException { + cleanupCurrentRecord(); + ((GZIPMembersInputStream)getIn()).compressedSeek(offset); + return (WARCRecord) createArchiveRecord(getIn(), offset); + } + + public Iterator iterator() { + /** + * Override ArchiveRecordIterator so can base returned iterator on + * GzippedInputStream iterator. 
+ */ + return new ArchiveRecordIterator() { + private GZIPMembersInputStream gis = + (GZIPMembersInputStream)getIn(); + + private Iterator gzipIterator = this.gis.memberIterator(); + + protected boolean innerHasNext() { + return this.gzipIterator.hasNext(); + } + + protected ArchiveRecord innerNext() throws IOException { + // Get the position before gzipIterator.next moves + // it on past the gzip header. + InputStream is = (InputStream) this.gzipIterator.next(); + return createArchiveRecord(is, Math.max(gis.getCurrentMemberStart(), gis.getCurrentMemberEnd())); + } + }; + } + + protected void gotoEOR(ArchiveRecord rec) throws IOException { + long skipped = 0; + while (getIn().read()>-1) { + skipped++; + } + if(skipped>4) { + System.err.println("unexpected extra data after record "+rec); + } + return; + } + } + + public static boolean isWARCSuffix(final String f) { + return (f == null)? + false: + (f.toLowerCase().endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION))? + true: + (f.toLowerCase().endsWith(DOT_WARC_FILE_EXTENSION))? + true: false; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/warc/WARCRecord.java b/src/main/java/org/archive/io/warc/WARCRecord.java new file mode 100644 index 00000000..635d1c3b --- /dev/null +++ b/src/main/java/org/archive/io/warc/WARCRecord.java @@ -0,0 +1,233 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.warc; + +import it.unimi.dsi.fastutil.io.RepositionableStream; + +import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpParser; +import org.archive.io.ArchiveRecord; +import org.archive.io.ArchiveRecordHeader; +import org.archive.util.LaxHttpParser; + + +/** + * A WARC file Record. + * + * @author stack + */ +public class WARCRecord extends ArchiveRecord implements WARCConstants { + private Pattern WHITESPACE = Pattern.compile("\\s"); + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the record this instance + * is to represent. + * @throws IOException + */ + public WARCRecord(InputStream in, final String identifier, + final long offset) + throws IOException { + this(in, identifier, offset, true, false); + } + + /** + * Constructor. + * @param in Stream cue'd up just past Header Line and Named Fields. + * @param headers Header Line and ANVL Named fields. + * @throws IOException + */ + public WARCRecord(InputStream in, ArchiveRecordHeader headers) + throws IOException { + super(in, headers, 0, true, false); + } + + /** + * Constructor. + * + * @param in Stream cue'd up to be at the start of the record this instance + * is to represent or, if headers is not null, just past the + * Header Line and Named Fields. + * @param identifier Identifier for this the hosting Reader. + * @param offset Current offset into in (Used to keep + * position properly aligned). Usually 0. + * @param digest True if we're to calculate digest for this record. Not + * digesting saves about ~15% of cpu during parse. + * @param strict Be strict parsing (Parsing stops if file inproperly + * formatted). 
+ * @throws IOException + */ + public WARCRecord(final InputStream in, final String identifier, + final long offset, boolean digest, boolean strict) + throws IOException { + super(in, null, 0, digest, strict); + setHeader(parseHeaders(in, identifier, offset, strict)); + } + + /** + * Parse WARC Header Line and Named Fields. + * @param in Stream to read. + * @param identifier Identifier for the hosting Reader. + * @param offset Absolute offset into Reader. + * @param strict Whether to be loose parsing or not. + * @return An ArchiveRecordHeader. + * @throws IOException + */ + protected ArchiveRecordHeader parseHeaders(final InputStream in, + final String identifier, final long offset, final boolean strict) + throws IOException { + final Map m = new HashMap(); + m.put(ABSOLUTE_OFFSET_KEY, new Long(offset)); + m.put(READER_IDENTIFIER_FIELD_KEY, identifier); + + long startPosition = -1; + if (in instanceof RepositionableStream) { + startPosition = ((RepositionableStream)in).position(); + } + String firstLine = + new String(LaxHttpParser.readLine(in, WARC_HEADER_ENCODING)); + if (firstLine == null || firstLine.length() <=0) { + throw new IOException("Failed to read WARC_MAGIC"); + } + if (!firstLine.startsWith(WARC_MAGIC)) { + throw new IOException("Failed to find WARC MAGIC: " + firstLine); + } + // Here we start reading off the inputstream but we're reading the + // stream direct rather than going via WARCRecord#read. The latter will + // keep count of bytes read, digest and fail properly if EOR too soon... + // We don't want digesting while reading Headers. 
+ // + Header [] h = LaxHttpParser.parseHeaders(in, WARC_HEADER_ENCODING); + for (int i = 0; i < h.length; i++) { + m.put(h[i].getName(), h[i].getValue()); + } + int headerLength = -1; + if (in instanceof RepositionableStream) { + headerLength = + (int)(((RepositionableStream)in).position() - startPosition); + } + final int contentOffset = headerLength; + incrementPosition(contentOffset); + + return new ArchiveRecordHeader() { + private Map headers = m; + private int contentBegin = contentOffset; + + public String getDate() { + return (String)this.headers.get(HEADER_KEY_DATE); + } + + public String getDigest() { + return null; + // TODO: perhaps return block-digest? + // superclass def implies this is calculated ("only after + // read in totality"), not pulled from header, so + // below prior implementation was misleading +// return (String)this.headers.get(HEADER_KEY_CHECKSUM); + } + + public String getReaderIdentifier() { + return (String)this.headers.get(READER_IDENTIFIER_FIELD_KEY); + } + + public Set getHeaderFieldKeys() { + return this.headers.keySet(); + } + + public Map getHeaderFields() { + return this.headers; + } + + public Object getHeaderValue(String key) { + return this.headers.get(key); + } + + // Returns just the Content-Length of the warc record + public long getContentLength() { + Object o = this.headers.get(CONTENT_LENGTH); + if (o == null) { + return -1; + } + long contentLength = (o instanceof Long)? + ((Long)o).longValue(): Long.parseLong((String)o); + return contentLength; + } + + // Returns the full record length + public long getLength() + { + return getContentLength() + contentOffset; + } + + public String getMimetype() { + return (String)this.headers.get(CONTENT_TYPE); + } + + public long getOffset() { + Object o = this.headers.get(ABSOLUTE_OFFSET_KEY); + if (o == null) { + return -1; + } + return (o instanceof Long)? 
+ ((Long)o).longValue(): Long.parseLong((String)o); + } + + public String getRecordIdentifier() { + return (String)this.headers.get(RECORD_IDENTIFIER_FIELD_KEY); + } + + public String getUrl() { + return (String)this.headers.get(HEADER_KEY_URI); + } + + public String getVersion() { + return (String)this.headers.get(VERSION_FIELD_KEY); + } + + public int getContentBegin() { + return this.contentBegin; + } + + @Override + public String toString() { + return this.headers.toString(); + } + }; + } + + @Override + protected String getMimetype4Cdx(ArchiveRecordHeader h) { + final String m = super.getMimetype4Cdx(h); + // Mimetypes can have spaces in WARCs. Emitting for CDX, just + // squash them for now. Later, quote them since squashing spaces won't + // work for params that have quoted-string values. + Matcher matcher = WHITESPACE.matcher(m); + return matcher.replaceAll(""); + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/warc/WARCRecordInfo.java b/src/main/java/org/archive/io/warc/WARCRecordInfo.java new file mode 100644 index 00000000..a6198c44 --- /dev/null +++ b/src/main/java/org/archive/io/warc/WARCRecordInfo.java @@ -0,0 +1,139 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.io.warc; + +import java.io.InputStream; +import java.net.URI; + +import org.archive.format.warc.WARCConstants.WARCRecordType; +import org.archive.util.anvl.ANVLRecord; + +public class WARCRecordInfo { + + protected WARCRecordType type; + protected String url; + protected String create14DigitDate; + protected String mimetype; + protected URI recordId; + protected ANVLRecord extraHeaders; + protected InputStream contentStream; + protected long contentLength; + protected boolean enforceLength; + protected String warcFilename; + protected Long warcFileOffset; + + public void setType(WARCRecordType type) { + this.type = type; + } + + public void setUrl(String url) { + this.url = url; + } + + public String getCreate14DigitDate() { + return create14DigitDate; + } + + public void setCreate14DigitDate(String create14DigitDate) { + this.create14DigitDate = create14DigitDate; + } + + public String getMimetype() { + return mimetype; + } + + public void setMimetype(String mimetype) { + this.mimetype = mimetype; + } + + public URI getRecordId() { + return recordId; + } + + public void setRecordId(URI recordId) { + this.recordId = recordId; + } + + public ANVLRecord getExtraHeaders() { + return extraHeaders; + } + + public void setExtraHeaders(ANVLRecord extraHeaders) { + this.extraHeaders = extraHeaders; + } + + public InputStream getContentStream() { + return contentStream; + } + + public void setContentStream(InputStream contentStream) { + this.contentStream = contentStream; + } + + public long getContentLength() { + return contentLength; + } + + public void setContentLength(long contentLength) { + this.contentLength = contentLength; + } + + public boolean isEnforceLength() { + return enforceLength; + } + + public boolean getEnforceLength() { + return enforceLength; + } + + public void setEnforceLength(boolean enforceLength) { + this.enforceLength = enforceLength; + } + + public WARCRecordType getType() { + return type; + } + + public String getUrl() 
{ + return url; + } + + public void addExtraHeader(String label, String value) { + if (extraHeaders == null) { + extraHeaders = new ANVLRecord(); + } + extraHeaders.addLabelValue(label, value); + } + + public void setWARCFilename(String warcFilenameWithoutOccupiedSuffix) { + this.warcFilename = warcFilenameWithoutOccupiedSuffix; + } + + public String getWARCFilename() { + return warcFilename; + } + + public void setWARCFileOffset(Long startPosition) { + this.warcFileOffset = startPosition; + } + + public Long getWARCFileOffset() { + return warcFileOffset; + } +} diff --git a/src/main/java/org/archive/io/warc/WARCWriter.java b/src/main/java/org/archive/io/warc/WARCWriter.java new file mode 100644 index 00000000..b9558263 --- /dev/null +++ b/src/main/java/org/archive/io/warc/WARCWriter.java @@ -0,0 +1,436 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.io.warc; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.net.URI; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.Map; +import java.util.Map.Entry; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.lang.StringUtils; +import org.archive.io.ArchiveFileConstants; +import org.archive.io.UTF8Bytes; +import org.archive.io.WriterPoolMember; +import org.archive.util.ArchiveUtils; +import org.archive.util.anvl.Element; + + +/** + * WARC implementation. + * + *

The caller is assumed to manage access to this + * WARCWriter, ensuring that only one thread accesses this WARC instance + * at any one time. + * + *

While being written, WARCs have a '.open' suffix appended. + * + * @contributor stack + * @version $Revision: 4604 $ $Date: 2006-09-05 22:38:18 -0700 (Tue, 05 Sep 2006) $ + */ +public class WARCWriter extends WriterPoolMember +implements WARCConstants { + public static final String TOTALS = "totals"; + public static final String SIZE_ON_DISK = "sizeOnDisk"; + public static final String TOTAL_BYTES = "totalBytes"; + public static final String CONTENT_BYTES = "contentBytes"; + public static final String NUM_RECORDS = "numRecords"; + + private static final Logger logger = + Logger.getLogger(WARCWriter.class.getName()); + + /** + * NEWLINE as bytes. + */ + public static byte [] CRLF_BYTES; + static { + try { + CRLF_BYTES = CRLF.getBytes(DEFAULT_ENCODING); + } catch(Exception e) { + e.printStackTrace(); + } + }; + + /** + * Temporarily accumulates stats managed externally by + * {@link WARCWriterProcessor}. WARCWriterProcessor will call + * {@link #resetTmpStats()}, write some records, then add + * {@link #getTmpStats()} into its long-term running totals. + */ + private Map> tmpStats; + + /** Temporarily accumulates info on written warc records for use externally. */ + private LinkedList tmpRecordLog = new LinkedList(); + + /** + * Constructor. + * Takes a stream. Use with caution. There is no upperbound check on size. + * Will just keep writing. Only pass Streams that are bounded. + * @param serialNo used to generate unique file name sequences + * @param out Where to write. + * @param f File the out is connected to. + * @param cmprs Compress the content written. + * @param a14DigitDate If null, we'll write current time. + * @throws IOException + */ + public WARCWriter(final AtomicInteger serialNo, + final OutputStream out, final File f, + final WARCWriterPoolSettings settings) + throws IOException { + super(serialNo, out, f, settings); + } + + /** + * Constructor. + * + * @param dirs Where to drop files. + * @param prefix File prefix to use. 
+ * @param cmprs Compress the records written. + * @param maxSize Maximum size for ARC files written. + * @param suffix File tail to use. If null, unused. + * @param warcinfoData File metadata for warcinfo record. + */ + public WARCWriter(final AtomicInteger serialNo, + final WARCWriterPoolSettings settings) { + super(serialNo, settings, WARC_FILE_EXTENSION); + } + + @Override + protected String createFile(File file) throws IOException { + String filename = super.createFile(file); + writeWarcinfoRecord(filename); + return filename; + } + + protected void baseCharacterCheck(final char c, final String parameter) + throws IllegalArgumentException { + // TODO: Too strict? UNICODE control characters? + if (Character.isISOControl(c) || !Character.isValidCodePoint(c)) { + throw new IllegalArgumentException("Contains illegal character 0x" + + Integer.toHexString(c) + ": " + parameter); + } + } + + protected String checkHeaderValue(final String value) + throws IllegalArgumentException { + for (int i = 0; i < value.length(); i++) { + final char c = value.charAt(i); + baseCharacterCheck(c, value); + if (Character.isWhitespace(c)) { + throw new IllegalArgumentException("Contains disallowed white space 0x" + + Integer.toHexString(c) + ": " + value); + } + } + return value; + } + + protected String checkHeaderLineMimetypeParameter(final String parameter) + throws IllegalArgumentException { + StringBuilder sb = new StringBuilder(parameter.length()); + boolean wasWhitespace = false; + for (int i = 0; i < parameter.length(); i++) { + char c = parameter.charAt(i); + if (Character.isWhitespace(c)) { + // Map all to ' ' and collapse multiples into one. + // TODO: Make sure white space occurs in legal location -- + // before parameter or inside quoted-string. 
+ if (wasWhitespace) { + continue; + } + wasWhitespace = true; + c = ' '; + } else { + wasWhitespace = false; + baseCharacterCheck(c, parameter); + } + sb.append(c); + } + + return sb.toString(); + } + +// protected String createRecordHeader(final String type, +// final String url, final String create14DigitDate, +// final String mimetype, final URI recordId, +// final ANVLRecord xtraHeaders, final long contentLength) + protected String createRecordHeader(WARCRecordInfo metaRecord) + throws IllegalArgumentException { + final StringBuilder sb = + new StringBuilder(2048/*A SWAG: TODO: Do analysis.*/); + sb.append(WARC_ID).append(CRLF); + sb.append(HEADER_KEY_TYPE).append(COLON_SPACE).append(metaRecord.getType()). + append(CRLF); + // Do not write a subject-uri if not one present. + if (!StringUtils.isEmpty(metaRecord.getUrl())) { + sb.append(HEADER_KEY_URI).append(COLON_SPACE). + append(checkHeaderValue(metaRecord.getUrl())).append(CRLF); + } + sb.append(HEADER_KEY_DATE).append(COLON_SPACE). + append(metaRecord.getCreate14DigitDate()).append(CRLF); + if (metaRecord.getExtraHeaders() != null) { + for (final Iterator i = metaRecord.getExtraHeaders().iterator(); i.hasNext();) { + sb.append(i.next()).append(CRLF); + } + } + + sb.append(HEADER_KEY_ID).append(COLON_SPACE).append('<'). + append(metaRecord.getRecordId().toString()).append('>').append(CRLF); + if (metaRecord.getContentLength() > 0) { + sb.append(CONTENT_TYPE).append(COLON_SPACE).append( + checkHeaderLineMimetypeParameter(metaRecord.getMimetype())).append(CRLF); + } + sb.append(CONTENT_LENGTH).append(COLON_SPACE). 
+ append(Long.toString(metaRecord.getContentLength())).append(CRLF); + + return sb.toString(); + } + + public void writeRecord(WARCRecordInfo recordInfo) + throws IOException { + + if (recordInfo.getContentLength() == 0 && + (recordInfo.getExtraHeaders() == null || recordInfo.getExtraHeaders().size() <= 0)) { + throw new IllegalArgumentException("Cannot write record " + + "of content-length zero and base headers only."); + } + + String header; + try { + header = createRecordHeader(recordInfo); + + } catch (IllegalArgumentException e) { /* header failed validation: log it and skip this record */ + logger.log(Level.SEVERE,"could not write record type: " + recordInfo.getType() + + " for URL: " + recordInfo.getUrl(), e); + return; + } + + long contentBytes = 0; + long totalBytes = 0; + long startPosition; + + try { + startPosition = getPosition(); + preWriteRecordTasks(); + + // TODO: Revisit encoding of header. + byte[] bytes = header.getBytes(WARC_HEADER_ENCODING); + write(bytes); + totalBytes += bytes.length; + + if (recordInfo.getContentStream() != null && recordInfo.getContentLength() > 0) { + // Write out the header/body separator. + write(CRLF_BYTES); // TODO: should this be written even for zero-length? + totalBytes += CRLF_BYTES.length; + contentBytes += copyFrom(recordInfo.getContentStream(), + recordInfo.getContentLength(), + recordInfo.getEnforceLength()); + totalBytes += contentBytes; + } + + // Write out the two blank lines at end of all records. 
+ write(CRLF_BYTES); + write(CRLF_BYTES); + totalBytes += 2 * CRLF_BYTES.length; + + tally(recordInfo.getType(), contentBytes, totalBytes, getPosition() - startPosition); + + recordInfo.setWARCFilename(getFilenameWithoutOccupiedSuffix()); + recordInfo.setWARCFileOffset(startPosition); + tmpRecordLog.add(recordInfo); + } finally { + postWriteRecordTasks(); + } + } + + public String getFilenameWithoutOccupiedSuffix() { + String name = getFile().getName(); + if (name.endsWith(ArchiveFileConstants.OCCUPIED_SUFFIX)) { + name = name.substring(0, name.length() - ArchiveFileConstants.OCCUPIED_SUFFIX.length()); + } + return name; + } + + // if compression is enabled, sizeOnDisk means compressed bytes; if not, it + // should be the same as totalBytes (right?) + protected void tally(WARCRecordType warcRecordType, long contentBytes, long totalBytes, long sizeOnDisk) { + if (tmpStats == null) { + tmpStats = new HashMap>(); + } + + // add to stats for this record type + Map substats = tmpStats.get(warcRecordType.toString()); + if (substats == null) { + substats = new HashMap(); + tmpStats.put(warcRecordType.toString(), substats); + } + subtally(substats, contentBytes, totalBytes, sizeOnDisk); + + // add to totals + substats = tmpStats.get(TOTALS); + if (substats == null) { + substats = new HashMap(); + tmpStats.put(TOTALS, substats); + } + subtally(substats, contentBytes, totalBytes, sizeOnDisk); + } + + protected void subtally(Map substats, long contentBytes, + long totalBytes, long sizeOnDisk) { + + if (substats.get(NUM_RECORDS) == null) { + substats.put(NUM_RECORDS, 1l); + } else { + substats.put(NUM_RECORDS, substats.get(NUM_RECORDS) + 1); + } + + if (substats.get(CONTENT_BYTES) == null) { + substats.put(CONTENT_BYTES, contentBytes); + } else { + substats.put(CONTENT_BYTES, substats.get(CONTENT_BYTES) + contentBytes); + } + + if (substats.get(TOTAL_BYTES) == null) { + substats.put(TOTAL_BYTES, totalBytes); + } else { + substats.put(TOTAL_BYTES, substats.get(TOTAL_BYTES) + 
totalBytes); + } + + if (substats.get(SIZE_ON_DISK) == null) { + substats.put(SIZE_ON_DISK, sizeOnDisk); + } else { + substats.put(SIZE_ON_DISK, substats.get(SIZE_ON_DISK) + sizeOnDisk); + } + } + + protected URI generateRecordId(final Map qualifiers) + throws IOException { + return ((WARCWriterPoolSettings)settings).getRecordIDGenerator().getQualifiedRecordID(qualifiers); + } + + protected URI generateRecordId(final String key, final String value) + throws IOException { + return ((WARCWriterPoolSettings)settings).getRecordIDGenerator().getQualifiedRecordID(key, value); + } + + public URI writeWarcinfoRecord(String filename) + throws IOException { + return writeWarcinfoRecord(filename, null); + } + + public URI writeWarcinfoRecord(String filename, final String description) + throws IOException { + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.warcinfo); + recordInfo.setCreate14DigitDate(ArchiveUtils.getLog14Date()); + recordInfo.setMimetype("application/warc-fields"); + + // Strip .open suffix if present. + if (filename.endsWith(WriterPoolMember.OCCUPIED_SUFFIX)) { + filename = filename.substring(0, + filename.length() - WriterPoolMember.OCCUPIED_SUFFIX.length()); + } + recordInfo.addExtraHeader(HEADER_KEY_FILENAME, filename); + if (description != null && description.length() > 0) { + recordInfo.addExtraHeader(CONTENT_DESCRIPTION, description); + } + + // Add warcinfo body. + byte [] warcinfoBody = null; + if (settings.getMetadata() == null) { + // TODO: What to write into a warcinfo? What to associate? 
+ warcinfoBody = "TODO: Unimplemented".getBytes(); + } else { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + for (final Iterator i = settings.getMetadata().iterator(); + i.hasNext();) { + baos.write(i.next().toString().getBytes(UTF8Bytes.UTF8)); + } + warcinfoBody = baos.toByteArray(); + } + recordInfo.setContentStream(new ByteArrayInputStream(warcinfoBody)); + recordInfo.setContentLength((long) warcinfoBody.length); + recordInfo.setEnforceLength(true); + + recordInfo.setRecordId(generateRecordId(TYPE, WARCRecordType.warcinfo.toString())); + + writeRecord(recordInfo); + + // TODO: If at start of file, and we're writing compressed, + // write out our distinctive GZIP extensions. + return recordInfo.getRecordId(); + } + + /** + * @see WARCWriter#tmpStats for usage model + */ + public void resetTmpStats() { + if (tmpStats != null) { + for (Map substats : tmpStats.values()) { + for (Entry entry : substats.entrySet()) { + entry.setValue(0l); + } + } + } + } + + public Map> getTmpStats() { + return tmpStats; + } + + public static long getStat(Map> map, String key, + String subkey) { + if (map != null && map.get(key) != null + && map.get(key).get(subkey) != null) { + return map.get(key).get(subkey); + } else { + return 0l; + } + } + + public static long getStat( + ConcurrentMap> map, + String key, String subkey) { + if (map != null && map.get(key) != null + && map.get(key).get(subkey) != null) { + return map.get(key).get(subkey).get(); + } else { + return 0l; + } + } + + public void resetTmpRecordLog() { + tmpRecordLog.clear(); + } + + public Iterable getTmpRecordLog() { + return tmpRecordLog; + } +} diff --git a/src/main/java/org/archive/io/warc/WARCWriterPool.java b/src/main/java/org/archive/io/warc/WARCWriterPool.java new file mode 100644 index 00000000..fdc97162 --- /dev/null +++ b/src/main/java/org/archive/io/warc/WARCWriterPool.java @@ -0,0 +1,64 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). 
+ * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io.warc; + +import java.util.concurrent.atomic.AtomicInteger; + +import org.archive.io.WriterPool; +import org.archive.io.WriterPoolMember; + + +/** + * A pool of WARCWriters. + * @contributor stack + * @contributor gojomo + * @version $Revision: 4566 $ $Date: 2006-08-31 09:51:41 -0700 (Thu, 31 Aug 2006) $ + */ +public class WARCWriterPool extends WriterPool { + /** + * Constructor + * @param settings Settings for this pool. + * @param poolMaximumActive + * @param poolMaximumWait + */ + public WARCWriterPool(final WARCWriterPoolSettings settings, + final int poolMaximumActive, final int poolMaximumWait) { + this(new AtomicInteger(), settings, poolMaximumActive, poolMaximumWait); + } + + /** + * Constructor + * @param serial Used to generate unique filename sequences + * @param settings Settings for this pool. 
+ * @param poolMaximumActive + * @param poolMaximumWait + */ + public WARCWriterPool(final AtomicInteger serial, + final WARCWriterPoolSettings settings, + final int poolMaximumActive, final int poolMaximumWait) { + super(serial, settings, poolMaximumActive, poolMaximumWait); + } + + /* (non-Javadoc) + * @see org.archive.io.WriterPool#makeWriter() + */ + protected WriterPoolMember makeWriter() { + return new WARCWriter(serialNo, (WARCWriterPoolSettings)settings); + } +} diff --git a/src/main/java/org/archive/io/warc/WARCWriterPoolSettings.java b/src/main/java/org/archive/io/warc/WARCWriterPoolSettings.java new file mode 100644 index 00000000..b028a8b7 --- /dev/null +++ b/src/main/java/org/archive/io/warc/WARCWriterPoolSettings.java @@ -0,0 +1,32 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io.warc; + +import org.archive.io.WriterPoolSettings; +import org.archive.uid.RecordIDGenerator; + +/** + * Settings object for a {@link WARCWriterPool}. + * Used creating {@link WARCWriter}s. 
+ * + * @version $Date: 2010-08-19 17:21:43 -0700 (Thu, 19 Aug 2010) $, $Revision: 6927 $ + */ +public interface WARCWriterPoolSettings extends WriterPoolSettings { + public RecordIDGenerator getRecordIDGenerator(); +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/warc/WARCWriterPoolSettingsData.java b/src/main/java/org/archive/io/warc/WARCWriterPoolSettingsData.java new file mode 100644 index 00000000..d56c9971 --- /dev/null +++ b/src/main/java/org/archive/io/warc/WARCWriterPoolSettingsData.java @@ -0,0 +1,40 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.io.warc; + +import java.io.File; +import java.util.List; + +import org.archive.io.arc.WriterPoolSettingsData; +import org.archive.uid.RecordIDGenerator; + +public class WARCWriterPoolSettingsData extends WriterPoolSettingsData implements WARCWriterPoolSettings { + RecordIDGenerator generator; + + public WARCWriterPoolSettingsData(String prefix, String template, + long maxFileSizeBytes, boolean compress, List outputDirs, + List metadata, RecordIDGenerator generator) { + super(prefix,template,maxFileSizeBytes,compress,outputDirs,metadata); + this.generator = generator; + } + @Override + public RecordIDGenerator getRecordIDGenerator() { + return generator; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/io/warc/package.html b/src/main/java/org/archive/io/warc/package.html new file mode 100644 index 00000000..f52aa95b --- /dev/null +++ b/src/main/java/org/archive/io/warc/package.html @@ -0,0 +1,38 @@ + + + +org.archive.io.warc package + + +Experimental WARC Writer and Readers. Code and specification subject to change +with no guarantees of backward compatibility: i.e. newer readers +may not be able to parse WARCs written with older writers. This package +contains prototyping code for revision 0.12 of the WARC specification. +See latest revision +for current state (Version 0.10 code and its documentation has been moved into the +v10 subpackage). + + +

Implementation Notes

+

Tools

+

Initial implementations of Arc2Warc and Warc2Arc +tools can be found in the package above this one, at +{@link org.archive.io.Arc2Warc} and {@link org.archive.io.Warc2Arc} +respectively. Pass --help to learn how to use each tool. +

+ +

TODO

+
    +
  • Is MIME-Version header needed? MIME Parsers seem fine without (python email +lib and java mail).
  • +
  • Should we write out a Content-Transfer-Encoding +header (Currently we do not). Need section in spec. explicit about our +interpretation of MIME and deviations (e.g. content-transfer-encoding should +be assumed binary in case of WARCs, multipart is not disallowed but not +encouraged, etc.)
  • +
  • Minor: Do WARC-Version: 0.12 like MIME-Version: 1.0 rather than +WARC/0.12 for lead in to an ARCRecord?
  • +
+ + + diff --git a/src/main/java/org/archive/net/DownloadURLConnection.java b/src/main/java/org/archive/net/DownloadURLConnection.java new file mode 100644 index 00000000..fbcee421 --- /dev/null +++ b/src/main/java/org/archive/net/DownloadURLConnection.java @@ -0,0 +1,131 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.net; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.net.URLConnection; +import java.util.Arrays; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.archive.util.ProcessUtils; +import org.archive.util.ProcessUtils.ProcessResult; + +/** + * An URL Connection that pre-downloads URL reference before passing back a + * Stream reference. When closed, it removes the local download file. 
+ * @author stack + * @version $Date$, $Revision$ + */ +public abstract class DownloadURLConnection extends URLConnection { + private final String CLASSNAME = DownloadURLConnection.class.getName(); + private final Logger LOGGER = Logger.getLogger(CLASSNAME); + private static final File TMPDIR = + new File(System.getProperty("java.io.tmpdir", "/tmp")); + private File downloadFile = null; + + protected DownloadURLConnection(URL u) { + super(u); + } + + protected String getScript() { + return System.getProperty(this.getClass().getName() + ".path", + "UNDEFINED"); + } + + protected String [] getCommand(final URL thisUrl, + final File downloadFile) { + return new String[] {getScript(), thisUrl.getPath(), + downloadFile.getAbsolutePath()}; + } + + /** + * Do script copy to local file. + * File is available via {@link #getFile()}. + * @throws IOException + */ + public void connect() throws IOException { + if (this.connected) { + return; + } + + this.downloadFile = File.createTempFile(CLASSNAME, null, TMPDIR); + try { + String [] cmd = getCommand(this.url, this.downloadFile); + if (LOGGER.isLoggable(Level.FINE)) { + StringBuffer buffer = new StringBuffer(); + for (int i = 0; i < cmd.length; i++) { + if (i > 0) { + buffer.append(" "); + } + buffer.append(cmd[i]); + } + LOGGER.fine("Command: " + buffer.toString()); + } + ProcessResult pr = ProcessUtils.exec(cmd); + if (pr.getResult() != 0) { + LOGGER.info(Arrays.toString(cmd) + " returned non-null " + pr.getResult()); + } + // Assume download went smoothly. + this.connected = true; + } catch (IOException ioe) { + // Clean up my tmp file. + this.downloadFile.delete(); + this.downloadFile = null; + // Rethrow. 
+ throw ioe; + } + } + + public File getFile() { + return this.downloadFile; + } + + protected void setFile(final File f) { + this.downloadFile = f; + } + + public InputStream getInputStream() throws IOException { + if (!this.connected) { + connect(); + } + + // Return BufferedInputStream so 'delegation' is done for me, so + // I don't have to implement all IS methods and pass to my + // 'delegate' instance. + final DownloadURLConnection connection = this; + return new BufferedInputStream(new FileInputStream(this.downloadFile)) { + private DownloadURLConnection ruc = connection; + + public void close() throws IOException { + super.close(); + if (this.ruc != null && this.ruc.getFile()!= null && + this.ruc.getFile().exists()) { + this.ruc.getFile().delete(); + this.ruc.setFile(null); + } + } + }; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/net/FTPException.java b/src/main/java/org/archive/net/FTPException.java new file mode 100644 index 00000000..2d104390 --- /dev/null +++ b/src/main/java/org/archive/net/FTPException.java @@ -0,0 +1,56 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.net; + +import java.io.IOException; + +/** + * Indicates that a FTP operation failed due to a protocol violation. 
+ * For instance, if authentication fails. + * + * @author pjack + */ +public class FTPException extends IOException { + private static final long serialVersionUID = 1L; + + /** + * The reply code from the FTP server. + */ + private int code; + + /** + * Constructs a new FTPException. + * + * @param code the error code from the FTP server + */ + public FTPException(int code) { + super("FTP error code: " + code); + this.code = code; + } + + + /** + * Returns the error code from the FTP server. + * + * @return the error code from the FTP server + */ + public int getReplyCode() { + return code; + } +} diff --git a/src/main/java/org/archive/net/md5/Handler.java b/src/main/java/org/archive/net/md5/Handler.java new file mode 100644 index 00000000..8afcdebb --- /dev/null +++ b/src/main/java/org/archive/net/md5/Handler.java @@ -0,0 +1,87 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.net.md5; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.net.URLConnection; +import java.net.URLStreamHandler; + +/** + * A protocol handler for an 'md5' URI scheme. 
+ * Md5 URLs look like this: md5:deadbeefdeadbeefdeadbeefdeadbeef + * When this handler is invoked against an md5 URL, it passes the raw md5 to + * the configured script as an argument. The configured script then does the + * work to bring the item pointed to by the md5 local so we can open a Stream + * on the local copy. Local file is deleted when we finish. Do + * {@link org.archive.net.DownloadURLConnection#getFile()} to get name of + * temporary file. + * + *

You need to define the system property + * -Djava.protocol.handler.pkgs=org.archive.net to add this handler + * to the java.net.URL set. Also define system properties + * -Dorg.archive.net.md5.Md5URLConnection.path=PATH_TO_SCRIPT to + * pass path of script to run as well as + * -Dorg.archive.net.md5.Md5URLConnection.options=OPTIONS for + * any options you'd like to include. The pointed-to PATH_TO_SCRIPT + * will be invoked as follows: PATH_TO_SCRIPT OPTIONS MD5 + * LOCAL_TMP_FILE. The LOCAL_TMP_FILE file is made in + * java.io.tmpdir using java tmp name code. + * @author stack + */ +public class Handler extends URLStreamHandler { + protected URLConnection openConnection(URL u) { + return new Md5URLConnection(u); + } + + /** + * Main dumps the file referenced by an md5 URL to STDOUT. + * @param args exactly one argument: the md5 URL to fetch + * @throws IOException if opening the URL, reading it or writing to STDOUT fails + */ + public static void main(String[] args) + throws IOException { + if (args.length != 1) { + System.out.println("Usage: java " + + "-Djava.protocol.handler.pkgs=org.archive.net " + + "org.archive.net.md5.Handler " + + "md5:deadbeefdeadbeefdeadbeefdeadbeef"); + System.exit(1); + } + System.setProperty("org.archive.net.md5.Md5URLConnection.path", + "/tmp/manifest"); + System.setProperty("java.protocol.handler.pkgs", "org.archive.net"); + URL u = new URL(args[0]); + URLConnection connect = u.openConnection(); + // Write download to stdout. 
+ final int bufferlength = 4096; + byte [] buffer = new byte [bufferlength]; + InputStream is = connect.getInputStream(); + try { + for (int count = 0; /* no priming read: the condition below performs every read */ + (count = is.read(buffer, 0, bufferlength)) != -1;) { + System.out.write(buffer, 0, count); + } + System.out.flush(); + } finally { + is.close(); + } + } +} diff --git a/src/main/java/org/archive/net/md5/Md5URLConnection.java b/src/main/java/org/archive/net/md5/Md5URLConnection.java new file mode 100644 index 00000000..e4fe98e3 --- /dev/null +++ b/src/main/java/org/archive/net/md5/Md5URLConnection.java @@ -0,0 +1,34 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.net.md5; + +import java.net.URL; + +import org.archive.net.DownloadURLConnection; + +/** + * Md5 URL connection. 
+ * @author stack
+ * @version $Date$, $Revision$
+ */
+public class Md5URLConnection extends DownloadURLConnection {
+    protected Md5URLConnection(URL u) {
+        super(u);
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/net/rsync/Handler.java b/src/main/java/org/archive/net/rsync/Handler.java
new file mode 100644
index 00000000..9eb35f5d
--- /dev/null
+++ b/src/main/java/org/archive/net/rsync/Handler.java
@@ -0,0 +1,71 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.net.rsync;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLStreamHandler;
+
+/**
+ * A protocol handler that uses native rsync client to do copy.
+ * You need to define the system property
+ * -Djava.protocol.handler.pkgs=org.archive.net to add this handler
+ * to the java.net.URL set.  Assumes rsync is in path.  Define
+ * system property
+ * -Dorg.archive.net.rsync.RsyncURLConnection.path=PATH_TO_RSYNC to
+ * pass path to rsync. Downloads to java.io.tmpdir.
+ * @author stack
+ */
+public class Handler extends URLStreamHandler {
+    /**
+     * Creates the connection object used for rsync URLs.
+     * @param u URL to open.
+     * @return A new {@link RsyncURLConnection} for the passed URL.
+     */
+    protected URLConnection openConnection(URL u) {
+        return new RsyncURLConnection(u);
+    }
+
+    /**
+     * Fetches the URL passed as the only argument and dumps the download
+     * to STDOUT. Prints usage and exits with status 1 unless exactly one
+     * argument is supplied.
+     * @param args Single-element array holding the URL to download.
+     * @throws IOException If the connection cannot be opened or read.
+     */
+    public static void main(String[] args)
+    throws IOException {
+        if (args.length != 1) {
+            System.out.println("Usage: java " +
+                "-Djava.protocol.handler.pkgs=org.archive.net " +
+                "org.archive.net.rsync.Handler RSYNC_URL");
+            System.exit(1);
+        }
+        URL u = new URL(args[0]);
+        URLConnection connect = u.openConnection();
+        // Write download to stdout.
+        final int bufferlength = 4096;
+        byte [] buffer = new byte [bufferlength];
+        InputStream is = connect.getInputStream();
+        try {
+            // Read in the initialiser and again in the update clause only,
+            // so every chunk that is read (the first included) is written.
+            for (int count = is.read(buffer, 0, bufferlength);
+                    count != -1;
+                    count = is.read(buffer, 0, bufferlength)) {
+                System.out.write(buffer, 0, count);
+            }
+            System.out.flush();
+        } finally {
+            is.close();
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/net/rsync/RsyncURLConnection.java b/src/main/java/org/archive/net/rsync/RsyncURLConnection.java
new file mode 100644
index 00000000..c6097e96
--- /dev/null
+++ b/src/main/java/org/archive/net/rsync/RsyncURLConnection.java
@@ -0,0 +1,51 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.net.rsync;
+
+import java.io.File;
+import java.net.URL;
+
+import org.archive.net.DownloadURLConnection;
+
+/**
+ * Rsync URL connection.
+ * @author stack
+ * @version $Date$, $Revision$
+ */
+public class RsyncURLConnection extends DownloadURLConnection {
+    /**
+     * Value passed to the command as --timeout=VALUE. Read from the system
+     * property named after this class plus ".timeout"; defaults to "300"
+     * (presumably seconds, as for rsync's --timeout option -- confirm).
+     */
+    private final String RSYNC_TIMEOUT =
+        System.getProperty(RsyncURLConnection.class.getName() + ".timeout",
+            "300");
+
+    /**
+     * @param u URL this connection is for; handed to the superclass.
+     */
+    protected RsyncURLConnection(URL u) {
+        super(u);
+    }
+
+    /**
+     * @return Program to run: the value of the system property named after
+     * the runtime class plus ".path", or "rsync" if that property is unset.
+     */
+    protected String getScript() {
+        return System.getProperty(this.getClass().getName() + ".path",
+            "rsync");
+    }
+
+    /**
+     * Builds the command line: the program from {@link #getScript()}, the
+     * --timeout option, the path part of this connection's url, and the
+     * absolute path of the download file.
+     * NOTE(review): thisUrl is not used; the path is taken from this.url
+     * instead -- confirm that is intended.
+     * @param thisUrl URL being fetched (currently unused).
+     * @param downloadFile Local file the download is written to.
+     * @return Command and arguments as an array.
+     */
+    @Override
+    protected String[] getCommand(final URL thisUrl,
+            final File downloadFile) {
+        return new String[] {getScript(), "--timeout=" + RSYNC_TIMEOUT,
+            this.url.getPath(), downloadFile.getAbsolutePath()};
+    }
+}
diff --git a/src/main/java/org/archive/uid/RecordIDGenerator.java b/src/main/java/org/archive/uid/RecordIDGenerator.java
new file mode 100644
index 00000000..97f1a022
--- /dev/null
+++ b/src/main/java/org/archive/uid/RecordIDGenerator.java
@@ -0,0 +1,72 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.uid;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.Map;
+
+/**
+ * A record-id generator.
+ *
+ * @contributor stack
+ * @contributor gojomo
+ * @version $Revision$ $Date$
+ */
+public interface RecordIDGenerator {
+    /**
+     * @return A URI that can serve as a record-id.
+     */
+    public URI getRecordID();
+
+    /**
+     * @param qualifiers Qualifiers to add.
+     * @return A URI qualified with passed qualifiers that can
+     * serve as a record-id, or, a new, unique record-id without qualifiers
+     * (if qualifiers not easily implemented using passed URI scheme).
+     */
+    public URI getQualifiedRecordID(final Map qualifiers);
+
+    /**
+     * @param key Name of qualifier
+     * @param value Value of qualifier
+     * @return A URI qualified with the passed qualifier that can
+     * serve as a record-id, or, a new, unique record-id without qualifiers
+     * (if qualifiers not easily implemented using passed URI scheme).
+     */
+    public URI getQualifiedRecordID(final String key, final String value);
+
+    /**
+     * Append (or if already present, update) qualifiers to passed
+     * recordId.  Use with caution. Guard against turning up a
+     * result that already exists.  Use when writing a group of records inside
+     * a single transaction.
+     *
+     * How qualifiers are appended/updated varies with URI scheme. It's allowed
+     * that an invocation of this method does nought but call
+     * {@link #getRecordID()}, returning a new URI unrelated to the passed
+     * recordId and passed qualifier.
+     * @param recordId URI to append qualifier to.
+     * @param qualifiers Map of qualifier values keyed by qualifier name.
+     * @return New URI based off passed recordId and passed qualifiers.
+     */
+    public URI qualifyRecordID(final URI recordId,
+        final Map qualifiers);
+}
diff --git a/src/main/java/org/archive/uid/UUIDGenerator.java b/src/main/java/org/archive/uid/UUIDGenerator.java
new file mode 100644
index 00000000..26d29e60
--- /dev/null
+++ b/src/main/java/org/archive/uid/UUIDGenerator.java
@@ -0,0 +1,72 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.uid;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.Map;
+import java.util.UUID;
+
+/**
+ * Generates UUIDs, using
+ * {@link java.util.UUID java.util.UUID}, formatted as URNs from the UUID
+ * namespace [See RFC4122].
+ * Here is an example of the type of ID it makes:
+ * urn:uuid:0161811f-5da6-4c6e-9808-a2fab97114cf. Always makes a
+ * new identifier even when passed qualifiers.
+ *
+ * @author stack
+ * @version $Revision$ $Date$
+ * @see RFC4122
+ */
+public class UUIDGenerator implements RecordIDGenerator {
+    /** URN prefix, without the trailing colon, of every generated id. */
+    private static final String SCHEME = "urn:uuid";
+    /** {@link #SCHEME} followed by the colon that precedes the UUID. */
+    private static final String SCHEME_COLON = SCHEME + ":";
+
+    public UUIDGenerator() {
+        super();
+    }
+
+    /**
+     * Ignores both arguments and returns a brand new record-id.
+     * @param recordId Not used.
+     * @param qualifiers Not used.
+     * @return Result of {@link #getRecordID()}.
+     */
+    public URI qualifyRecordID(URI recordId,
+            final Map qualifiers) {
+        return getRecordID();
+    }
+
+    /**
+     * @return String form of a freshly generated random UUID.
+     */
+    private String getUUID() {
+        return UUID.randomUUID().toString();
+    }
+
+    /**
+     * @return A new URI made of "urn:uuid:" followed by a random UUID.
+     * @throws RuntimeException Wrapping the URISyntaxException, should the
+     * generated string ever fail to parse as a URI.
+     */
+    public URI getRecordID() {
+        try {
+            return new URI(SCHEME_COLON + getUUID());
+        } catch (URISyntaxException e) {
+            // should be impossible
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Ignores the qualifier and returns a brand new record-id.
+     * @param key Not used.
+     * @param value Not used.
+     * @return Result of {@link #getRecordID()}.
+     */
+    public URI getQualifiedRecordID(
+            final String key, final String value){
+        return getRecordID();
+    }
+
+    /**
+     * Ignores the qualifiers and returns a brand new record-id.
+     * @param qualifiers Not used.
+     * @return Result of {@link #getRecordID()}.
+     */
+    public URI getQualifiedRecordID(Map qualifiers){
+        return getRecordID();
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/uid/package.html b/src/main/java/org/archive/uid/package.html
new file mode 100644
index 00000000..dc49f07b
--- /dev/null
+++ b/src/main/java/org/archive/uid/package.html
@@ -0,0 +1,28 @@
+
+
+
+org.archive.uid package
+
+
+A unique ID generator.
+Default is {@link org.archive.uid.UUIDGenerator}.
+To use another ID Generator, set the System Property
+org.archive.uid.GeneratorFactory.generator to point
+at an alternate implementation of {@link org.archive.uid.Generator}.
+
+

TODO

+
    +
  • MIME boundaries have an upper bound of 70 characters in total, including + the 'blank line' (CRLFCRLF) and the two leading hyphens. Add to the + {@link org.archive.uid.Generator} + interface an upper bound on generated ID length.
  • +
  • Add example of an actionable uid generator: +e.g. http://archive.org/UID-SCHEME/ID +where scheme might be UUID and an ID might be +f9472055-fbb6-4810-90e8-68fd39e145a6;type=metadata or, +using ARK: +http://archive.org/ark:/13030/f9472055-fbb6-4810-90e8-68fd39e145a6;type=metadata. +
  • +
+
+
diff --git a/src/main/java/org/archive/util/DevUtils.java b/src/main/java/org/archive/util/DevUtils.java
new file mode 100644
index 00000000..d630a0b1
--- /dev/null
+++ b/src/main/java/org/archive/util/DevUtils.java
@@ -0,0 +1,116 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.util;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.logging.Logger;
+
+
+/**
+ * Write a message and stack trace to the 'org.archive.util.DevUtils' logger.
+ *
+ * @author gojomo
+ * @version $Revision$ $Date$
+ */
+public class DevUtils {
+    /** Logger named after this class; target of {@link #warnHandle}. */
+    public static Logger logger =
+        Logger.getLogger(DevUtils.class.getName());
+
+    /**
+     * Log a warning message to the logger 'org.archive.util.DevUtils' made of
+     * the passed 'note' and a stack trace based off passed exception.
+     *
+     * @param ex Exception we print a stacktrace on.
+     * @param note Message to print ahead of the stacktrace.
+     */
+    public static void warnHandle(Throwable ex, String note) {
+        logger.warning(TextUtils.exceptionToString(note, ex));
+    }
+
+    /**
+     * Collects whatever the current thread can report about itself: its
+     * Reporter output if it is a Reporter, followed by its progress
+     * statistics legend and line if it is a ProgressStatisticsReporter.
+     *
+     * @return Extra information gotten from current Thread.  May not
+     * always be available in which case we return empty string.
+     */
+    public static String extraInfo() {
+        StringWriter sw = new StringWriter();
+        PrintWriter pw = new PrintWriter(sw);
+        final Thread current = Thread.currentThread();
+        if (current instanceof Reporter) {
+            Reporter tt = (Reporter)current;
+            try {
+                tt.reportTo(pw);
+            } catch (IOException e) {
+                // Not really possible w/ a StringWriter
+                e.printStackTrace();
+            }
+        }
+        if (current instanceof ProgressStatisticsReporter) {
+            ProgressStatisticsReporter tt = (ProgressStatisticsReporter)current;
+            try {
+                tt.progressStatisticsLegend(pw);
+                tt.progressStatisticsLine(pw);
+            } catch (IOException e) {
+                // Not really possible w/ a StringWriter
+                e.printStackTrace();
+            }
+        }
+        pw.flush();
+        return sw.toString();
+    }
+
+    /**
+     * Nothing to see here, move along.
+     * @deprecated  This method was never used.
+     */
+    @Deprecated
+    public static void betterPrintStack(RuntimeException re) {
+        re.printStackTrace(System.err);
+    }
+
+    /**
+     * Send this JVM process a SIGQUIT; giving a thread dump and possibly
+     * a heap histogram (if using -XX:+PrintClassHistogram).
+     *
+     * Used to automatically dump info, for example when a serious error
+     * is encountered. Would use 'jmap'/'jstack', but have seen JVM
+     * lockups -- perhaps due to lost thread wake signals -- when using
+     * those against Sun 1.5.0+03 64bit JVM.
+     *
+     * Works by running a perl child that prints its parent's pid and then
+     * running 'kill -3' on that pid through sh. Failures are reported but
+     * never thrown; if the calling thread is interrupted while waiting for
+     * kill, its interrupt status is restored.
+     */
+    public static void sigquitSelf() {
+        BufferedReader br = null;
+        try {
+            Process p = Runtime.getRuntime().exec(
+                    new String[] {"perl", "-e", "print getppid(). \"\n\";"});
+            br = new BufferedReader(new InputStreamReader(p.getInputStream()));
+            String ppid = br.readLine();
+            if (ppid == null) {
+                // perl printed nothing, so there is no pid to signal
+                logger.warning("sigquitSelf: unable to determine own pid");
+                return;
+            }
+            Runtime.getRuntime().exec(
+                    new String[] {"sh", "-c", "kill -3 "+ppid}).waitFor();
+        } catch (IOException e) {
+            // Best-effort diagnostic aid: report and carry on.
+            e.printStackTrace();
+        } catch (InterruptedException e) {
+            // Restore the interrupt status so callers can still observe it.
+            Thread.currentThread().interrupt();
+        } finally {
+            if (br != null) {
+                try {
+                    br.close();
+                } catch (IOException ignored) {
+                    // nothing useful to do if closing the pipe fails
+                }
+            }
+        }
+    }
+}
diff --git a/src/main/java/org/archive/util/FileUtils.java b/src/main/java/org/archive/util/FileUtils.java
new file mode 100644
index 00000000..3de276a9
--- /dev/null
+++ b/src/main/java/org/archive/util/FileUtils.java
@@ -0,0 +1,712 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.util;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.channels.ClosedByInterruptException;
+import java.nio.channels.FileChannel;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Properties;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.filefilter.IOFileFilter;
+import org.apache.commons.lang.math.LongRange;
+
+
+/** Utility methods for manipulating files and directories.
+ *
+ * @contributor John Erik Halse
+ * @contributor gojomo
+ */
+public class FileUtils {
+    private static final Logger LOGGER =
+        Logger.getLogger(FileUtils.class.getName());
+
+    /**
+     * Constructor made private because all methods of this class are static.
+     */
+    private FileUtils() {
+        super();
+    }
+
+    /**
+     * Copy the src file to the destination. Deletes any preexisting
+     * file at destination.
+     *
+     * @param src File to copy from.
+     * @param dest File to copy to.
+     * @return True if the channel copy completed; see
+     * {@link #copyFile(File, File, long, boolean)}.
+     * @throws FileNotFoundException
+     * @throws IOException
+     */
+    public static boolean copyFile(final File src, final File dest)
+    throws FileNotFoundException, IOException {
+        return copyFile(src, dest, -1, true);
+    }
+
+    /**
+     * Copy up to extent bytes of the source file to the destination.
+     * Deletes any preexisting file at destination.
+     *
+     * @param src File to copy from.
+     * @param dest File to copy to.
+     * @param extent Maximum number of bytes to copy
+     * @return True if the channel copy completed; see
+     * {@link #copyFile(File, File, long, boolean)}.
+     * @throws FileNotFoundException
+     * @throws IOException
+     */
+    public static boolean copyFile(final File src, final File dest,
+        long extent)
+    throws FileNotFoundException, IOException {
+        return copyFile(src, dest, extent, true);
+    }
+
+    /**
+     * Copy up to extent bytes of the source file to the destination
+     *
+     * @param src File to copy from.
+     * @param dest File to copy to.
+     * @param extent Maximum number of bytes to copy; a negative value means
+     * the whole source file.
+     * @param overwrite If target file already exists, and this parameter is
+     * true, overwrite target file (We do this by first deleting the target
+     * file before we begin the copy).
+     * @return True if the channel copy ran to completion. False if the
+     * destination already existed and overwrite was false, or if the copy
+     * had to be made by the stream-based workaround.
+     * @throws FileNotFoundException
+     * @throws IOException
+     */
+    public static boolean copyFile(final File src, final File dest,
+        long extent, final boolean overwrite)
+    throws FileNotFoundException, IOException {
+        boolean result = false;
+        if (LOGGER.isLoggable(Level.FINE)) {
+            LOGGER.fine("Copying file " + src + " to " + dest + " extent " +
+                extent + " exists " + dest.exists());
+        }
+        if (dest.exists()) {
+            if (overwrite) {
+                dest.delete();
+                LOGGER.finer(dest.getAbsolutePath() + " removed before copy.");
+            } else {
+                // Already in place and we're not to overwrite.  Return.
+                return result;
+            }
+        }
+        FileInputStream fis = null;
+        FileOutputStream fos = null;
+        FileChannel fcin = null;
+        FileChannel fcout = null;
+        try {
+            // Get channels
+            fis = new FileInputStream(src);
+            fos = new FileOutputStream(dest);
+            fcin = fis.getChannel();
+            fcout = fos.getChannel();
+            if (extent < 0) {
+                extent = fcin.size();
+            }
+
+            // Do the file copy. A single transferTo call may move fewer
+            // bytes than requested, so keep going until the extent is
+            // reached or the source has nothing more to give.
+            long copied = 0;
+            while (copied < extent) {
+                long trans = fcin.transferTo(copied, extent - copied, fcout);
+                if (trans <= 0) {
+                    break;
+                }
+                copied += trans;
+            }
+            result = true;
+        } catch (IOException e) {
+            // Add more info to the exception. Preserve old stacktrace.
+            // We get 'Invalid argument' on some file copies. See
+            // http://intellij.net/forums/thread.jsp?forum=13&thread=63027&message=853123
+            // for related issue.
+            String message = "Copying " + src.getAbsolutePath() + " to " +
+                dest.getAbsolutePath() + " with extent " + extent +
+                " got IOE: " + e.getMessage();
+            if ((e instanceof ClosedByInterruptException) ||
+                    ((e.getMessage()!=null)
+                            &&e.getMessage().equals("Invalid argument"))) {
+                LOGGER.severe("Failed copy, trying workaround: " + message);
+                workaroundCopyFile(src, dest);
+            } else {
+                IOException newE = new IOException(message);
+                newE.initCause(e);
+                throw newE;
+            }
+        } finally {
+            // finish up
+            if (fcin != null) {
+                fcin.close();
+            }
+            if (fcout != null) {
+                fcout.close();
+            }
+            if (fis != null) {
+                fis.close();
+            }
+            if (fos != null) {
+                fos.close();
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Plain stream copy of the whole of src to dest, used when the channel
+     * based copy fails. Errors closing either stream are printed, not
+     * thrown.
+     *
+     * @param src File to copy from.
+     * @param dest File to copy to.
+     * @throws IOException If opening, reading or writing fails.
+     */
+    protected static void workaroundCopyFile(final File src,
+            final File dest)
+    throws IOException {
+        FileInputStream from = null;
+        FileOutputStream to = null;
+        try {
+            from = new FileInputStream(src);
+            to = new FileOutputStream(dest);
+            byte[] buffer = new byte[4096];
+            int bytesRead;
+            while ((bytesRead = from.read(buffer)) != -1) {
+                to.write(buffer, 0, bytesRead);
+            }
+        } finally {
+            if (from != null) {
+                try {
+                    from.close();
+                } catch (IOException e) {
+                    e.printStackTrace();
+                }
+            }
+            if (to != null) {
+                try {
+                    to.close();
+                } catch (IOException e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+    }
+
+    /**
+     * Get a list of all files in directory that have passed prefix.
+     *
+     * @param dir Dir to look in.
+     * @param prefix Basename of files to look for. Compare is case insensitive.
+     *
+     * @return List of files in dir that start w/ passed basename.
+     */
+    public static File [] getFilesWithPrefix(File dir, final String prefix) {
+        FileFilter prefixFilter = new FileFilter() {
+                public boolean accept(File pathname)
+                {
+                    return pathname.getName().toLowerCase().
+                        startsWith(prefix.toLowerCase());
+                }
+            };
+        return dir.listFiles(prefixFilter);
+    }
+
+    /** Get a {@link java.io.FileFilter} that filters files based on a regular
+     * expression.
+     *
+     * @param regex the regular expression the file names must match in full.
+     * @return the newly created filter.
+     */
+    public static IOFileFilter getRegexFileFilter(String regex) {
+        // Inner class defining the RegexFileFilter
+        class RegexFileFilter implements IOFileFilter {
+            Pattern pattern;
+
+            protected RegexFileFilter(String re) {
+                pattern = Pattern.compile(re);
+            }
+
+            public boolean accept(File pathname) {
+                return pattern.matcher(pathname.getName()).matches();
+            }
+
+            public boolean accept(File dir, String name) {
+                return accept(new File(dir,name));
+            }
+        }
+
+        return new RegexFileFilter(regex);
+    }
+
+    /**
+     * Test file exists and is readable.
+     * @param f File to test.
+     * @return The passed file.
+     * @exception FileNotFoundException If file does not exist or is not readable.
+     */
+    public static File assertReadable(final File f) throws FileNotFoundException {
+        if (!f.exists()) {
+            throw new FileNotFoundException(f.getAbsolutePath() +
+                " does not exist.");
+        }
+
+        if (!f.canRead()) {
+            throw new FileNotFoundException(f.getAbsolutePath() +
+                " is not readable.");
+        }
+
+        return f;
+    }
+
+    /**
+     * @param f File to test.
+     * @param uncompressedExtension Extension the lower-cased file name must
+     * end with.
+     * @param magic String expected at file start; compared ignoring case.
+     * @return True if file is readable, has uncompressed extension,
+     * and magic string at file start.
+     * @exception IOException If file not readable or other problem.
+     */
+    public static boolean isReadableWithExtensionAndMagic(final File f,
+            final String uncompressedExtension, final String magic)
+    throws IOException {
+        boolean result = false;
+        FileUtils.assertReadable(f);
+        if(f.getName().toLowerCase().endsWith(uncompressedExtension)) {
+            FileInputStream fis = new FileInputStream(f);
+            try {
+                byte [] b = new byte[magic.length()];
+                int read = fis.read(b, 0, magic.length());
+                if (read == magic.length()) {
+                    StringBuffer beginStr
+                        = new StringBuffer(magic.length());
+                    for (int i = 0; i < magic.length(); i++) {
+                        beginStr.append((char)b[i]);
+                    }
+
+                    if (beginStr.toString().
+                            equalsIgnoreCase(magic)) {
+                        result = true;
+                    }
+                }
+            } finally {
+                // single close point, reached on success and on failure
+                fis.close();
+            }
+        }
+
+        return result;
+    }
+
+    /**
+     * Turn path into a File, relative to context (which may be ignored
+     * if path is absolute).
+     *
+     * @param context File context if path is relative
+     * @param path String path to make into a File
+     * @return File created
+     */
+    public static File maybeRelative(File context, String path) {
+        File f = new File(path);
+        if(f.isAbsolute()) {
+            return f;
+        }
+        return new File(context, path);
+    }
+
+    /**
+     * Load Properties instance from a File
+     *
+     * @param file File to read properties from.
+     * @return Properties
+     * @throws IOException
+     */
+    public static Properties loadProperties(File file) throws IOException {
+        FileInputStream finp = new FileInputStream(file);
+        try {
+            Properties p = new Properties();
+            p.load(finp);
+            return p;
+        } finally {
+            ArchiveUtils.closeQuietly(finp);
+        }
+    }
+
+    /**
+     * Store Properties instance to a File
+     * @param p Properties to write.
+     * @param file destination File
+     * @throws IOException
+     */
+    public static void storeProperties(Properties p, File file) throws IOException {
+        FileOutputStream fos = new FileOutputStream(file);
+        try {
+            p.store(fos,"");
+        } finally {
+            ArchiveUtils.closeQuietly(fos);
+        }
+    }
+
+    /**
+     * Rename an existing file out of the way. The new name is the file's
+     * canonical path plus "." plus a date string derived from its
+     * last-modified time.
+     *
+     * @param file File to move aside.
+     * @return True if the file does not exist or the rename succeeded; false
+     * (after logging a warning) if the rename failed.
+     * @throws IOException If the canonical path cannot be resolved.
+     */
+    public static boolean moveAsideIfExists(File file) throws IOException {
+        if(!file.exists()) {
+            return true;
+        }
+        String newName =
+            file.getCanonicalPath() + "." +
+            ArchiveUtils.get14DigitDate(file.lastModified());
+        boolean retVal = file.renameTo(new File(newName));
+        if(!retVal) {
+            LOGGER.warning("unable to move aside: "+file+" to "+newName);
+        }
+        return retVal;
+
+    }
+
+    /**
+     * Retrieve a number of lines from the file around the given
+     * position, as when paging forward or backward through a file.
+     *
+     * @param file File to retrieve lines
+     * @param position offset to anchor lines
+     * @param signedDesiredLineCount lines requested; if negative,
+     *        want this number of lines ending with a line containing
+     *        the position; if positive, want this number of lines,
+     *        all starting at or after position.
+     * @param lines List to insert found lines
+     * @param lineEstimate int estimate of line size, 0 means use default
+     *        of 128
+     * @return LongRange indicating the file offsets corresponding to
+     *         the beginning of the first line returned, and the point
+     *         after the end of the last line returned
+     * @throws IOException
+     */
+    @SuppressWarnings("unchecked")
+    public static LongRange pagedLines(File file, long position,
+            int signedDesiredLineCount, List<String> lines, int lineEstimate)
+            throws IOException {
+        // consider negative positions as from end of file; -1 = last byte
+        if (position < 0) {
+            position = file.length() + position;
+        }
+
+        // calculate a reasonably sized chunk likely to have all desired lines
+        if(lineEstimate == 0) {
+            lineEstimate = 128;
+        }
+        int desiredLineCount = Math.abs(signedDesiredLineCount);
+        long startPosition;
+        long fileEnd = file.length();
+        int bufferSize = (desiredLineCount + 5) * lineEstimate;
+        if(signedDesiredLineCount>0) {
+            // reading forward; include previous char in case line-end
+            startPosition = position - 1;
+        } else {
+            // reading backward
+            startPosition = position - bufferSize + (2 * lineEstimate);
+        }
+        if(startPosition<0) {
+            startPosition = 0;
+        }
+        if(startPosition+bufferSize > fileEnd) {
+            bufferSize = (int)(fileEnd - startPosition);
+        }
+
+        // read that reasonable chunk; the stream is closed even if
+        // positioning or reading fails
+        byte[] buf = new byte[bufferSize];
+        FileInputStream fis = new FileInputStream(file);
+        try {
+            fis.getChannel().position(startPosition);
+            ArchiveUtils.readFully(fis, buf);
+        } finally {
+            IOUtils.closeQuietly(fis);
+        }
+
+        // find all line starts fully in buffer
+        // (positions after a line-end, per line-end definition in
+        // BufferedReader.readLine)
+        LinkedList<Integer> lineStarts = new LinkedList<Integer>();
+        if(startPosition==0) {
+            lineStarts.add(0);
+        }
+        boolean atLineEnd = false;
+        boolean eatLF = false;
+        int i;
+        for(i = 0; i < bufferSize; i++) {
+            if ((char) buf[i] == '\n' && eatLF) {
+                eatLF = false;
+                continue;
+            }
+            if(atLineEnd) {
+                atLineEnd = false;
+                lineStarts.add(i);
+                if(signedDesiredLineCount<0 && startPosition+i > position) {
+                    // reached next line past position, read no more
+                    break;
+                }
+            }
+            if ((char) buf[i] == '\r') {
+                atLineEnd = true;
+                eatLF = true;
+                continue;
+            }
+            if ((char) buf[i] == '\n') {
+                atLineEnd = true;
+            }
+        }
+        if(startPosition+i == fileEnd) {
+            // add phantom lineStart after end
+            lineStarts.add(bufferSize);
+        }
+        int foundFullLines = lineStarts.size()-1;
+
+        // if found no lines
+        if(foundFullLines<1) {
+            if(signedDesiredLineCount>0) {
+                if(startPosition+bufferSize == fileEnd) {
+                    // nothing more to read: return nothing
+                    return new LongRange(fileEnd,fileEnd);
+                } else {
+                    // retry with larger lineEstimate
+                    return pagedLines(file, position, signedDesiredLineCount, lines, Math.max(bufferSize,lineEstimate));
+                }
+
+            } else {
+                // try again with much larger line estimate
+                // TODO: fail gracefully before growing to multi-MB buffers
+                return pagedLines(file, position, signedDesiredLineCount, lines, bufferSize);
+            }
+        }
+
+        // trim unneeded lines
+        while(signedDesiredLineCount>0 && startPosition+lineStarts.getFirst() < position) {
+            // reading forward: discard lines starting before desired position
+            lineStarts.removeFirst();
+        }
+        while(lineStarts.size() > desiredLineCount+1) {
+            if (signedDesiredLineCount < 0 && (startPosition+lineStarts.get(1) <= position) ) {
+                // discard from front until reach line containing target position
+                lineStarts.removeFirst();
+            } else {
+                lineStarts.removeLast();
+            }
+        }
+        int firstLine = lineStarts.getFirst();
+        int partialLine = lineStarts.getLast();
+        LongRange range = new LongRange(startPosition + firstLine, startPosition + partialLine);
+        List<String> foundLines =
+            IOUtils.readLines(new ByteArrayInputStream(buf,firstLine,partialLine-firstLine));
+
+        if(foundFullLines < desiredLineCount && signedDesiredLineCount < 0 && startPosition > 0) {
+            // if needed and reading backward, read more lines from earlier
+            range = expandRange(
+                        range,
+                        pagedLines(file,
+                                range.getMinimumLong()-1,
+                                signedDesiredLineCount+foundFullLines,
+                                lines,
+                                bufferSize/foundFullLines));
+
+        }
+
+        lines.addAll(foundLines);
+
+        if(signedDesiredLineCount < 0 && range.getMaximumLong() < position) {
+            // did not get line containing start position
+            range = expandRange(
+                        range,
+                        pagedLines(file,
+                                partialLine,
+                                1,
+                                lines,
+                                bufferSize/foundFullLines));
+        }
+
+        if(signedDesiredLineCount > 0 && foundFullLines < desiredLineCount && range.getMaximumLong() < fileEnd) {
+            // need more forward lines
+            range = expandRange(
+                        range,
+                        pagedLines(file,
+                                range.getMaximumLong(),
+                                desiredLineCount - foundFullLines,
+                                lines,
+                                bufferSize/foundFullLines));
+        }
+
+        return range;
+    }
+
+    /**
+     * @param range1 First range.
+     * @param range2 Second range.
+     * @return New range running from the smaller of the two minimums to the
+     * larger of the two maximums.
+     */
+    public static LongRange expandRange(LongRange range1, LongRange range2) {
+        return new LongRange(Math.min(range1.getMinimumLong(), range2.getMinimumLong()),
+                Math.max(range1.getMaximumLong(), range2.getMaximumLong()));
+
+    }
+
+    /**
+     * Same as {@link #pagedLines(File, long, int, List, int)} with a line
+     * estimate of 0, i.e. the default estimate.
+     */
+    public static LongRange pagedLines(File file, long position, int signedDesiredLongCount, List<String> lines) throws IOException {
+        return pagedLines(file, position, signedDesiredLongCount, lines, 0);
+    }
+
+    /**
+     * Delete the file now -- but in the event of failure, keep trying
+     * in the future.
+     *
+     * VERY IMPORTANT: Do not use with any file whose name/path may be
+     * reused, because the lagged delete could then wind up deleting the
+     * newer file. Essentially, only to be used with uniquely-named temp
+     * files.
+     *
+     * Necessary because some platforms (looking at you,
+     * JVM-on-Windows) will have deletes fail because of things like
+     * file-mapped buffers remaining, and there's no explicit way to
+     * unmap a buffer. (See 6-year-old Sun-stumping Java bug
+     * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4724038 )
+     * We just have to wait and retry.
+     *
+     * (Why not just File.deleteOnExit? There could be an arbitrary,
+     * unbounded number of files in such a situation, that are only
+     * deletable a few seconds or minutes after our first attempt.
+     * Waiting for JVM exit could mean disk exhaustion. It's also
+     * unclear if the native FS class implementations of deleteOnExit
+     * use RAM per pending file.)
+     *
+     * @param fileToDelete File to delete now or on a later call.
+     */
+    public static synchronized void deleteSoonerOrLater(File fileToDelete) {
+        pendingDeletes.add(fileToDelete);
+        // if things are getting out of hand, force gc/finalization
+        if(pendingDeletes.size()>50) {
+            LOGGER.warning(">50 pending Files to delete; forcing gc/finalization");
+            System.gc();
+            System.runFinalization();
+        }
+        // try all pendingDeletes
+        Iterator<File> iter = pendingDeletes.listIterator();
+        while(iter.hasNext()) {
+            File pending = iter.next();
+            if(pending.delete()) {
+                iter.remove();
+            }
+        }
+        // if things are still out of hand, complain loudly
+        if(pendingDeletes.size()>50) {
+            LOGGER.severe(">50 pending Files to delete even after gc/finalization");
+        }
+    }
+    /** Files whose deletion failed so far; retried by deleteSoonerOrLater. */
+    protected static LinkedList<File> pendingDeletes = new LinkedList<File>();
+
+    /**
+     * Read the entire stream to EOF into the passed file.
+     * Closes the passed stream when done or if an exception, including
+     * when the output file cannot be opened.
+     * @param is Stream to read.
+     * @param toFile File to write to.
+     * @return Number of bytes copied.
+     * @throws IOException
+     */
+    public static long readFullyToFile(InputStream is, File toFile)
+            throws IOException {
+        OutputStream os = null;
+        try {
+            os = org.apache.commons.io.FileUtils.openOutputStream(toFile);
+            return IOUtils.copyLarge(is, os);
+        } finally {
+            IOUtils.closeQuietly(os);
+            IOUtils.closeQuietly(is);
+        }
+    }
+
+    /**
+     * Ensure writeable directory.
+     *
+     * If doesn't exist, we attempt creation.
+     *
+     * @param dir Directory to test for existence and is writeable.
+     *
+     * @return The passed dir, as a File.
+     *
+     * @exception IOException If passed directory does not exist and is not
+     * createable, or directory is not writeable or is not a directory.
+     */
+    public static File ensureWriteableDirectory(String dir)
+    throws IOException {
+        return FileUtils.ensureWriteableDirectory(new File(dir));
+    }
+
+    /**
+     * Ensure writeable directories.
+     *
+     * If doesn't exist, we attempt creation.
+     *
+     * @param dirs List of Files to test.
+     *
+     * @return The passed dirs.
+     *
+     * @exception IOException If passed directory does not exist and is not
+     * createable, or directory is not writeable or is not a directory.
+     */
+    public static List<File> ensureWriteableDirectory(List<File> dirs)
+    throws IOException {
+        for (Iterator<File> i = dirs.iterator(); i.hasNext();) {
+             FileUtils.ensureWriteableDirectory(i.next());
+        }
+        return dirs;
+    }
+
+    /**
+     * Ensure writeable directory.
+     *
+     * If doesn't exist, we attempt creation.
+     *
+     * @param dir Directory to test for existence and is writeable.
+     *
+     * @return The passed dir.
+     *
+     * @exception IOException If passed directory does not exist and is not
+     * createable, or directory is not writeable or is not a directory.
+     */
+    public static File ensureWriteableDirectory(File dir)
+    throws IOException {
+        if (!dir.exists()) {
+            boolean success = dir.mkdirs();
+            if (!success) {
+                throw new IOException("Failed to create directory: " + dir);
+            }
+        } else {
+            if (!dir.canWrite()) {
+                throw new IOException("Dir " + dir.getAbsolutePath() +
+                    " not writeable.");
+            } else if (!dir.isDirectory()) {
+                throw new IOException("Dir " + dir.getAbsolutePath() +
+                    " is not a directory.");
+            }
+        }
+
+        return dir;
+    }
+
+    /**
+     * @param file File to canonicalize.
+     * @return The canonical form of the file, or the passed file itself if
+     * canonicalization fails with an IOException.
+     */
+    public static File tryToCanonicalize(File file) {
+        try {
+            return file.getCanonicalFile();
+        } catch (IOException e) {
+            return file;
+        }
+    }
+
+    /**
+     * Append the whole content of one file to the end of another. Both
+     * streams are closed whether or not the copy succeeds.
+     *
+     * @param fileToAppendTo File that grows; opened in append mode.
+     * @param fileToAppendFrom File whose bytes are appended.
+     * @throws IOException If opening, reading, writing or closing fails.
+     */
+    public static void appendTo(File fileToAppendTo, File fileToAppendFrom) throws IOException {
+        // optimal io block size according to http://lingrok.org/xref/coreutils/src/ioblksize.h
+        byte[] buf = new byte[65536];
+        FileOutputStream out = new FileOutputStream(fileToAppendTo, true);
+        try {
+            FileInputStream in = new FileInputStream(fileToAppendFrom);
+            try {
+                for (int n = in.read(buf); n > 0; n = in.read(buf)) {
+                    out.write(buf, 0, n);
+                }
+            } finally {
+                in.close();
+            }
+            out.flush();
+        } finally {
+            out.close();
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/InetAddressUtil.java b/src/main/java/org/archive/util/InetAddressUtil.java
new file mode 100644
index 00000000..585ba772
--- /dev/null
+++ b/src/main/java/org/archive/util/InetAddressUtil.java
@@ -0,0 +1,116 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.util; + +import java.net.InetAddress; +import java.net.NetworkInterface; +import java.net.SocketException; +import java.net.UnknownHostException; +import java.util.ArrayList; +import java.util.Enumeration; +import java.util.List; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * InetAddress utility. + * @author stack + * @version $Date$, $Revision$ + */ +public class InetAddressUtil { + private static Logger logger = + Logger.getLogger(InetAddressUtil.class.getName()); + + /** + * ipv4 address. + */ + public static Pattern IPV4_QUADS = Pattern.compile( + "([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})"); + + private InetAddressUtil () { + super(); + } + + /** + * Returns InetAddress for passed host IF its in + * IPV4 quads format (e.g. 128.128.128.128). + *

TODO: Move to an AddressParsingUtil class. + * @param host Host name to examine. + * @return InetAddress IF the passed name was an IP address, else null. + */ + public static InetAddress getIPHostAddress(String host) { + InetAddress result = null; + Matcher matcher = IPV4_QUADS.matcher(host); + if (matcher == null || !matcher.matches()) { + return result; + } + try { + // Doing an Inet.getByAddress() avoids a lookup. + result = InetAddress.getByAddress(host, + new byte[] { + (byte)(new Integer(matcher.group(1)).intValue()), + (byte)(new Integer(matcher.group(2)).intValue()), + (byte)(new Integer(matcher.group(3)).intValue()), + (byte)(new Integer(matcher.group(4)).intValue())}); + } catch (NumberFormatException e) { + logger.warning(e.getMessage()); + } catch (UnknownHostException e) { + logger.warning(e.getMessage()); + } + return result; + } + + /** + * @return All known local names for this host or null if none found. + */ + public static List getAllLocalHostNames() { + List localNames = new ArrayList(); + Enumeration e = null; + try { + e = NetworkInterface.getNetworkInterfaces(); + } catch(SocketException exception) { + throw new RuntimeException(exception); + } + for (; e.hasMoreElements();) { + for (Enumeration ee = e.nextElement().getInetAddresses(); + ee.hasMoreElements();) { + InetAddress ia = ee.nextElement(); + if (ia != null) { + if (ia.getHostName() != null) { + localNames.add(ia.getCanonicalHostName()); + } + if (ia.getHostAddress() != null) { + localNames.add(ia.getHostAddress()); + } + } + } + } + final String localhost = "localhost"; + if (!localNames.contains(localhost)) { + localNames.add(localhost); + } + final String localhostLocaldomain = "localhost.localdomain"; + if (!localNames.contains(localhostLocaldomain)) { + localNames.add(localhostLocaldomain); + } + return localNames; + } +} \ No newline at end of file diff --git a/src/main/java/org/archive/util/IterableLineIterator.java b/src/main/java/org/archive/util/IterableLineIterator.java new 
file mode 100644 index 00000000..6e0d9dc8 --- /dev/null +++ b/src/main/java/org/archive/util/IterableLineIterator.java @@ -0,0 +1,26 @@ +package org.archive.util; + +import java.io.Reader; +import java.util.Iterator; + +import org.apache.commons.io.LineIterator; + +/** + * A LineIterator that also implements Iterable, so that it can be used with + * the java enhanced for-each loop syntax. + * + * @contributor nlevitt + */ +public class IterableLineIterator extends LineIterator + implements Iterable { + + public IterableLineIterator(final Reader reader) + throws IllegalArgumentException { + super(reader); + } + + @SuppressWarnings("unchecked") + public Iterator iterator() { + return this; + } +} diff --git a/src/main/java/org/archive/util/LaxHttpParser.java b/src/main/java/org/archive/util/LaxHttpParser.java new file mode 100644 index 00000000..c1f768f0 --- /dev/null +++ b/src/main/java/org/archive/util/LaxHttpParser.java @@ -0,0 +1,242 @@ +/* + * $Header: /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/java/org/apache/commons/httpclient/LaxHttpParser.java,v 1.13 2005/01/11 13:57:06 oglueck Exp $ + * $Revision$ + * $Date$ + * + * ==================================================================== + * + * Copyright 1999-2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + * + */ +/* + * + */ + +package org.archive.util; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.util.EncodingUtil; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/** + * A Modified version of HttpParser which doesn't throw exceptions on bad header lines + * + * A utility class for parsing http header values according to + * RFC-2616 Section 4 and 19.3. + * + * @author Michael Becke + * @author Oleg Kalnichevski + * + * @since 2.0beta1 + */ +public class LaxHttpParser { + + /** Log object for this class. */ + private static final Log LOG = LogFactory.getLog(LaxHttpParser.class); + + /** + * Constructor for LaxHttpParser. + */ + protected LaxHttpParser() { } + + /** + * Return byte array from an (unchunked) input stream. + * Stop reading when "\n" terminator encountered + * If the stream ends before the line terminator is found, + * the last part of the string will still be returned. + * If no input data available, null is returned. 
+ * + * @param inputStream the stream to read from + * + * @throws IOException if an I/O problem occurs + * @return a byte array from the stream + */ + public static byte[] readRawLine(InputStream inputStream) throws IOException { + LOG.trace("enter LaxHttpParser.readRawLine()"); + + ByteArrayOutputStream buf = new ByteArrayOutputStream(); + int ch; + while ((ch = inputStream.read()) >= 0) { + buf.write(ch); + if (ch == '\n') { // be tolerant (RFC-2616 Section 19.3) + break; + } + } + if (buf.size() == 0) { + return null; + } + return buf.toByteArray(); + } + + /** + * Read up to "\n" from an (unchunked) input stream. + * If the stream ends before the line terminator is found, + * the last part of the string will still be returned. + * If no input data available, null is returned. + * + * @param inputStream the stream to read from + * @param charset charset of HTTP protocol elements + * + * @throws IOException if an I/O problem occurs + * @return a line from the stream + * + * @since 3.0 + */ + public static String readLine(InputStream inputStream, String charset) throws IOException { + LOG.trace("enter LaxHttpParser.readLine(InputStream, String)"); + byte[] rawdata = readRawLine(inputStream); + if (rawdata == null) { + return null; + } + // strip CR and LF from the end + int len = rawdata.length; + int offset = 0; + if (len > 0) { + if (rawdata[len - 1] == '\n') { + offset++; + if (len > 1) { + if (rawdata[len - 2] == '\r') { + offset++; + } + } + } + } + return EncodingUtil.getString(rawdata, 0, len - offset, charset); + } + + /** + * Read up to "\n" from an (unchunked) input stream. + * If the stream ends before the line terminator is found, + * the last part of the string will still be returned. 
+ * If no input data available, null is returned + * + * @param inputStream the stream to read from + * + * @throws IOException if an I/O problem occurs + * @return a line from the stream + * + * @deprecated use #readLine(InputStream, String) + */ + + public static String readLine(InputStream inputStream) throws IOException { + LOG.trace("enter LaxHttpParser.readLine(InputStream)"); + return readLine(inputStream, "US-ASCII"); + } + + /** + * Parses headers from the given stream. Headers with the same name are not + * combined. + * + * @param is the stream to read headers from + * @param charset the charset to use for reading the data + * + * @return an array of headers in the order in which they were parsed + * + * @throws IOException if an IO error occurs while reading from the stream + * @throws HttpException if there is an error parsing a header value + * + * @since 3.0 + */ + public static Header[] parseHeaders(InputStream is, String charset) throws IOException, HttpException { + LOG.trace("enter HeaderParser.parseHeaders(InputStream, String)"); + + ArrayList

headers = new ArrayList
(); + String name = null; + StringBuffer value = null; + for (; ;) { + String line = LaxHttpParser.readLine(is, charset); + if ((line == null) || (line.trim().length() < 1)) { + break; + } + + // Parse the header name and value + // Check for folded headers first + // Detect LWS-char see HTTP/1.0 or HTTP/1.1 Section 2.2 + // discussion on folded headers + if ((line.charAt(0) == ' ') || (line.charAt(0) == '\t')) { + // we have continuation folded header + // so append value + if (value != null) { + value.append(' '); + value.append(line.trim()); + } + } else { + // make sure we save the previous name,value pair if present + if (name != null) { + headers.add(new Header(name, value.toString())); + } + + // Otherwise we should have normal HTTP header line + // Parse the header name and value + int colon = line.indexOf(":"); + + // START IA/HERITRIX change + // Don't throw an exception if can't parse. We want to keep + // going even though header is bad. Rather, create + // pseudo-header. + if (colon < 0) { + // throw new ProtocolException("Unable to parse header: " + + // line); + name = "HttpClient-Bad-Header-Line-Failed-Parse"; + value = new StringBuffer(line); + + } else { + name = line.substring(0, colon).trim(); + value = new StringBuffer(line.substring(colon + 1).trim()); + } + // END IA/HERITRIX change + } + + } + + // make sure we save the last name,value pair if present + if (name != null) { + headers.add(new Header(name, value.toString())); + } + + return (Header[]) headers.toArray(new Header[headers.size()]); + } + + /** + * Parses headers from the given stream. Headers with the same name are not + * combined. 
+ * + * @param is the stream to read headers from + * + * @return an array of headers in the order in which they were parsed + * + * @throws IOException if an IO error occurs while reading from the stream + * @throws HttpException if there is an error parsing a header value + * + * @deprecated use #parseHeaders(InputStream, String) + */ + public static Header[] parseHeaders(InputStream is) throws IOException, HttpException { + LOG.trace("enter HeaderParser.parseHeaders(InputStream, String)"); + return parseHeaders(is, "US-ASCII"); + } +} diff --git a/src/main/java/org/archive/util/MimetypeUtils.java b/src/main/java/org/archive/util/MimetypeUtils.java new file mode 100644 index 00000000..adfa1a0f --- /dev/null +++ b/src/main/java/org/archive/util/MimetypeUtils.java @@ -0,0 +1,75 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.util; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Class of mimetype utilities. + * @author stack + */ +public class MimetypeUtils { + /** + * The 'no-type' content-type. + * + * Defined in the ARC file spec at + * http://www.archive.org/web/researcher/ArcFileFormat.php. + */ + public static final String NO_TYPE_MIMETYPE = "no-type"; + + /** + * Truncation regex. 
+ */ + protected final static Pattern TRUNCATION_REGEX = Pattern.compile("^([^\\s;,]+).*"); + + + /** + * Truncate passed mimetype. + * + * Ensure no spaces. Strip encoding. Truncation required by + * ARC files. + * + *

Truncate at delimiters [;, ]. + * Truncate multi-part content type header at ';'. + * Apache httpclient collapses values of multiple instances of the + * header into one comma-separated value,therefore truncated at ','. + * Current ia_tools that work with arc files expect 5-column + * space-separated meta-lines, therefore truncate at ' '. + * + * @param contentType Raw content-type. + * + * @return Computed content-type made from passed content-type after + * running it through a set of rules. + */ + public static String truncate(String contentType) { + if (contentType == null) { + contentType = NO_TYPE_MIMETYPE; + } else { + Matcher matcher = TRUNCATION_REGEX.matcher(contentType); + if (matcher.matches()) { + contentType = matcher.group(1); + } else { + contentType = NO_TYPE_MIMETYPE; + } + } + + return contentType; + } +} diff --git a/src/main/java/org/archive/util/ProcessUtils.java b/src/main/java/org/archive/util/ProcessUtils.java new file mode 100644 index 00000000..af792981 --- /dev/null +++ b/src/main/java/org/archive/util/ProcessUtils.java @@ -0,0 +1,151 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.util; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.Arrays; +import java.util.logging.Level; +import java.util.logging.Logger; + +/** + * Class to run an external process. + * @author stack + * @version $Date$ $Revision$ + */ +public class ProcessUtils { + private static final Logger LOGGER = + Logger.getLogger(ProcessUtils.class.getName()); + + protected ProcessUtils() { + super(); + } + + /** + * Thread to gobble up an output stream. + * See http://www.javaworld.com/javaworld/jw-12-2000/jw-1229-traps.html + */ + protected class StreamGobbler extends Thread { + private final InputStream is; + private final StringBuffer sink = new StringBuffer(); + + protected StreamGobbler(InputStream is, String name) { + this.is = is; + setName(name); + } + + public void run() { + try { + BufferedReader br = + new BufferedReader(new InputStreamReader(this.is)); + for (String line = null; (line = br.readLine()) != null;) { + this.sink.append(line); + } + } catch (IOException ioe) { + ioe.printStackTrace(); + } + } + + public String getSink() { + return this.sink.toString(); + } + } + + /** + * Data structure to hold result of a process exec. 
+ * @author stack + * @version $Date$ $Revision$ + */ + public class ProcessResult { + private final String [] args; + private final int result; + private final String stdout; + private final String stderr; + + protected ProcessResult(String [] args, int result, String stdout, + String stderr) { + this.args = args; + this.result = result; + this.stderr = stderr; + this.stdout = stdout; + } + + public int getResult() { + return this.result; + } + + public String getStdout() { + return this.stdout; + } + + public String getStderr() { + return this.stderr; + } + + public String toString() { + StringBuffer sb = new StringBuffer(); + for (int i = 0; i < this.args.length; i++) { + sb.append(this.args[i]); + sb.append(", "); + } + return sb.toString() + " exit code: " + this.result + + ((this.stderr != null && this.stderr.length() > 0)? + "\nSTDERR: " + this.stderr: "") + + ((this.stdout != null && this.stdout.length() > 0)? + "\nSTDOUT: " + this.stdout: ""); + } + } + + /** + * Runs process. + * @param args List of process args. + * @return A ProcessResult data structure. + * @throws IOException If interrupted, we throw an IOException. If non-zero + * exit code, we throw an IOException (This may need to change). + */ + public static ProcessUtils.ProcessResult exec(String [] args) + throws IOException { + Process p = Runtime.getRuntime().exec(args); + ProcessUtils pu = new ProcessUtils(); + // Gobble up any output. 
+ StreamGobbler err = pu.new StreamGobbler(p.getErrorStream(), "stderr"); + err.setDaemon(true); + err.start(); + StreamGobbler out = pu.new StreamGobbler(p.getInputStream(), "stdout"); + out.setDaemon(true); + out.start(); + int exitVal; + try { + exitVal = p.waitFor(); + } catch (InterruptedException e) { + throw new IOException("Wait on process " + Arrays.toString(args) + " interrupted: " + + e.getMessage()); + } + ProcessUtils.ProcessResult result = + pu.new ProcessResult(args, exitVal, out.getSink(), err.getSink()); + if (exitVal != 0) { + throw new IOException(result.toString()); + } else if (LOGGER.isLoggable(Level.INFO)) { + LOGGER.info(result.toString()); + } + return result; + } +} diff --git a/src/main/java/org/archive/util/ProgressStatisticsReporter.java b/src/main/java/org/archive/util/ProgressStatisticsReporter.java new file mode 100644 index 00000000..dc1e51f7 --- /dev/null +++ b/src/main/java/org/archive/util/ProgressStatisticsReporter.java @@ -0,0 +1,36 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.util; + +import java.io.IOException; +import java.io.PrintWriter; + +public interface ProgressStatisticsReporter { + /** + * @param writer Where to write statistics. 
+ * @throws IOException + */ + public void progressStatisticsLine(PrintWriter writer) throws IOException; + + /** + * @param writer Where to write statistics legend. + * @throws IOException + */ + public void progressStatisticsLegend(PrintWriter writer) throws IOException; +} diff --git a/src/main/java/org/archive/util/PropertyUtils.java b/src/main/java/org/archive/util/PropertyUtils.java new file mode 100644 index 00000000..083615f6 --- /dev/null +++ b/src/main/java/org/archive/util/PropertyUtils.java @@ -0,0 +1,114 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.util; + +import java.util.Properties; +import java.util.regex.Matcher; + +import org.apache.commons.lang.StringUtils; + +/** + * Utilities for dealing with Java Properties (incl. System Properties) + * + * @contributor stack + * @contributor gojomo + * @version $Date$ $Revision$ + */ +public class PropertyUtils { + /*** + * @param key Property key. + * @return Named property or null if the property is null or empty. + */ + public static String getPropertyOrNull(final String key) { + String value = System.getProperty(key); + return (value == null || value.length() <= 0)? null: value; + } + + /*** + * @param key Property key. + * @return Boolean value or false if null or unreadable. 
+ */ + public static boolean getBooleanProperty(final String key) { + return (getPropertyOrNull(key) == null)? + false: Boolean.valueOf(getPropertyOrNull(key)).booleanValue(); + } + + /** + * @param key Key to use looking up system property. + * @param fallback If no value found for passed key, return + * fallback. + * @return Value of property or fallback. + */ + public static int getIntProperty(final String key, final int fallback) { + return getPropertyOrNull(key) == null? + fallback: Integer.parseInt(getPropertyOrNull(key)); + } + + /** + * Given a string which may contain expressions of the form + * ${key}, replace each expression with the value corresponding to the + * given key in System Properties. If no value is present, + * the expression is replaced with the empty-string. + * + * @param original String + * @param properties Properties to try in order; first value found (if any) is used + * @return modified String + */ + public static String interpolateWithProperties(String original) { + return interpolateWithProperties(original,System.getProperties()); + } + + protected static String propRefPattern = "\\$\\{([^{}]+)\\}"; + + /** + * Given a string which may contain expressions of the form + * ${key}, replace each expression with the value corresponding to the + * given key in the supplied Properties instance. If no value is present, + * the expression is replaced with the empty-string. + * + * @param original String + * @param props Properties to try in order; first value found (if any) is used + * @return modified String + */ + public static String interpolateWithProperties(String original, + Properties... 
props) { + String result = original; + // cap number of interpolations as guard against unending loop + inter: for(int i =0; i < original.length()*2; i++) { + Matcher m = TextUtils.getMatcher(propRefPattern, result); + while(m.find()) { + String key = m.group(1); + String value = ""; + for(Properties properties : props) { + value = properties.getProperty(key, ""); + if(StringUtils.isNotEmpty(value)) { + break; + } + } + result = result.substring(0,m.start()) + + value + + result.substring(m.end()); + continue inter; + } + // we only hit here if there were no interpolations last while loop + break; + } + return result; + } +} diff --git a/src/main/java/org/archive/util/Recorder.java b/src/main/java/org/archive/util/Recorder.java new file mode 100644 index 00000000..425344bb --- /dev/null +++ b/src/main/java/org/archive/util/Recorder.java @@ -0,0 +1,593 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.util; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.nio.charset.Charset; +import java.util.HashSet; +import java.util.Set; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.zip.DeflaterInputStream; +import java.util.zip.GZIPInputStream; + +import org.apache.commons.httpclient.ChunkedInputStream; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.StringUtils; +import org.archive.io.GenericReplayCharSequence; +import org.archive.io.RecordingInputStream; +import org.archive.io.RecordingOutputStream; +import org.archive.io.ReplayCharSequence; +import org.archive.io.ReplayInputStream; + +import com.google.common.base.Charsets; + + +/** + * Pairs together a RecordingInputStream and RecordingOutputStream + * to capture exactly a single HTTP transaction. + * + * Initially only supports HTTP/1.0 (one request, one response per stream) + * + * Call {@link #markContentBegin()} to demarc the transition between HTTP + * header and body. + * + * @author gojomo + */ +public class Recorder { + protected static Logger logger = + Logger.getLogger("org.archive.util.HttpRecorder"); + + private static final int DEFAULT_OUTPUT_BUFFER_SIZE = 16384; + private static final int DEFAULT_INPUT_BUFFER_SIZE = 524288; + + private RecordingInputStream ris = null; + private RecordingOutputStream ros = null; + + /** + * Backing file basename. + * + * Keep it around so can clean up backing files left on disk. + */ + private String backingFileBasename = null; + + /** + * Backing file output stream suffix. + */ + private static final String RECORDING_OUTPUT_STREAM_SUFFIX = ".ros"; + + /** + * Backing file input stream suffix. 
+ */ + private static final String RECORDING_INPUT_STREAM_SUFFIX = ".ris"; + + /** + * recording-input (ris) content character encoding. + */ + protected String characterEncoding = null; + + /** + * Charset to use for CharSequence provision. Will be UTF-8 if no + * encoding ever requested; a Charset matching above characterEncoding + * if possible; ISO_8859 if above characterEncoding is unsatisfiable. + * TODO: unify to UTF-8 for unspecified and bad-specified cases? + * (current behavior is for consistency with our prior but perhaps not + * optimal behavior) + */ + protected Charset charset = Charsets.UTF_8; + + /** whether recording-input (ris) message-body is chunked */ + protected boolean inputIsChunked = false; + + /** recording-input (ris) entity content-encoding (eg gzip, deflate), if any */ + protected String contentEncoding = null; + + private ReplayCharSequence replayCharSequence; + + + /** + * Create an HttpRecorder. + * + * @param tempDir Directory into which we drop backing files for + * recorded input and output. + * @param backingFilenameBase Backing filename base to which we'll append + * suffices ris for recorded input stream and + * ros for recorded output stream. + * @param outBufferSize Size of output buffer to use. + * @param inBufferSize Size of input buffer to use. 
+ */ + public Recorder(File tempDir, String backingFilenameBase, + int outBufferSize, int inBufferSize) { + this(new File(ensure(tempDir), backingFilenameBase), + outBufferSize, inBufferSize); + } + + + private static File ensure(File tempDir) { + try { + org.archive.util.FileUtils.ensureWriteableDirectory(tempDir); + } catch (IOException e) { + throw new IllegalStateException(e); + } + + return tempDir; + } + + public Recorder(File file, int outBufferSize, int inBufferSize) { + super(); + this.backingFileBasename = file.getAbsolutePath(); + this.ris = new RecordingInputStream(inBufferSize, + this.backingFileBasename + RECORDING_INPUT_STREAM_SUFFIX); + this.ros = new RecordingOutputStream(outBufferSize, + this.backingFileBasename + RECORDING_OUTPUT_STREAM_SUFFIX); + } + + /** + * Create an HttpRecorder. + * + * @param tempDir + * Directory into which we drop backing files for recorded input + * and output. + * @param backingFilenameBase + * Backing filename base to which we'll append suffices + * ris for recorded input stream and + * ros for recorded output stream. + */ + public Recorder(File tempDir, String backingFilenameBase) { + this(tempDir, backingFilenameBase, DEFAULT_INPUT_BUFFER_SIZE, + DEFAULT_OUTPUT_BUFFER_SIZE); + } + + + /** + * Wrap the provided stream with the internal RecordingInputStream + * + * open() throws an exception if RecordingInputStream is already open. + * + * @param is InputStream to wrap. + * + * @return The input stream wrapper which itself is an input stream. + * Pass this in place of the passed stream so input can be recorded. 
+     *
+     * @throws IOException
+     */
+    public InputStream inputWrap(InputStream is)
+    throws IOException {
+        logger.fine(Thread.currentThread().getName() + " wrapping input");
+
+        // forget everything learned about the previously-recorded input
+        contentEncoding = null;
+        inputIsChunked = false;
+        characterEncoding = null;
+
+        ris.open(is);
+        return ris;
+    }
+
+    /**
+     * Wrap the provided stream with the internal RecordingOutputStream
+     *
+     * open() throws an exception if RecordingOutputStream is already open.
+     *
+     * @param os The output stream to wrap.
+     *
+     * @return The output stream wrapper which is itself an output stream.
+     * Pass this in place of the passed stream so output can be recorded.
+     *
+     * @throws IOException
+     */
+    public OutputStream outputWrap(OutputStream os)
+    throws IOException {
+        ros.open(os);
+        return ros;
+    }
+
+    /**
+     * Close both recording streams. A failure closing one is logged and
+     * does not prevent the attempt to close the other.
+     */
+    public void close() {
+        logger.fine(Thread.currentThread().getName() + " closing");
+        try {
+            ris.close();
+        } catch (IOException risFailure) {
+            // TODO: Can we not let the exception out of here and report it
+            // higher up in the caller?
+            DevUtils.logger.log(Level.SEVERE, "close() ris" +
+                DevUtils.extraInfo(), risFailure);
+        }
+        try {
+            ros.close();
+        } catch (IOException rosFailure) {
+            DevUtils.logger.log(Level.SEVERE, "close() ros" +
+                DevUtils.extraInfo(), rosFailure);
+        }
+    }
+
+    /**
+     * Return the internal RecordingInputStream
+     *
+     * @return A RIS.
+     */
+    public RecordingInputStream getRecordedInput() {
+        return ris;
+    }
+
+    /**
+     * @return The RecordingOutputStream.
+     */
+    public RecordingOutputStream getRecordedOutput() {
+        return ros;
+    }
+
+    /**
+     * Mark current position as the point where the HTTP headers end.
+     */
+    public void markContentBegin() {
+        ris.markContentBegin();
+    }
+
+    /**
+     * @return the response content length as reported by the
+     * RecordingInputStream
+     */
+    public long getResponseContentLength() {
+        return ris.getResponseContentLength();
+    }
+
+    /**
+     * Close both input and output recorders.
+     *
+     * Recorders are the output streams to which we are recording.
+     * {@link #close()} closes the stream that is being recorded and the
+     * recorder. This method explicitly closes the recorder only.
+     */
+    public void closeRecorders() {
+        // Close each recorder in its own try block so that a failure on the
+        // input side cannot leave the output recorder open.
+        try {
+            this.ris.closeRecorder();
+        } catch (IOException e) {
+            DevUtils.warnHandle(e, "Convert to runtime exception?");
+        }
+        try {
+            this.ros.closeRecorder();
+        } catch (IOException e) {
+            DevUtils.warnHandle(e, "Convert to runtime exception?");
+        }
+    }
+
+    /**
+     * Cleanup backing files.
+     *
+     * Call when completely done w/ recorder. Removes any backing files that
+     * may have been dropped.
+     */
+    public void cleanup() {
+        this.close();
+        this.delete(this.backingFileBasename + RECORDING_OUTPUT_STREAM_SUFFIX);
+        this.delete(this.backingFileBasename + RECORDING_INPUT_STREAM_SUFFIX);
+    }
+
+    /**
+     * Delete file if exists. A failed deletion is logged, not thrown.
+     *
+     * @param name Filename to delete.
+     */
+    private void delete(String name) {
+        File f = new File(name);
+        if (f.exists() && !f.delete()) {
+            logger.warning("unable to delete backing file " + name);
+        }
+    }
+
+
+    protected static ThreadLocal<Recorder> currentRecorder = new ThreadLocal<Recorder>();
+
+    /**
+     * Associate the passed recorder with the calling thread.
+     *
+     * @param httpRecorder recorder to return from {@link #getHttpRecorder()}
+     * on this thread
+     */
+    public static void setHttpRecorder(Recorder httpRecorder) {
+        currentRecorder.set(httpRecorder);
+    }
+
+    /**
+     * Get the current thread's HttpRecorder.
+     *
+     * @return This thread's HttpRecorder. Returns null if none has been
+     * set for the current thread.
+     */
+    public static Recorder getHttpRecorder() {
+        return currentRecorder.get();
+    }
+
+    /**
+     * @param cs Charset to use when decoding the input recording
+     */
+    public void setCharset(Charset cs) {
+        this.charset = cs;
+    }
+
+    /**
+     * @return effective Charset of input recording
+     */
+    public Charset getCharset() {
+        return this.charset;
+    }
+
+    /**
+     * @param chunked whether the message-body of the input recording is chunked
+     */
+    public void setInputIsChunked(boolean chunked) {
+        this.inputIsChunked = chunked;
+    }
+
+    /** Lower-case content-encoding names accepted by setContentEncoding. */
+    protected static Set<String> SUPPORTED_ENCODINGS = new HashSet<String>();
+    static {
+        SUPPORTED_ENCODINGS.add("gzip");
+        SUPPORTED_ENCODINGS.add("x-gzip");
+        SUPPORTED_ENCODINGS.add("deflate");
+        SUPPORTED_ENCODINGS.add("identity");
+        SUPPORTED_ENCODINGS.add("none"); // unofficial but common
+    }
+    /**
+     * Record the declared content-encoding, normalized to lower case.
+     *
+     * @param contentEncoding declared content-encoding of input recording;
+     * must not be null
+     * @throws IllegalArgumentException if the encoding is not one of
+     * SUPPORTED_ENCODINGS (compared case-insensitively)
+     */
+    public void setContentEncoding(String contentEncoding) {
+        // Locale.ROOT keeps the comparison stable whatever the default locale;
+        // under a Turkish locale "GZIP".toLowerCase() does not yield "gzip".
+        String lowerCoding = contentEncoding.toLowerCase(java.util.Locale.ROOT);
+        if(!SUPPORTED_ENCODINGS.contains(lowerCoding)) {
+            throw new IllegalArgumentException("contentEncoding unsupported: "+contentEncoding);
+        }
+        this.contentEncoding = lowerCoding;
+    }
+
+    /**
+     * @return Returns the content-encoding (lower case), or null if none set.
+     */
+    public String getContentEncoding() {
+        return this.contentEncoding;
+    }
+
+
+    /**
+     * @return same result as {@link #getContentReplayCharSequence()}
+     * @throws IOException
+     * @deprecated use getContentReplayCharSequence
+     */
+    @Deprecated
+    public ReplayCharSequence getReplayCharSequence() throws IOException {
+        return getContentReplayCharSequence();
+    }
+
+    /**
+     * @return A ReplayCharSequence. Caller may call
+     * {@link ReplayCharSequence#close()} when finished. However, in
+     * heritrix, the ReplayCharSequence is closed automatically when url
+     * processing has finished; in that context it's preferable not
+     * to close, so that processors can reuse the same instance.
+     * @throws IOException
+     * @see #endReplays()
+     */
+    public ReplayCharSequence getContentReplayCharSequence() throws IOException {
+        if (replayCharSequence == null || !replayCharSequence.isOpen()
+                || !replayCharSequence.getCharset().equals(charset)) {
+            if(replayCharSequence!=null && replayCharSequence.isOpen()) {
+                // existing sequence must not have matched now-configured Charset; close
+                replayCharSequence.close();
+            }
+            replayCharSequence = getContentReplayCharSequence(this.charset);
+        }
+        return replayCharSequence;
+    }
+
+
+    /**
+     * Build a new, uncached ReplayCharSequence over the recorded content.
+     *
+     * @param requestedCharset Charset to decode the recorded content with.
+     * @return A new ReplayCharSequence. Call close on the returned RCS
+     * when done.
+     * @throws IOException if the content cannot be opened or decoded
+     */
+    public ReplayCharSequence getContentReplayCharSequence(Charset requestedCharset) throws IOException {
+        // raw data overflows to disk; use temp file
+        InputStream contentStream = getContentReplayInputStream();
+        try {
+            // NOTE(review): the backing filename is built from the *output*
+            // stream suffix although this replays recorded input; confirm
+            // that this is the intended scratch-file name.
+            return new GenericReplayCharSequence(
+                    contentStream,
+                    calcRecommendedCharBufferSize(this.getRecordedInput()),
+                    this.backingFileBasename + RECORDING_OUTPUT_STREAM_SUFFIX,
+                    requestedCharset);
+        } finally {
+            // close the source stream even when construction fails
+            contentStream.close();
+        }
+    }
+
+    /**
+     * Calculate a recommended size for an in-memory decoded-character buffer
+     * of this content. We seek a size that is itself no larger (in 2-byte chars)
+     * than the memory already used by the RecordingInputStream's internal raw
+     * byte buffer, and also no larger than likely necessary. So, we take the
+     * minimum of the actual recorded byte size and the RecordingInputStream's
+     * max buffer size.
+     *
+     * @param inStream
+     * @return int length for in-memory decoded-character buffer
+     */
+    static protected int calcRecommendedCharBufferSize(RecordingInputStream inStream) {
+        // half the raw buffer length (chars are two bytes), capped by the
+        // number of bytes actually recorded
+        return (int) Math.min(
+                inStream.getRecordedBufferLength()/2,
+                inStream.getSize());
+    }
+
+    /**
+     * Get a raw replay of all recorded data (including, for example, HTTP
+     * protocol headers)
+     *
+     * @return A replay input stream.
+     * @throws IOException
+     */
+    public ReplayInputStream getReplayInputStream() throws IOException {
+        RecordingInputStream recorded = getRecordedInput();
+        return recorded.getReplayInputStream();
+    }
+
+    /**
+     * Get a raw replay of the 'message-body'. For the common case of
+     * HTTP, this is the raw, possibly chunked-transfer-encoded message
+     * contents not including the leading headers.
+     *
+     * @return A replay input stream.
+     * @throws IOException
+     */
+    public ReplayInputStream getMessageBodyReplayInputStream() throws IOException {
+        RecordingInputStream recorded = getRecordedInput();
+        return recorded.getMessageBodyReplayInputStream();
+    }
+
+    /**
+     * Get a raw replay of the 'entity'. For the common case of
+     * HTTP, this is the message-body after any (usually-unnecessary)
+     * transfer-decoding but before any content-encoding (eg gzip) decoding
+     *
+     * @return A replay input stream.
+     * @throws IOException
+     */
+    public InputStream getEntityReplayInputStream() throws IOException {
+        InputStream messageBody = getRecordedInput().getMessageBodyReplayInputStream();
+        if (!inputIsChunked) {
+            return messageBody;
+        }
+        // undo the chunked transfer-encoding
+        return new ChunkedInputStream(messageBody);
+    }
+
+    /**
+     * Get a replay cued up for the 'content' (after all leading headers)
+     *
+     * @return A replay input stream.
+     * @throws IOException
+     */
+    public InputStream getContentReplayInputStream() throws IOException {
+        InputStream entityStream = getEntityReplayInputStream();
+        if(StringUtils.isEmpty(contentEncoding)) {
+            return entityStream;
+        } else if ("gzip".equalsIgnoreCase(contentEncoding) || "x-gzip".equalsIgnoreCase(contentEncoding)) {
+            try {
+                return new GZIPInputStream(entityStream);
+            } catch (IOException ioe) {
+                logger.log(Level.WARNING,"gzip problem; using raw entity instead",ioe);
+                IOUtils.closeQuietly(entityStream); // close partially-read stream
+                return getEntityReplayInputStream();
+            }
+        } else if ("deflate".equalsIgnoreCase(contentEncoding)) {
+            // The recorded entity is deflate-compressed, so it has to be
+            // inflated here; DeflaterInputStream would compress it again.
+            return new java.util.zip.InflaterInputStream(entityStream);
+        } else if ("identity".equalsIgnoreCase(contentEncoding) || "none".equalsIgnoreCase(contentEncoding)) {
+            return entityStream;
+        } else {
+            // shouldn't be reached given check on setContentEncoding
+            logger.log(Level.INFO,"Unknown content-encoding '"+contentEncoding+"' declared; using raw entity instead");
+            return entityStream;
+        }
+    }
+
+    /**
+     * Return a short prefix of the presumed-textual content as a String,
+     * decoded with the currently configured charset.
+     *
+     * @param size max length of String to return
+     * @return String prefix, or empty String (with logged exception) on any error
+     */
+    public String getContentReplayPrefixString(int size) {
+        return getContentReplayPrefixString(size, this.charset);
+    }
+
+    /**
+     * Return a short prefix of the presumed-textual content as a String.
+     *
+     * @param size max length of String to return
+     * @param cs Charset used to decode the content
+     * @return String prefix, or empty String (with logged exception) on any error
+     */
+    public String getContentReplayPrefixString(int size, Charset cs) {
+        try {
+            InputStreamReader isr = new InputStreamReader(getContentReplayInputStream(), cs);
+            try {
+                char[] chars = new char[size];
+                // a single read: the result may be shorter than size
+                int count = isr.read(chars);
+                if (count > 0) {
+                    return new String(chars,0,count);
+                } else {
+                    return "";
+                }
+            } finally {
+                // release the replay stream on the failure path as well
+                isr.close();
+            }
+        } catch (IOException e) {
+            logger.log(Level.SEVERE,"unable to get replay prefix string", e);
+            return "";
+        }
+    }
+
+    /**
+     * Copy the decoded content (see getContentReplayInputStream) to a file.
+     *
+     * @param tempFile file to write the content to
+     * @throws IOException
+     */
+    public void copyContentBodyTo(File tempFile) throws IOException {
+        InputStream inStream = null;
+        OutputStream outStream = null;
+        try {
+            inStream = getContentReplayInputStream();
+            outStream = FileUtils.openOutputStream(tempFile);
+            IOUtils.copy(inStream, outStream);
+        } finally {
+            IOUtils.closeQuietly(inStream);
+            IOUtils.closeQuietly(outStream);
+        }
+    }
+
+    /**
+     * Record the input stream for later playback by an extractor, etc.
+     * This is convenience method used to setup an artificial HttpRecorder
+     * scenario used in unit tests, etc.
+     * @param dir Directory to write backing file to.
+     * @param basename of what we're recording.
+     * @param in Stream to read.
+     * @param encoding Stream encoding.
+     * @throws IOException
+     * @return An {@link org.archive.util.Recorder}.
+     */
+    public static Recorder wrapInputStreamWithHttpRecord(File dir,
+        String basename, InputStream in, String encoding)
+    throws IOException {
+        Recorder rec = new Recorder(dir, basename);
+        if (encoding != null && encoding.length() > 0) {
+            rec.setCharset(Charset.forName(encoding));
+        }
+        // Do not use FastBufferedInputStream here. It does not
+        // support mark.
+        InputStream is = rec.inputWrap(new BufferedInputStream(in));
+        final int BUFFER_SIZE = 1024 * 4;
+        byte [] buffer = new byte[BUFFER_SIZE];
+        while(true) {
+            // Just read it all down.
+            int x = is.read(buffer);
+            if (x == -1) {
+                break;
+            }
+        }
+        is.close();
+        return rec;
+    }
+
+    /**
+     * Close and forget any cached ReplayCharSequence, then clear both
+     * recording streams for reuse. Failures are deliberately ignored.
+     */
+    public void endReplays() {
+        ArchiveUtils.closeQuietly(replayCharSequence);
+        replayCharSequence = null;
+
+        // like closeQuietly
+        try {
+            ris.clearForReuse();
+        } catch (IOException ioe) {
+        }
+
+        // like closeQuietly
+        try {
+            ros.clearForReuse();
+        } catch (IOException e) {
+        }
+    }
+}
diff --git a/src/main/java/org/archive/util/Reporter.java b/src/main/java/org/archive/util/Reporter.java
new file mode 100644
index 00000000..2fcb8cd8
--- /dev/null
+++ b/src/main/java/org/archive/util/Reporter.java
@@ -0,0 +1,56 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.util;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.Map;
+
+public interface Reporter {
+    /**
+     * Make a default report to the passed-in Writer.
+     * Should be equivalent to reportTo(null, writer)
+     *
+     * @param writer to receive report
+     */
+    public void reportTo(PrintWriter writer) throws IOException;
+
+    /**
+     * Write a short single-line summary report
+     *
+     * @param pw to receive report
+     */
+    @Deprecated
+    public void shortReportLineTo(PrintWriter pw) throws IOException;
+
+
+    /**
+     * @return Same data that's in the single line report, as key-value pairs
+     */
+    public Map shortReportMap();
+
+
+    /**
+     * Return a legend for the single-line summary report as a String.
+     *
+     * @return String single-line summary legend
+     */
+    public String shortReportLegend();
+}
diff --git a/src/main/java/org/archive/util/anvl/ANVLRecord.java b/src/main/java/org/archive/util/anvl/ANVLRecord.java
new file mode 100644
index 00000000..de2d3101
--- /dev/null
+++ b/src/main/java/org/archive/util/anvl/ANVLRecord.java
@@ -0,0 +1,336 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.util.anvl;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.archive.io.UTF8Bytes;
+
+/**
+ * An ordered {@link List} with 'data' {@link Element} values.
+ * ANVLRecords end with a blank line.
+ *
+ * @see "A Name-Value Language (ANVL)"
+ * @author stack
+ */
+public class ANVLRecord extends LinkedList<Element> implements UTF8Bytes {
+    private static final Logger logger =
+        Logger.getLogger(ANVLRecord.class.getName());
+
+    public static final String MIMETYPE = "application/warc-fields";
+
+    /** Shared empty record. It is an ordinary mutable list; do not add to it. */
+    public static final ANVLRecord EMPTY_ANVL_RECORD = new ANVLRecord();
+
+    /**
+     * Arbitrary upper bound on maximum size of ANVL Record.
+     * {@link #load(InputStream)} throws an IOException if a record
+     * exceeds this size.
+     */
+    public static final long MAXIMUM_SIZE = 1024 * 10;
+
+    /**
+     * An ANVL 'newline'.
+     * @see <a href="http://en.wikipedia.org/wiki/CRLF">http://en.wikipedia.org/wiki/CRLF</a>
+     */
+    protected static final String CRLF = "\r\n";
+
+    /** What a line break inside a value is rewritten to: CRLF plus one space. */
+    protected static final String FOLD_PREFIX = CRLF + ' ';
+
+    public ANVLRecord() {
+        super();
+    }
+
+    public ANVLRecord(Collection<? extends Element> c) {
+        super(c);
+    }
+
+    /** @deprecated the capacity argument is ignored */
+    public ANVLRecord(int initialCapacity) {
+        super();
+    }
+
+    /**
+     * Append a label-only element.
+     *
+     * @param l label text
+     * @return result of the underlying list add
+     * @throws IllegalArgumentException if the label is rejected by Label
+     */
+    public boolean addLabel(final String l) {
+        return super.add(new Element(new Label(l)));
+    }
+
+    /**
+     * Append a label/value element.
+     *
+     * @param l label text
+     * @param v value text
+     * @return result of the underlying list add, or false (with a logged
+     * warning) if the label or value was rejected
+     */
+    public boolean addLabelValue(final String l, final String v) {
+        try {
+            return super.add(new Element(new Label(l), new Value(v)));
+        } catch (IllegalArgumentException e) {
+            logger.log(Level.WARNING, "bad label " + l + " or value " + v, e);
+            return false;
+        }
+    }
+
+    /**
+     * @return every element followed by CRLF, then a terminating CRLF
+     */
+    @Override
+    public String toString() {
+        // TODO: What to emit for empty ANVLRecord?
+        StringBuilder sb = new StringBuilder();
+        for (final Element e : this) {
+            sb.append(e);
+            sb.append(CRLF);
+        }
+        // 'ANVL Records end in a blank line'.
+        sb.append(CRLF);
+        return sb.toString();
+    }
+
+    /**
+     * @return label to value map of this record; label-only elements map
+     * to null and a repeated label keeps its last value
+     */
+    public Map<String, String> asMap() {
+        Map<String, String> m = new HashMap<String, String>(size());
+        for (final Iterator<Element> i = iterator(); i.hasNext();) {
+            Element e = i.next();
+            m.put(e.getLabel().toString(),
+                e.isValue()? e.getValue().toString(): (String)null);
+        }
+        return m;
+    }
+
+    @Override
+    public ANVLRecord clone() {
+        return (ANVLRecord) super.clone();
+    }
+
+    /**
+     * @return This ANVLRecord as UTF8 bytes.
+     */
+    public byte [] getUTF8Bytes()
+    throws UnsupportedEncodingException {
+        return toString().getBytes(UTF8);
+    }
+
+    /**
+     * Parses a single ANVLRecord from passed InputStream.
+     * Read as a single-byte stream until we get to a CRLFCRLF which
+     * signifies End-of-ANVLRecord. Then parse all read as a UTF-8 Stream.
+     * Doing it this way, while requiring a double-scan, it makes it so do not
+     * need to be passed a RepositionableStream or a Stream that supports
+     * marking. Also no danger of over-reading which can happen when we
+     * wrap passed Stream with an InputStreamReader for doing UTF-8
+     * character conversion (See the ISR class comment).
+     * @param is InputStream
+     * @return An ANVLRecord instance.
+     * @throws IOException if the stream ends, or MAXIMUM_SIZE bytes are
+     * read, before the terminating CRLFCRLF is seen
+     */
+    public static ANVLRecord load(final InputStream is)
+    throws IOException {
+        // It doesn't look like a CRLF sequence is possible in UTF-8 without
+        // it signifying CRLF: The top bits are set in multibyte characters.
+        // Was thinking of recording CRLF as I was running through this first
+        // parse but the offsets would then be incorrect if any multibyte
+        // characters in the intervening gaps between CRLF.
+        boolean isCRLF = false;
+        boolean recordStart = false;
+        ByteArrayOutputStream baos = new ByteArrayOutputStream(1024);
+        boolean done = false;
+        int read = 0;
+        for (int c = -1, previousCharacter; !done;) {
+            if (read++ >= MAXIMUM_SIZE) {
+                throw new IOException("Read " + MAXIMUM_SIZE +
+                    " bytes without finding \\r\\n\\r\\n " +
+                    "End-Of-ANVLRecord");
+            }
+            previousCharacter = c;
+            c = is.read();
+            if (c == -1) {
+                throw new IOException("End-Of-Stream before \\r\\n\\r\\n " +
+                    "End-Of-ANVLRecord:\n" +
+                    new String(baos.toByteArray(), UTF8));
+            }
+            if (isLF((char)c) && isCR((char)previousCharacter)) {
+                if (isCRLF) {
+                    // If we just had a CRLF, then its two CRLFs and its end of
+                    // record. We're done.
+                    done = true;
+                } else {
+                    isCRLF = true;
+                }
+            } else if (!recordStart && Character.isWhitespace(c)) {
+                // Skip any whitespace at start of ANVLRecord.
+                continue;
+            } else {
+                // Clear isCRLF flag if this character is NOT a '\r'.
+                if (isCRLF && !isCR((char)c)) {
+                    isCRLF = false;
+                }
+                // Not whitespace so start record if we haven't already.
+                if (!recordStart) {
+                    recordStart = true;
+                }
+            }
+            baos.write(c);
+        }
+        return load(new String(baos.toByteArray(), UTF8));
+    }
+
+    /**
+     * Parse passed String for an ANVL Record.
+     * Looked at writing javacc grammar but preprocessing is required to
+     * handle folding: See
+     * https://javacc.dev.java.net/servlets/BrowseList?list=users&by=thread&from=56173.
+     * Looked at Terence Parr's ANTLR. More capable. Can set lookahead count.
+     * A value of 3 would help with folding. But its a pain defining UNICODE
+     * grammars -- needed by ANVL -- and support seems incomplete
+     * anyways: http://www.doc.ic.ac.uk/lab/secondyear/Antlr/lexer.html#unicode.
+     * For now, go with the below hand-rolled parser.
+     * @param s String with an ANVLRecord.
+     * @return ANVLRecord parsed from passed String.
+     * @throws IOException if the String ends where the parser needs to look
+     * ahead (a trailing lone CR, or a value line whose CRLF is not followed
+     * by at least two more characters)
+     */
+    public static ANVLRecord load(final String s)
+    throws IOException {
+        ANVLRecord record = new ANVLRecord();
+        boolean inValue = false, inLabel = false, inComment = false,
+            inNewLine = false;
+        String label = null;
+        StringBuilder sb = new StringBuilder(s.length());
+        for (int i = 0; i < s.length(); i++) {
+            char c = s.charAt(i);
+
+            // Assert I can do look-ahead. Only a CR triggers a look at the
+            // following character, so only a CR in last position is fatal.
+            if (isCR(c) && (i + 1) >= s.length()) {
+                throw new IOException("Premature End-of-ANVLRecord:\n" +
+                    s.substring(i));
+            }
+
+            // If at LF of a CRLF, just go around again. Eat up the LF.
+            if (inNewLine && isLF(c)) {
+                continue;
+            }
+
+            // If we're at a CRLF and we were just on one, exit. Found Record.
+            if (inNewLine && isCR(c) && isLF(s.charAt(i + 1))) {
+                break;
+            }
+
+            // Check if we're on a fold inside a Value. Skip multiple white
+            // space after CRLF.
+            if (inNewLine && inValue && Character.isWhitespace(c)) {
+                continue;
+            }
+
+            // Else set flag if we're at a CRLF.
+            inNewLine = isCR(c) && isLF(s.charAt(i + 1));
+
+            if (inNewLine) {
+                if (inComment) {
+                    inComment = false;
+                } else if (label != null && !inValue) {
+                    // Label only 'data element'.
+                    record.addLabel(label);
+                    label = null;
+                    sb.setLength(0);
+                } else if (inValue) {
+                    // Assert I can do look-ahead past current CRLF: the test
+                    // below reads the characters at i + 2 and i + 3.
+                    if ((i + 3) >= s.length()) {
+                        throw new IOException("Premature End-of-ANVLRecord "
+                            + "(2):\n" + s.substring(i));
+                    }
+                    if (!isCR(s.charAt(i + 2)) && !isLF(s.charAt(i + 3))
+                            && Character.isWhitespace(s.charAt(i + 2))) {
+                        // Its a fold. Let it go around. But add in a CRLF and
+                        // space and do it here. We don't let CRLF fall through
+                        // to the sb.append on the end of this loop.
+                        sb.append(CRLF);
+                        sb.append(' ');
+                    } else {
+                        // Next line is a new SubElement, a new Comment or
+                        // Label.
+                        record.addLabelValue(label, sb.toString());
+                        sb.setLength(0);
+                        label = null;
+                        inValue = false;
+                    }
+                } else {
+                    // We're whitespace between label and value or whitespace
+                    // before we've figured whether label or comment.
+                }
+                // Don't let the '\r' or CRLF through.
+                continue;
+            }
+
+            if (inComment) {
+                continue;
+            } else if (inLabel) {
+                if (c == Label.COLON) {
+                    label = sb.toString();
+                    sb.setLength(0);
+                    inLabel = false;
+                    continue;
+                }
+            } else {
+                if (!inLabel && !inValue && !inComment) {
+                    // We have no state. Figure one.
+                    if (Character.isWhitespace(c)) {
+                        // If no state, and whitespace, skip. Don't record.
+                        continue;
+                    } else if (label == null && c == '#') {
+                        inComment = true;
+                        // Don't record comments.
+                        continue;
+                    } else if (label == null) {
+                        inLabel = true;
+                    } else {
+                        inValue = true;
+                    }
+                }
+            }
+            sb.append(c);
+        }
+        return record;
+    }
+
+    /**
+     * @return Count of ANVLRecord bytes. Be careful, an empty ANVLRecord is
+     * CRLFCRLF so is of size 4. Also, expensive, since it makes String of
+     * the record so it can count bytes.
+     */
+    public synchronized int getLength() {
+        int length = -1;
+        try {
+            length = getUTF8Bytes().length;
+        } catch (UnsupportedEncodingException e) {
+            throw new RuntimeException(e);
+        }
+        return length;
+    }
+
+    public static boolean isCROrLF(final char c) {
+        return isCR(c) || isLF(c);
+    }
+
+    public static boolean isCR(final char c) {
+        return c == ANVLRecord.CRLF.charAt(0);
+    }
+
+    public static boolean isLF(final char c) {
+        return c == ANVLRecord.CRLF.charAt(1);
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/anvl/Element.java b/src/main/java/org/archive/util/anvl/Element.java
new file mode 100644
index 00000000..5881fa9b
--- /dev/null
+++ b/src/main/java/org/archive/util/anvl/Element.java
@@ -0,0 +1,73 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.util.anvl;
+
+
+/**
+ * ANVL 'data element'.
+ * Holds either a single {@link Label}, or a {@link Label} followed by a
+ * {@link Value}.
+ *
+ * @author stack
+ * @see "A Name-Value Language (ANVL)"
+ */
+public class Element {
+    /** Index 0 is always the label; index 1, when present, is the value. */
+    private final SubElement [] subElements;
+
+    public Element(final Label l) {
+        this.subElements = new SubElement [] {l};
+    }
+
+    public Element(final Label l, final Value v) {
+        this.subElements = new SubElement [] {l, v};
+    }
+
+    /** @return true if this element carries a value as well as a label */
+    public boolean isValue() {
+        return subElements.length > 1;
+    }
+
+    public Label getLabel() {
+        return (Label) subElements[0];
+    }
+
+    /** @return the value, or null for a label-only element */
+    public Value getValue() {
+        return isValue() ? (Value) subElements[1] : null;
+    }
+
+    /**
+     * @return "label:" for a label-only element, "label: value" otherwise
+     */
+    @Override
+    public String toString() {
+        StringBuilder text = new StringBuilder();
+        // Label, then the colon that always follows it.
+        text.append(subElements[0].toString()).append(':');
+        if (isValue()) {
+            // A single space introduces the value.
+            text.append(' ').append(subElements[1].toString());
+        }
+        return text.toString();
+    }
+}
diff --git a/src/main/java/org/archive/util/anvl/Label.java b/src/main/java/org/archive/util/anvl/Label.java
new file mode 100644
index 00000000..fdadb735
--- /dev/null
+++ b/src/main/java/org/archive/util/anvl/Label.java
@@ -0,0 +1,41 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.util.anvl;
+
+/**
+ * ANVL label: a {@link SubElement} that additionally may not contain a colon.
+ */
+class Label extends SubElement {
+    public static final char COLON = ':';
+
+    @SuppressWarnings("unused")
+    private Label() {
+        this(null);
+    }
+
+    public Label(final String s) {
+        super(s);
+    }
+
+    /**
+     * Apply the inherited character checks, then reject the colon, which
+     * separates label from value.
+     */
+    @Override
+    protected void checkCharacter(char c, String srcStr, int index) {
+        super.checkCharacter(c, srcStr, index);
+        final boolean isColon = (c == COLON);
+        if (isColon) {
+            throw new IllegalArgumentException("Label cannot contain " + COLON);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/anvl/SubElement.java b/src/main/java/org/archive/util/anvl/SubElement.java
new file mode 100644
index 00000000..33b9e9bb
--- /dev/null
+++ b/src/main/java/org/archive/util/anvl/SubElement.java
@@ -0,0 +1,78 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.util.anvl;
+
+/**
+ * Abstract ANVL 'data element' sub-part.
+ * Subclass to make a Comment, a Label, or a Value.
+ * The text is validated once, at construction, and then held unchanged.
+ * @author stack
+ */
+abstract class SubElement {
+    /** The validated text, exactly as returned by baseCheck. */
+    private final String e;
+
+    protected SubElement() {
+        this(null);
+    }
+
+    public SubElement(final String s) {
+        this.e = baseCheck(s);
+    }
+
+    /**
+     * Validate the passed text character by character.
+     *
+     * @param s text to check
+     * @return the text to store
+     * @throws IllegalArgumentException if s is null or a character is rejected
+     */
+    protected String baseCheck(final String s) {
+        if (s == null) {
+            throw new IllegalArgumentException("Can't be null");
+        }
+        // Hand every character to the (overridable) per-character check.
+        final int length = s.length();
+        for (int position = 0; position < length; position++) {
+            checkCharacter(s.charAt(position), s, position);
+        }
+        return s;
+    }
+
+    /** Reject control characters first, then CR and LF. */
+    protected void checkCharacter(final char c, final String srcStr,
+            final int index) {
+        checkControlCharacter(c, srcStr, index);
+        checkCRLF(c, srcStr, index);
+    }
+
+    /**
+     * @throws IllegalArgumentException if c is a non-whitespace ISO control
+     * character or not a valid code point
+     */
+    protected void checkControlCharacter(final char c, final String srcStr,
+            final int index) {
+        final boolean nonWhitespaceControl =
+            Character.isISOControl(c) && !Character.isWhitespace(c);
+        if (nonWhitespaceControl || !Character.isValidCodePoint(c)) {
+            throw new IllegalArgumentException(srcStr +
+                " contains a control character(s) or invalid code point: 0x" +
+                Integer.toHexString(c));
+        }
+    }
+
+    /**
+     * @throws IllegalArgumentException if c is a CR or an LF
+     */
+    protected void checkCRLF(final char c, final String srcStr,
+            final int index) {
+        if (!ANVLRecord.isCROrLF(c)) {
+            return;
+        }
+        throw new IllegalArgumentException(srcStr +
+            " contains disallowed CRLF control character(s): 0x" +
+            Integer.toHexString(c));
+    }
+
+    @Override
+    public String toString() {
+        return e;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/anvl/Value.java b/src/main/java/org/archive/util/anvl/Value.java
new file
mode 100644
index 00000000..2a650ba2
--- /dev/null
+++ b/src/main/java/org/archive/util/anvl/Value.java
@@ -0,0 +1,71 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.util.anvl;
+
+/**
+ * ANVL value. The text is rewritten at construction so that any CR, LF or
+ * CRLF becomes a fold (CRLF plus one space) and whitespace directly after
+ * a fold is dropped.
+ *
+ * TODO: Values are currently stored folded, but perhaps they shouldn't be.
+ * Only when we serialize should we fold (But how to know where
+ * to fold?).
+ * @author stack
+ * @version $Date$ $Version$
+ */
+class Value extends SubElement {
+
+    // Both fields are written by baseCheck/checkCharacter, which the
+    // superclass constructor calls before this class's field initializers
+    // run. sb therefore must not have an initializer; folding's initializer
+    // only resets the flag after the rewrite has finished.
+    private StringBuilder sb;
+    private boolean folding = false;
+
+    @SuppressWarnings("unused")
+    private Value() {
+        this(null);
+    }
+
+    public Value(final String s) {
+        super(s);
+    }
+
+    /**
+     * Rewrite s with folding applied and return the rewritten text, which
+     * the superclass then stores. Throws NullPointerException for null s,
+     * since s.length() is read before the superclass null check runs.
+     */
+    protected String baseCheck(String s) {
+        this.sb = new StringBuilder(s.length() * 2);
+        super.baseCheck(s);
+        return sb.toString();
+    }
+
+    /**
+     * Check one character and append its folded form to the rewrite buffer.
+     * Unlike the superclass, CR and LF are accepted here.
+     */
+    @Override
+    protected void checkCharacter(char c, String srcStr, int index) {
+        checkControlCharacter(c, srcStr, index);
+        // Now, rewrite the value String with folding (if CR or LF or CRLF
+        // present).
+        if (ANVLRecord.isCR(c)) {
+            this.folding = true;
+            this.sb.append(ANVLRecord.FOLD_PREFIX);
+        } else if (ANVLRecord.isLF(c)) {
+            if (!this.folding) {
+                this.folding = true;
+                this.sb.append(ANVLRecord.FOLD_PREFIX);
+            } else {
+                // Previous character was a CR. Fold prefix has been added.
+            }
+        } else if (this.folding && Character.isWhitespace(c)) {
+            // Only write out one whitespace character. Skip.
+        } else {
+            this.folding = false;
+            this.sb.append(c);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/anvl/package.html b/src/main/java/org/archive/util/anvl/package.html
new file mode 100644
index 00000000..4a2a8963
--- /dev/null
+++ b/src/main/java/org/archive/util/anvl/package.html
@@ -0,0 +1,42 @@
+
+
+
+org.archive.util.anvl package
+
+
+Parsers and Writers for the (expired) Internet-Draft A Name-Value
+Language (ANVL). Use {@link org.archive.util.anvl.ANVLRecord}
+to create new instances of ANVL Records and for parsing.
+

Implementation Details

+

The ANVL Internet-Draft of 14 February, 2005 is unspecific as to the +definition of 'blank line' and 'newline'. This parser implementation +assumes CRNL. +

+

Says "An element consists of a label, a colon, and an optional value". +Should that be: "An element consists of a label and an optional value, or a +comment."?

+ +

The specification is unclear regarding CR or NL in a label or +comment (this implementation disallows CR or NL in labels but lets +them pass in comments).

+ +

A grammar would help. Here is RFC822: +

+     field       =  field-name ":" [ field-body ] CRLF
+     
+     field-name  =  1*<any CHAR, excluding CTLs, SPACE, and ":">
+     
+     field-body  =  field-body-contents
+                    [CRLF LWSP-char field-body]
+     
+     field-body-contents =
+                   <the ASCII characters making up the field-body, as
+                    defined in the following sections, and consisting
+                    of combinations of atom, quoted-string, and
+                    specials tokens, or else consisting of texts>
+
+

+ + From a732a9ee939fe44031fb6a493641598ca120b6dc Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Wed, 11 Dec 2013 16:03:57 +0000 Subject: [PATCH 26/27] Removed older PublicSuffix code. --- .../java/org/archive/url/PublicSuffixes.java | 363 ------------------ .../org/archive/url/PublicSuffixesTest.java | 193 ---------- 2 files changed, 556 deletions(-) delete mode 100644 src/main/java/org/archive/url/PublicSuffixes.java delete mode 100644 src/test/java/org/archive/url/PublicSuffixesTest.java diff --git a/src/main/java/org/archive/url/PublicSuffixes.java b/src/main/java/org/archive/url/PublicSuffixes.java deleted file mode 100644 index 7c3df6b8..00000000 --- a/src/main/java/org/archive/url/PublicSuffixes.java +++ /dev/null @@ -1,363 +0,0 @@ -/* - * This file is part of the Heritrix web crawler (crawler.archive.org). - * - * Licensed to the Internet Archive (IA) by one or more individual - * contributors. - * - * The IA licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.archive.url; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.FileInputStream; -import java.io.FileWriter; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.io.PrintWriter; -import java.io.UnsupportedEncodingException; -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.commons.io.IOUtils; -import org.archive.util.TextUtils; - -/** - * Utility class for making use of the information about 'public suffixes' at - * http://publicsuffix.org. - * - * The public suffix list (once known as 'effective TLDs') was motivated by the - * need to decide on which broader domains a subdomain was allowed to set - * cookies. For example, a server at 'www.example.com' can set cookies for - * 'www.example.com' or 'example.com' but not 'com'. 'www.example.co.uk' can set - * cookies for 'www.example.co.uk' or 'example.co.uk' but not 'co.uk' or 'uk'. - * The number of rules for all top-level-domains and 2nd- or 3rd- level domains - * has become quite long; essentially the broadest domain a subdomain may assign - * to is the one that was sold/registered to a specific name registrant. - * - * This concept should be useful in other contexts, too. Grouping URIs (or - * queues of URIs to crawl) together with others sharing the same registered - * suffix may be useful for applying the same rules to all, such as assigning - * them to the same queue or crawler in a multi- machine setup. - * - * As of Heritrix3, we prefer the term 'Assignment Level Domain' (ALD) - * for such domains, by analogy to 'Top Level Domain' (TLD) or '2nd Level - * Domain' (2LD), etc. - * - * @author Gojomo - * - * this version of PublicSuffixes uses suffix-tree data structure for generating less - * redundant regular expression. 
It may be even possible to write a light-weight, - * thread-safe matcher based on this class. - * @author Kenji Nagahashi - */ -public class PublicSuffixes { - protected static Pattern topmostAssignedSurtPrefixPattern; - protected static String topmostAssignedSurtPrefixRegex; - - /** - * prefix tree node. each Node represents sequence of letters (prefix) - * and alternative sequences following it (list of Node's). Nodes in - * {@code branches} are sorted for skip list like lookup and for generating - * effective regular expression (see {@link #compareTo(Node)} and {@link #compareTo(char).) - * - * as is intended for internal use only, there's no access methods. procedures for updating - * prefix tree with new input are defined within this class ({@link #addBranch(CharSequence)}). - * - * terminal node could be represented in two different form: 1) Node with zero branches, - * or 2) Node with zero-length {@code cs}. So, root node must be initialized with empty (not null) - * {@code branches} unless empty string matches the overall pattern. - * {@code cs} must not be null except for root node. 
- */ - public static class Node implements Comparable { - protected CharSequence cs; - protected List branches; - public Node() { - this("", null); - } - protected Node(CharSequence cs) { - this(cs, null); - } - protected Node(CharSequence cs, List branches) { - this.cs = cs; - this.branches = branches; - } - public void addBranch(CharSequence s) { - if (branches == null) { - branches = new ArrayList(); - branches.add(new Node("", null)); - } - for (int i = 0; i < branches.size(); i++) { - Node alt = branches.get(i); - if (alt.add(s)) return; - if (alt.compareTo(s.charAt(0)) > 0) { - Node alt1 = new Node(s, null); - branches.add(i, alt1); - return; - } - } - Node alt2 = new Node(s, null); - branches.add(alt2); - } - public boolean add(CharSequence s) { - int l = Math.min(s.length(), cs.length()); - int i = 0; - while (i < l && s.charAt(i) == cs.charAt(i)) - i++; - // zero-length match holds only when both cs and s are empty. - if (i == 0) return cs.length() == 0 && s.length() == 0; - if (i < cs.length()) { - CharSequence cs0 = cs.subSequence(0, i); - CharSequence cs1 = cs.subSequence(i, cs.length()); - CharSequence cs2 = s.subSequence(i, s.length()); - cs = cs0; - Node alt1 = new Node(cs1, branches); - (branches = new ArrayList()).add(alt1); - addBranch(cs2); - } else { - assert i == cs.length(); - addBranch(s.subSequence(i, s.length())); - } - return true; - } - public int compareTo(Node other) { - if (other.cs == null || other.cs.length() == 0) - return (cs == null || cs.length() == 0) ? 0 : -1; - return compareTo(other.cs.charAt(0)); - } - public int compareTo(char oc) { - if (cs == null || cs.length() == 0) return 1; - // '!' and '*' must come after ordinary letters, in this order, for regexp - // to work as intended. - char c = cs.charAt(0); - if (c == oc) return 0; - if (c == '!') return oc == '*' ? 
-1 : 1; - if (c == '*') return 1; - if (oc == '*' || oc == '!') return -1; - return Character.valueOf(c).compareTo(oc); - // for generating the same regexp as previous version. - //return Character.valueOf(oc).compareTo(c); - } - } - - /** - * Utility method for dumping a regex String, based on a published public - * suffix list, which matches any SURT-form hostname up through the broadest - * 'private' (assigned/sold) domain-segment. That is, for any of the - * SURT-form hostnames... - * - * com,example, com,example,www, com,example,california,www - * - * ...the regex will match 'com,example,'. - * - * @param args - * @throws IOException - */ - public static void main(String args[]) throws IOException { - InputStream is; - if (args.length == 0 || "=".equals(args[0])) { - // use bundled list - is = PublicSuffixes.class.getClassLoader().getResourceAsStream( - "effective_tld_names.dat"); - } else { - is = new FileInputStream(args[0]); - } - BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8")); - String regex = getTopmostAssignedSurtPrefixRegex(reader); - IOUtils.closeQuietly(is); - - boolean needsClose = false; - BufferedWriter writer; - if (args.length >= 2) { - // write to specified file - writer = new BufferedWriter(new FileWriter(args[1])); - needsClose = true; - } else { - // write to stdout - writer = new BufferedWriter(new OutputStreamWriter(System.out)); - } - writer.append(regex); - writer.flush(); - if (needsClose) { - writer.close(); - } - } - /** - * Reads a file of the format promulgated by publicsuffix.org, ignoring - * comments and '!' exceptions/notations, converting domain segments to - * SURT-ordering. Leaves glob-style '*' wildcarding in place. Returns root - * node of SURT-ordered prefix tree. - * - * @param reader - * @return root of prefix tree node. 
- * @throws IOException - */ - protected static Node readPublishedFileToSurtTrie(BufferedReader reader) throws IOException { - // initializing with empty Alt list prevents empty pattern from being - // created for the first addBranch() - Node alt = new Node(null, new ArrayList()); - String line; - while ((line = reader.readLine()) != null) { - // discard whitespace, empty lines, comments, exceptions - line = line.trim(); - if (line.length() == 0 || line.startsWith("//")) continue; - // discard utf8 notation after entry - line = line.split("\\s+")[0]; - // TODO: maybe we don't need to create lower-cased String - line = line.toLowerCase(); - // SURT-order domain segments - String[] segs = line.split("\\."); - StringBuilder sb = new StringBuilder(); - for (int i = segs.length - 1; i >= 0; i--) { - if (segs[i].length() == 0) continue; - sb.append(segs[i]).append(','); - } - alt.addBranch(sb.toString()); - } - return alt; - } - /** - * utility function for dumping prefix tree structure. intended for debug use. - * @param alt root of prefix tree. - * @param lv indent level. 0 for root (no indent). - * @param out writer to send output to. - */ - public static void dump(Node alt, int lv, PrintWriter out) { - for (int i = 0; i < lv; i++) - out.print(" "); - out.println(alt.cs != null ? ('"'+alt.cs.toString()+'"') : "(null)"); - if (alt.branches != null) { - for (Node br : alt.branches) { - dump(br, lv + 1, out); - } - } - } - /** - * bulids regular expression from prefix-tree {@code alt} into buffer {@code sb}. - * @param alt prefix tree root. - * @param sb StringBuffer to store regular expression. - */ - protected static void buildRegex(Node alt, StringBuilder sb) { - String close = null; - if (alt.cs != null) { - // actually '!' always be the first character, because it is - // always used along with '*'. 
- for (int i = 0; i < alt.cs.length(); i++) { - char c = alt.cs.charAt(i); - if (c == '!') { - if (close != null) - throw new RuntimeException("more than one '!'"); - sb.append("(?="); - close = ")"; - } else if (c == '*') { - sb.append("[-\\w]+"); - } else { - sb.append(c); - } - } - } - if (alt.branches != null) { - // alt.branches.size() should always be > 1 - if (alt.branches.size() > 1) { - sb.append("(?:"); - } - String sep = ""; - for (Node alt1 : alt.branches) { - sb.append(sep); sep = "|"; - buildRegex(alt1, sb); - } - if (alt.branches.size() > 1) { - sb.append(")"); - } - } - if (close != null) - sb.append(close); - } - - /** - * Converts SURT-ordered list of public prefixes into a Java regex which - * matches the public-portion "plus one" segment, giving the domain on which - * cookies can be set or other policy grouping should occur. Also adds to - * regex a fallback matcher that for any new/unknown TLDs assumes the - * second-level domain is assignable. (Eg: 'zzz,example,'). 
- * - * @param list - * @return - */ - private static String surtPrefixRegexFromTrie(Node trie) { - StringBuilder regex = new StringBuilder(); - regex.append("(?ix)^\n"); - trie.addBranch("*,"); // for new/unknown TLDs - buildRegex(trie, regex); - regex.append("\n([-\\w]+,)"); - return regex.toString(); - } - - public static synchronized Pattern getTopmostAssignedSurtPrefixPattern() { - if (topmostAssignedSurtPrefixPattern == null) { - topmostAssignedSurtPrefixPattern = Pattern - .compile(getTopmostAssignedSurtPrefixRegex()); - } - return topmostAssignedSurtPrefixPattern; - } - - public static synchronized String getTopmostAssignedSurtPrefixRegex() { - if (topmostAssignedSurtPrefixRegex == null) { - // use bundled list - try { - BufferedReader reader = new BufferedReader(new InputStreamReader( - PublicSuffixes.class.getClassLoader().getResourceAsStream( - "effective_tld_names.dat"), "UTF-8")); - topmostAssignedSurtPrefixRegex = getTopmostAssignedSurtPrefixRegex(reader); - IOUtils.closeQuietly(reader); - } catch (UnsupportedEncodingException ex) { - // should never happen - throw new RuntimeException(ex); - } - } - return topmostAssignedSurtPrefixRegex; - } - - public static String getTopmostAssignedSurtPrefixRegex(BufferedReader reader) { - try { - Node trie = readPublishedFileToSurtTrie(reader); - return surtPrefixRegexFromTrie(trie); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - /** - * Truncate SURT to its topmost assigned domain segment; that is, - * the public suffix plus one segment, but as a SURT-ordered prefix. - * - * if the pattern doesn't match, the passed-in SURT is returned. 
- * - * @param surt SURT to truncate - * @return truncated-to-topmost-assigned SURT prefix - */ - public static String reduceSurtToAssignmentLevel(String surt) { - Matcher matcher = TextUtils.getMatcher( - getTopmostAssignedSurtPrefixRegex(), surt); - if (matcher.find()) { - surt = matcher.group(); - } - TextUtils.recycleMatcher(matcher); - return surt; - } -} diff --git a/src/test/java/org/archive/url/PublicSuffixesTest.java b/src/test/java/org/archive/url/PublicSuffixesTest.java deleted file mode 100644 index e2bb288a..00000000 --- a/src/test/java/org/archive/url/PublicSuffixesTest.java +++ /dev/null @@ -1,193 +0,0 @@ -/* - * This file is part of the Heritrix web crawler (crawler.archive.org). - * - * Licensed to the Internet Archive (IA) by one or more individual - * contributors. - * - * The IA licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.archive.url; - -import java.io.PrintWriter; -import java.io.StringWriter; -import java.util.ArrayList; -import java.util.regex.Matcher; - -import junit.framework.TestCase; - -import org.archive.url.PublicSuffixes.Node; - -/** - * Test cases for PublicSuffixes utility. Confirm expected matches/nonmatches - * from constructed regex. 
- * - * @author gojomo - */ -public class PublicSuffixesTest extends TestCase { - // test of low level implementation - - public void testCompare() { - Node n = new Node("hoge"); - assertTrue(n.compareTo('a') > 0); - assertEquals(-1, n.compareTo('*')); - assertEquals(-1, n.compareTo('!')); - assertEquals(-1, n.compareTo(new Node("*,"))); - assertEquals(-1, n.compareTo(new Node("!muga,"))); - assertEquals(-1, n.compareTo(new Node(""))); - - n = new Node("*,"); - assertEquals(1, n.compareTo('a')); - assertEquals(0, n.compareTo('*')); - assertEquals(1, n.compareTo('!')); - assertEquals(0, n.compareTo(new Node("*,"))); - assertEquals(1, n.compareTo(new Node("!muga,"))); - assertEquals(-1, n.compareTo(new Node(""))); - - n = new Node("!hoge"); - assertEquals(1, n.compareTo('a')); - assertEquals(-1, n.compareTo('*')); - assertEquals(0, n.compareTo('!')); - assertEquals(-1, n.compareTo(new Node("*,"))); - assertEquals(0, n.compareTo(new Node("!muga,"))); - assertEquals(-1, n.compareTo(new Node(""))); - - n = new Node(""); - assertEquals(1, n.compareTo('a')); - assertEquals(1, n.compareTo('*')); - assertEquals(1, n.compareTo('!')); - assertEquals(0, n.compareTo(new Node(""))); - } - - protected String dump(Node alt) { - StringWriter w = new StringWriter(); - PublicSuffixes.dump(alt, 0, new PrintWriter(w)); - return w.toString(); - } - public void testTrie1() { - Node alt = new Node(null, new ArrayList()); - alt.addBranch("ac,"); - // specifically, should not have empty string as match. 
- assertEquals("(null)\n" + - " \"ac,\"\n", dump(alt)); - alt.addBranch("ac,com,"); - assertEquals("(null)\n" + - " \"ac,\"\n" + - " \"com,\"\n" + - " \"\"\n", dump(alt)); - alt.addBranch("ac,edu,"); - assertEquals("(null)\n" + - " \"ac,\"\n" + - " \"com,\"\n" + - " \"edu,\"\n" + - " \"\"\n", dump(alt)); - } - public void testTrie2() { - Node alt = new Node(null, new ArrayList()); - alt.addBranch("ac,"); - alt.addBranch("*,"); - assertEquals("(null)\n" + - " \"ac,\"\n" + - " \"*,\"\n", dump(alt)); - } - - public void testTrie3() { - Node alt = new Node(null, new ArrayList()); - alt.addBranch("ac,"); - alt.addBranch("ac,!hoge,"); - alt.addBranch("ac,*,"); - // exception goes first. - assertEquals("(null)\n" + - " \"ac,\"\n" + - " \"!hoge,\"\n" + - " \"*,\"\n" + - " \"\"\n", dump(alt)); - } - - // test of higher-level functionality - - Matcher m = PublicSuffixes.getTopmostAssignedSurtPrefixPattern() - .matcher(""); - - public void testBasics() { - matchPrefix("com,example,www,", "com,example,"); - matchPrefix("com,example,", "com,example,"); - matchPrefix("org,archive,www,", "org,archive,"); - matchPrefix("org,archive,", "org,archive,"); - matchPrefix("fr,yahoo,www,", "fr,yahoo,"); - matchPrefix("fr,yahoo,", "fr,yahoo,"); - matchPrefix("au,com,foobar,www,", "au,com,foobar,"); - matchPrefix("au,com,foobar,", "au,com,foobar,"); - matchPrefix("uk,co,virgin,www,", "uk,co,virgin,"); - matchPrefix("uk,co,virgin,", "uk,co,virgin,"); - matchPrefix("au,com,example,www,", "au,com,example,"); - matchPrefix("au,com,example,", "au,com,example,"); - matchPrefix("jp,tokyo,public,assigned,www,", - "jp,tokyo,public,assigned,"); - matchPrefix("jp,tokyo,public,assigned,", "jp,tokyo,public,assigned,"); - } - - public void testDomainWithDash() { - matchPrefix("de,bad-site,www", "de,bad-site,"); - } - - public void testDomainWithNumbers() { - matchPrefix("de,archive4u,www", "de,archive4u,"); - } - - public void testIPV4() { - assertEquals("unexpected reduction", - "1.2.3.4", - 
PublicSuffixes.reduceSurtToAssignmentLevel("1.2.3.4")); - } - - public void testIPV6() { - assertEquals("unexpected reduction", - "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]", - PublicSuffixes.reduceSurtToAssignmentLevel( - "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]")); - } - - public void testExceptions() { - matchPrefix("uk,bl,www,", "uk,bl,"); - matchPrefix("uk,bl,", "uk,bl,"); - matchPrefix("jp,tokyo,metro,subdomain,", "jp,tokyo,metro,"); - matchPrefix("jp,tokyo,metro,", "jp,tokyo,metro,"); - } - - public void testFakeTLD() { - // we assume any new/unknonwn TLD should be assumed as 2-level; - // this is preferable for our grouping purpose but might not be - // for a cookie-assigning browser (original purpose of publicsuffixlist) - matchPrefix("zzz,example,www,", "zzz,example,"); - } - - public void testUnsegmentedHostname() { - m.reset("example"); - assertFalse("unexpected match found in 'example'", m.find()); - } - - public void testTopmostAssignedCaching() { - assertSame("topmostAssignedSurtPrefixPattern not cached",PublicSuffixes.getTopmostAssignedSurtPrefixPattern(),PublicSuffixes.getTopmostAssignedSurtPrefixPattern()); - assertSame("topmostAssignedSurtPrefixRegex not cached",PublicSuffixes.getTopmostAssignedSurtPrefixRegex(),PublicSuffixes.getTopmostAssignedSurtPrefixRegex()); - } - - // TODO: test UTF domains? - - protected void matchPrefix(String surtDomain, String expectedAssignedPrefix) { - m.reset(surtDomain); - assertTrue("expected match not found in '" + surtDomain, m.find()); - assertEquals("expected match not found", expectedAssignedPrefix, m - .group()); - } -} From 7653cc0dbc5dc75167761a98de01bec79bf530bd Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Thu, 12 Dec 2013 09:44:14 +0000 Subject: [PATCH 27/27] Added link to parent project. 
--- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ae865f7e..9bd2e12a 100644 --- a/README.md +++ b/README.md @@ -3,4 +3,6 @@ OpenWayback Web Commons [![Build Status](https://travis-ci.org/iipc/iipc-web-commons.png?branch=master)](https://travis-ci.org/iipc/iipc-web-commons/) -This repository contains common utility code for the OpenWayback project. +This repository contains common utility code for the [OpenWayback][1] project. + +[1]: https://github.com/iipc/openwayback