From 28c299518367808b86d92713b4119bcfc7acd9c9 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 3 Oct 2013 16:21:11 -0700
Subject: [PATCH 01/27] ZipNum: fixes: cached locations: don't throw
RuntimeIOException if required (handled later), add a locCacheMaxDuration to
cache only if below threshold
---
.../format/gzip/zipnum/ZipNumBlockLoader.java | 7 +-
.../format/gzip/zipnum/ZipNumCluster.java | 69 +++++++++++--------
2 files changed, 44 insertions(+), 32 deletions(-)
diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java
index a1682818..91a822a2 100644
--- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java
+++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java
@@ -15,7 +15,6 @@
import org.archive.util.binsearch.impl.HTTPSeekableLineReader;
import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory;
import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory.HttpLibs;
-import org.archive.util.io.RuntimeIOException;
public class ZipNumBlockLoader {
@@ -169,9 +168,9 @@ public SeekableLineReader attemptLoadBlock(String location, long startOffset, in
currReader = null;
}
- if (isRequired) {
- throw new RuntimeIOException(io);
- }
+// if (isRequired) {
+// throw new RuntimeIOException(io);
+// }
}
return currReader;
diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
index 5e91c507..09e58064 100644
--- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
+++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
@@ -122,7 +122,15 @@ class BlockSize
protected boolean newIsDisabled = false;
protected boolean disabled = false;
- final static int DEFAULT_LOC_CACHE_EXPIRE_MILLIS = 5000;
+ //final static int DEFAULT_LOC_CACHE_EXPIRE_MILLIS = 120000;
+
+ protected ConcurrentHashMap locCacheMap;
+
+ protected boolean cacheRemoteLoc = false;
+
+ protected int locCacheExpireMillis = 120000;
+
+ protected int locCacheMaxDuration = 1000;
class LocCacheEntry
{
@@ -151,14 +159,7 @@ public boolean equals(Object obj)
return false;
}
- }
-
- protected ConcurrentHashMap locCacheMap;
-
- protected boolean cacheRemoteLoc = false;
-
- protected int locCacheExpireMillis = DEFAULT_LOC_CACHE_EXPIRE_MILLIS;
-
+ }
@Override
public void init() throws IOException
@@ -287,6 +288,14 @@ public void setLocCacheExpireMillis(int locCacheExpireMillis) {
this.locCacheExpireMillis = locCacheExpireMillis;
}
+ public int getLocCacheMaxDuration() {
+ return locCacheMaxDuration;
+ }
+
+ public void setLocCacheMaxDuration(int locCacheMaxDuration) {
+ this.locCacheMaxDuration = locCacheMaxDuration;
+ }
+
public boolean isCacheRemoteLoc() {
return cacheRemoteLoc;
}
@@ -525,25 +534,19 @@ SeekableLineReader doBlockLoad(String partId, long startOffset, int totalLength)
}
// Attempt cached load for http
- if (cacheRemoteLoc && (locCacheMap != null)) {
- // Non-http requests follow standard load path
- if ((locations.length > 0) && GeneralURIStreamFactory.isHttp(locations[0])) {
- reader = loadCachedBalancedReader(partId, locations, startOffset, totalLength);
- }
- }
-
- if (reader != null) {
- return reader;
- }
-
- for (String location : locations) {
- reader = blockLoader.attemptLoadBlock(location, startOffset, totalLength, true, isRequired());
- if (reader != null) {
- return reader;
+ if (cacheRemoteLoc && (locCacheMap != null) && (locations.length > 0) && GeneralURIStreamFactory.isHttp(locations[0])) {
+ reader = loadCachedBalancedReader(partId, locations, startOffset, totalLength);
+ } else {
+ // Standard block load path
+ for (String location : locations) {
+ reader = blockLoader.attemptLoadBlock(location, startOffset, totalLength, true, isRequired());
+ if (reader != null) {
+ return reader;
+ }
}
}
- return null;
+ return reader;
}
protected String locCacheGet(String key)
@@ -574,12 +577,18 @@ SeekableLineReader loadCachedBalancedReader(String partId, String[] locations, l
String cachedUrl = locCacheGet(partId);
if (cachedUrl != null) {
+ long start = System.currentTimeMillis();
+
reader = blockLoader.attemptLoadBlock(cachedUrl, offset, length, true, isRequired());
+ long duration = System.currentTimeMillis() - start;
+
+ if ((reader == null) || (duration > locCacheMaxDuration)) {
+ locCacheMap.remove(partId, cachedUrl);
+ }
+
if (reader != null) {
return reader;
- } else {
- locCacheMap.remove(partId, cachedUrl);
}
}
@@ -593,12 +602,16 @@ SeekableLineReader loadCachedBalancedReader(String partId, String[] locations, l
}
for (int index : indexs) {
+ long start = System.currentTimeMillis();
+
reader = blockLoader.attemptLoadBlock(locations[index], offset, length, true, isRequired());
+ long duration = System.currentTimeMillis() - start;
+
if (reader != null) {
String connectedUrl = ((HTTPSeekableLineReader)reader).getConnectedUrl();
- if (connectedUrl != null) {
+ if ((duration < locCacheMaxDuration) && (connectedUrl != null)) {
locCachePut(partId, connectedUrl);
}
From 41d3a58a0d03a4cac3b99f2fb05eebb7e569afab Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 3 Oct 2013 19:06:06 -0700
Subject: [PATCH 02/27] CDX: Add CloseableCompositeIterator which iterates in
sequence, optimization for zipnum clusters to be loaded sequentially when
only looking for last line
---
.../format/cdx/MultiCDXInputSource.java | 28 +++++++
.../format/gzip/zipnum/ZipNumParams.java | 9 +++
.../iterator/CloseableCompositeIterator.java | 73 +++++++++++++++++++
3 files changed, 110 insertions(+)
create mode 100644 src/main/java/org/archive/util/iterator/CloseableCompositeIterator.java
diff --git a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java
index 66367077..7f1ff002 100644
--- a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java
+++ b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java
@@ -8,6 +8,7 @@
import org.archive.format.gzip.zipnum.ZipNumIndex;
import org.archive.format.gzip.zipnum.ZipNumParams;
+import org.archive.util.iterator.CloseableCompositeIterator;
import org.archive.util.iterator.CloseableIterator;
import org.archive.util.iterator.SortedCompositeIterator;
@@ -70,9 +71,36 @@ public CloseableIterator getCDXIterator(String key, String prefix, boole
return scitr;
}
+ public CloseableIterator createSeqIterator(String key, String start, String end, ZipNumParams params)
+ {
+ CloseableCompositeIterator composite = new CloseableCompositeIterator();
+ CloseableIterator iter = null;
+
+ for (CDXInputSource cdxReader : cdx) {
+ try {
+ iter = cdxReader.getCDXIterator(key, start, end, params);
+
+ if (!params.isReverse()) {
+ composite.addLast(iter);
+ } else {
+ composite.addFirst(iter);
+ }
+
+ } catch (IOException io) {
+ LOGGER.warning(io.toString());
+ }
+ }
+
+ return composite;
+ }
+
public CloseableIterator getCDXIterator(String key, String start, String end, ZipNumParams params) throws IOException {
+ if (params.isSequential()) {
+ return this.createSeqIterator(key, start, end, params);
+ }
+
SortedCompositeIterator scitr = new SortedCompositeIterator(cdx.size(), params.isReverse() ? reverseComparator : comparator);
CloseableIterator iter = null;
diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumParams.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumParams.java
index 15e22e1d..668743ae 100644
--- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumParams.java
+++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumParams.java
@@ -6,6 +6,7 @@ public class ZipNumParams
protected int timestampDedupLength = 0;
protected int maxBlocks = 0;
private boolean reverse = false;
+ private boolean sequential = false;
public ZipNumParams()
{
@@ -56,4 +57,12 @@ public boolean isReverse() {
public void setReverse(boolean reverse) {
this.reverse = reverse;
}
+
+ public boolean isSequential() {
+ return sequential;
+ }
+
+ public void setSequential(boolean sequential) {
+ this.sequential = sequential;
+ }
}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/iterator/CloseableCompositeIterator.java b/src/main/java/org/archive/util/iterator/CloseableCompositeIterator.java
new file mode 100644
index 00000000..b9f632e2
--- /dev/null
+++ b/src/main/java/org/archive/util/iterator/CloseableCompositeIterator.java
@@ -0,0 +1,73 @@
+package org.archive.util.iterator;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.LinkedList;
+
+public class CloseableCompositeIterator implements CloseableIterator {
+
+ protected LinkedList> iters;
+ protected Iterator> iterPtr;
+ protected CloseableIterator currIter;
+
+ public CloseableCompositeIterator()
+ {
+ iters = new LinkedList>();
+ }
+
+ public void addFirst(CloseableIterator e)
+ {
+ iters.addFirst(e);
+ }
+
+ public void addLast(CloseableIterator e)
+ {
+ iters.addLast(e);
+ }
+
+ @Override
+ public boolean hasNext() {
+
+ if (iterPtr == null) {
+ iterPtr = iters.iterator();
+ currIter = iterPtr.next();
+ }
+
+ if (currIter == null) {
+ return false;
+ }
+
+ while (currIter != null) {
+ if (currIter.hasNext()) {
+ return true;
+ }
+
+ currIter = (iterPtr.hasNext() ? iterPtr.next() : null);
+ }
+
+ return false;
+ }
+
+ @Override
+ public E next() {
+ return currIter.next();
+ }
+
+ @Override
+ public void remove() {
+ currIter.remove();
+ }
+
+ @Override
+ public void close() throws IOException {
+ for (CloseableIterator e : iters) {
+ if (e != null) {
+ try {
+ e.close();
+ } catch (IOException io) {
+
+ }
+ }
+ }
+ }
+}
From b4f639de2493d5cf09dae449c9bab5e464a208eb Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sun, 6 Oct 2013 13:07:04 -0700
Subject: [PATCH 03/27] ZIP & SLR improvements: store connectedUrl immediately
on connection, print connectedUrl on error, skip cached url on fallback to
location list
---
.../format/gzip/zipnum/ZipNumBlockLoader.java | 12 +++++++++++-
.../archive/format/gzip/zipnum/ZipNumCluster.java | 5 +++++
.../java/org/archive/url/UrlSurtRangeComputer.java | 2 +-
.../util/binsearch/impl/http/ApacheHttp31SLR.java | 4 ++--
.../util/binsearch/impl/http/HTTPURLConnSLR.java | 3 +--
.../org/archive/util/io/RuntimeIOException.java | 14 +++++++++++++-
6 files changed, 33 insertions(+), 7 deletions(-)
diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java
index 91a822a2..2144bd30 100644
--- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java
+++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java
@@ -155,8 +155,18 @@ public SeekableLineReader attemptLoadBlock(String location, long startOffset, in
} catch (IOException io) {
Level level = (isRequired ? Level.SEVERE : Level.WARNING);
+ String actualLocation = null;
+
+ if (currReader instanceof HTTPSeekableLineReader) {
+ actualLocation = ((HTTPSeekableLineReader)currReader).getConnectedUrl();
+ }
+
+ if (actualLocation == null) {
+ actualLocation = location;
+ }
+
if (LOGGER.isLoggable(level)) {
- LOGGER.log(level, io.toString() + " -- -r " + startOffset + "-" + (startOffset + totalLength - 1) + " " + location + " req? " + isRequired);
+ LOGGER.log(level, io.toString() + " -- -r " + startOffset + "-" + (startOffset + totalLength - 1) + " " + actualLocation + " req? " + isRequired);
}
if (currReader != null) {
diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
index 09e58064..fbe88033 100644
--- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
+++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
@@ -602,6 +602,11 @@ SeekableLineReader loadCachedBalancedReader(String partId, String[] locations, l
}
for (int index : indexs) {
+ // Skip failed cached url
+ if (cachedUrl != null && locations[index].equals(cachedUrl)) {
+ continue;
+ }
+
long start = System.currentTimeMillis();
reader = blockLoader.attemptLoadBlock(locations[index], offset, length, true, isRequired());
diff --git a/src/main/java/org/archive/url/UrlSurtRangeComputer.java b/src/main/java/org/archive/url/UrlSurtRangeComputer.java
index 74057117..2b960e16 100644
--- a/src/main/java/org/archive/url/UrlSurtRangeComputer.java
+++ b/src/main/java/org/archive/url/UrlSurtRangeComputer.java
@@ -112,7 +112,7 @@ public String[] determineRange(String url, MatchType match, String from, String
return new String[]{startKey, endKey, host};
}
- protected String incLastChar(String input)
+ public static String incLastChar(String input)
{
StringBuilder sb = new StringBuilder(input);
sb.setCharAt(sb.length() - 1, (char)(sb.charAt(sb.length() - 1) + 1));
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
index 0857bfd6..e09c02f9 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
@@ -126,12 +126,12 @@ protected InputStream doSeekLoad(long offset, int maxLength) throws IOException
int code = http.executeMethod(activeMethod);
+ connectedUrl = activeMethod.getURI().toString();
+
if ((code != 206) && (code != 200)) {
throw new BadHttpStatusException(code, url + " " + rangeHeader);
}
- connectedUrl = activeMethod.getURI().toString();
-
InputStream is = activeMethod.getResponseBodyAsStream();
cin = new CountingInputStream(is);
return cin;
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java b/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java
index f21437f7..c811ef68 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java
@@ -76,13 +76,12 @@ protected InputStream doSeekLoad(long offset, int maxLength)
httpUrlConn.connect();
int code = httpUrlConn.getResponseCode();
+ connectedUrl = httpUrlConn.getURL().toString();
if ((code != 206) && (code != 200)) {
throw new BadHttpStatusException(code, url + " " + rangeHeader);
}
- connectedUrl = httpUrlConn.getURL().toString();
-
InputStream is = httpUrlConn.getInputStream();
cin = new CountingInputStream(is);
return cin;
diff --git a/src/main/java/org/archive/util/io/RuntimeIOException.java b/src/main/java/org/archive/util/io/RuntimeIOException.java
index b6efbf74..e93e5639 100644
--- a/src/main/java/org/archive/util/io/RuntimeIOException.java
+++ b/src/main/java/org/archive/util/io/RuntimeIOException.java
@@ -3,13 +3,25 @@
public class RuntimeIOException extends RuntimeException {
private static final long serialVersionUID = 4762025404760379497L;
+ private int status = 503;
+
public RuntimeIOException()
{
}
+ public RuntimeIOException(int status)
+ {
+ this.status = status;
+ }
+
public RuntimeIOException(Throwable cause)
{
super(cause);
- }
+ }
+
+ public int getStatus()
+ {
+ return status;
+ }
}
From 612cdebd1c5b359da57c3b83e818f4491b535e22 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Mon, 14 Oct 2013 09:36:50 -0700
Subject: [PATCH 04/27] ADD: CloseableIteratorWrapper utility class for
wrapping regular iterators FIX ZIPNUM: Flush cache, if any, when reloading
locations
---
.../format/gzip/zipnum/ZipNumCluster.java | 4 ++
.../iterator/CloseableIteratorWrapper.java | 42 +++++++++++++++++++
2 files changed, 46 insertions(+)
create mode 100644 src/main/java/org/archive/util/iterator/CloseableIteratorWrapper.java
diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
index fbe88033..70e21029 100644
--- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
+++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
@@ -243,6 +243,10 @@ protected void syncLoad(long newModTime)
locRoot = newLocRoot;
}
+ if (this.locCacheMap != null) {
+ locCacheMap.clear();
+ }
+
closeExistingFiles(filesToClose);
lastModTime = newModTime;
diff --git a/src/main/java/org/archive/util/iterator/CloseableIteratorWrapper.java b/src/main/java/org/archive/util/iterator/CloseableIteratorWrapper.java
new file mode 100644
index 00000000..f35c85e5
--- /dev/null
+++ b/src/main/java/org/archive/util/iterator/CloseableIteratorWrapper.java
@@ -0,0 +1,42 @@
+package org.archive.util.iterator;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+/**
+ * Wrap a regular Iterator to create a CloseableIterator where the close() is a no-op
+ * @author ilya
+ *
+ * @param <S> the element type returned by the wrapped iterator
+ */
+
+public class CloseableIteratorWrapper implements CloseableIterator
+{
+ protected Iterator iter;
+
+ public CloseableIteratorWrapper(Iterator iter)
+ {
+ this.iter = iter;
+ }
+
+ @Override
+ public boolean hasNext() {
+ return this.iter.hasNext();
+ }
+
+ @Override
+ public S next() {
+ return this.iter.next();
+ }
+
+ @Override
+ public void remove() {
+ this.iter.remove();
+
+ }
+
+ @Override
+ public void close() throws IOException {
+ //No Op
+ }
+}
\ No newline at end of file
From c5dbc67e6e0aad654658b04a9688179f97bfc980 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Mon, 14 Oct 2013 12:47:57 -0700
Subject: [PATCH 05/27] ZipNumBlockLoader: * add info on which shard failed to
RuntimeIOException * use connectedUrl in all exception messages *
attemptLoadBlock: use SEVERE only on last retry of required cluster which
will lead to a 503, use WARNING otherwise
---
.../format/gzip/zipnum/SummaryBlockIterator.java | 2 +-
.../archive/format/gzip/zipnum/ZipNumBlockLoader.java | 4 ----
.../org/archive/format/gzip/zipnum/ZipNumCluster.java | 10 +++++++---
.../util/binsearch/impl/http/ApacheHttp31SLR.java | 2 +-
.../util/binsearch/impl/http/HTTPURLConnSLR.java | 2 +-
.../java/org/archive/util/io/RuntimeIOException.java | 5 +++++
6 files changed, 15 insertions(+), 10 deletions(-)
diff --git a/src/main/java/org/archive/format/gzip/zipnum/SummaryBlockIterator.java b/src/main/java/org/archive/format/gzip/zipnum/SummaryBlockIterator.java
index 0046625c..cbf947f6 100644
--- a/src/main/java/org/archive/format/gzip/zipnum/SummaryBlockIterator.java
+++ b/src/main/java/org/archive/format/gzip/zipnum/SummaryBlockIterator.java
@@ -120,7 +120,7 @@ public CloseableIterator getNextInner() {
SeekableLineReader currReader = zipnumIndex.doBlockLoad(currPartId, startOffset, totalLength);
if ((currReader == null) && zipnumIndex.isRequired()) {
- throw new RuntimeIOException();
+ throw new RuntimeIOException("Failed to load shards for: " + currPartId);
}
if (currReader != null) {
diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java
index 2144bd30..9a6c459d 100644
--- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java
+++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java
@@ -177,10 +177,6 @@ public SeekableLineReader attemptLoadBlock(String location, long startOffset, in
}
currReader = null;
}
-
-// if (isRequired) {
-// throw new RuntimeIOException(io);
-// }
}
return currReader;
diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
index 70e21029..892dfbc0 100644
--- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
+++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
@@ -75,7 +75,7 @@ public void run() {
Thread.sleep(checkInterval);
if (summary != null) {
- summary.reloadFactory();
+ summary.reloadFactory();
}
}
@@ -583,7 +583,7 @@ SeekableLineReader loadCachedBalancedReader(String partId, String[] locations, l
if (cachedUrl != null) {
long start = System.currentTimeMillis();
- reader = blockLoader.attemptLoadBlock(cachedUrl, offset, length, true, isRequired());
+ reader = blockLoader.attemptLoadBlock(cachedUrl, offset, length, true, false);
long duration = System.currentTimeMillis() - start;
@@ -605,6 +605,8 @@ SeekableLineReader loadCachedBalancedReader(String partId, String[] locations, l
Collections.shuffle(indexs);
}
+ final int lastIndex = locations.length - 1;
+
for (int index : indexs) {
// Skip failed cached url
if (cachedUrl != null && locations[index].equals(cachedUrl)) {
@@ -613,7 +615,9 @@ SeekableLineReader loadCachedBalancedReader(String partId, String[] locations, l
long start = System.currentTimeMillis();
- reader = blockLoader.attemptLoadBlock(locations[index], offset, length, true, isRequired());
+ boolean required = (isRequired() && (index == lastIndex));
+
+ reader = blockLoader.attemptLoadBlock(locations[index], offset, length, true, required);
long duration = System.currentTimeMillis() - start;
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
index e09c02f9..5964e268 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
@@ -129,7 +129,7 @@ protected InputStream doSeekLoad(long offset, int maxLength) throws IOException
connectedUrl = activeMethod.getURI().toString();
if ((code != 206) && (code != 200)) {
- throw new BadHttpStatusException(code, url + " " + rangeHeader);
+ throw new BadHttpStatusException(code, connectedUrl + " " + rangeHeader);
}
InputStream is = activeMethod.getResponseBodyAsStream();
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java b/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java
index c811ef68..6d618e43 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/HTTPURLConnSLR.java
@@ -79,7 +79,7 @@ protected InputStream doSeekLoad(long offset, int maxLength)
connectedUrl = httpUrlConn.getURL().toString();
if ((code != 206) && (code != 200)) {
- throw new BadHttpStatusException(code, url + " " + rangeHeader);
+ throw new BadHttpStatusException(code, connectedUrl + " " + rangeHeader);
}
InputStream is = httpUrlConn.getInputStream();
diff --git a/src/main/java/org/archive/util/io/RuntimeIOException.java b/src/main/java/org/archive/util/io/RuntimeIOException.java
index e93e5639..9b1d4a1a 100644
--- a/src/main/java/org/archive/util/io/RuntimeIOException.java
+++ b/src/main/java/org/archive/util/io/RuntimeIOException.java
@@ -10,6 +10,11 @@ public RuntimeIOException()
}
+ public RuntimeIOException(String message)
+ {
+ super(message);
+ }
+
public RuntimeIOException(int status)
{
this.status = status;
From 1fc60191669b3e48da3b30fb595e1250228f9581 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Mon, 14 Oct 2013 14:57:26 -0700
Subject: [PATCH 06/27] ZIPNUMLoader: attempt better error msgs by propagating
full error in exception
---
.../archive/format/gzip/zipnum/ZipNumBlockLoader.java | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java
index 9a6c459d..2247eda4 100644
--- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java
+++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java
@@ -15,6 +15,7 @@
import org.archive.util.binsearch.impl.HTTPSeekableLineReader;
import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory;
import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory.HttpLibs;
+import org.archive.util.io.RuntimeIOException;
public class ZipNumBlockLoader {
@@ -165,8 +166,10 @@ public SeekableLineReader attemptLoadBlock(String location, long startOffset, in
actualLocation = location;
}
+ String msg = io.toString() + " -- -r " + startOffset + "-" + (startOffset + totalLength - 1) + " " + actualLocation;
+
if (LOGGER.isLoggable(level)) {
- LOGGER.log(level, io.toString() + " -- -r " + startOffset + "-" + (startOffset + totalLength - 1) + " " + actualLocation + " req? " + isRequired);
+ LOGGER.log(level, msg);
}
if (currReader != null) {
@@ -177,6 +180,10 @@ public SeekableLineReader attemptLoadBlock(String location, long startOffset, in
}
currReader = null;
}
+
+ if (isRequired) {
+ throw new RuntimeIOException(msg);
+ }
}
return currReader;
From 936fb4934b724d767c5338c9557431c1b49bb970 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Mon, 14 Oct 2013 17:45:02 -0700
Subject: [PATCH 07/27] FIX: ApacheHttp31SLR save the connected url even on
error! ZIPNUM: track 2nd attempt to load correctly
---
.../org/archive/format/gzip/zipnum/ZipNumCluster.java | 9 ++++++---
.../util/binsearch/impl/http/ApacheHttp31SLR.java | 1 +
2 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
index 892dfbc0..b30cf489 100644
--- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
+++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
@@ -607,15 +607,18 @@ SeekableLineReader loadCachedBalancedReader(String partId, String[] locations, l
final int lastIndex = locations.length - 1;
- for (int index : indexs) {
+ for (int i = 0; i < indexs.size(); i++) {
+
+ int index = indexs.get(i);
+
// Skip failed cached url
- if (cachedUrl != null && locations[index].equals(cachedUrl)) {
+ if ((cachedUrl != null) && locations[index].equals(cachedUrl)) {
continue;
}
long start = System.currentTimeMillis();
- boolean required = (isRequired() && (index == lastIndex));
+ boolean required = (isRequired() && (i == lastIndex));
reader = blockLoader.attemptLoadBlock(locations[index], offset, length, true, required);
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
index 5964e268..ad92f3cb 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
@@ -137,6 +137,7 @@ protected InputStream doSeekLoad(long offset, int maxLength) throws IOException
return cin;
} catch (IOException io) {
+ connectedUrl = activeMethod.getURI().toString();
doClose();
throw io;
}
From 04744417eb97c9856c33892cc87476052ceb1e65 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 17 Oct 2013 09:08:06 -0700
Subject: [PATCH 08/27] FIX: ApacheSLR: turn off cookies when using manual
cookie
---
.../org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
index ad92f3cb..0f2e102d 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
@@ -8,6 +8,7 @@
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
+import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.HeadMethod;
import org.apache.commons.io.input.CountingInputStream;
@@ -121,6 +122,7 @@ protected InputStream doSeekLoad(long offset, int maxLength) throws IOException
}
if (this.getCookie() != null) {
+ activeMethod.getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES);
activeMethod.setRequestHeader("Cookie", this.getCookie());
}
From f82aead0ae91c881485ba24a6694965e707c1c90 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 17 Oct 2013 10:53:59 -0700
Subject: [PATCH 09/27] HttpSLR: add optional error header which can be saved
RuntimeIOException: add status
---
.../binsearch/impl/HTTPSeekableLineReader.java | 18 ++++++++++++++++++
.../binsearch/impl/http/ApacheHttp31SLR.java | 4 ++++
.../archive/util/io/RuntimeIOException.java | 8 +++++++-
3 files changed, 29 insertions(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReader.java b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReader.java
index 63eab9b4..d686a5e2 100644
--- a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReader.java
+++ b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReader.java
@@ -32,6 +32,8 @@ public int getStatus()
protected boolean noKeepAlive;
protected String cookie;
protected String connectedUrl;
+ protected String errHeader;
+ protected String saveErrHeader;
public abstract String getUrl();
@@ -76,4 +78,20 @@ public String getConnectedUrl()
{
return connectedUrl;
}
+
+ public String getSaveErrHeader() {
+ return saveErrHeader;
+ }
+
+ public void setSaveErrHeader(String saveErrHeader) {
+ this.saveErrHeader = saveErrHeader;
+ }
+
+ public String getErrHeader() {
+ return errHeader;
+ }
+
+ public void setErrHeader(String errHeader) {
+ this.errHeader = errHeader;
+ }
}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
index 0f2e102d..c4fdbba8 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
@@ -139,6 +139,10 @@ protected InputStream doSeekLoad(long offset, int maxLength) throws IOException
return cin;
} catch (IOException io) {
+ if (saveErrHeader != null) {
+ errHeader = getHeaderValue(saveErrHeader);
+ }
+
connectedUrl = activeMethod.getURI().toString();
doClose();
throw io;
diff --git a/src/main/java/org/archive/util/io/RuntimeIOException.java b/src/main/java/org/archive/util/io/RuntimeIOException.java
index 9b1d4a1a..1d74f79c 100644
--- a/src/main/java/org/archive/util/io/RuntimeIOException.java
+++ b/src/main/java/org/archive/util/io/RuntimeIOException.java
@@ -23,7 +23,13 @@ public RuntimeIOException(int status)
public RuntimeIOException(Throwable cause)
{
super(cause);
- }
+ }
+
+ public RuntimeIOException(int status, Throwable cause)
+ {
+ super(cause);
+ this.status = status;
+ }
public int getStatus()
{
From 647adec66cd34bac20d2732671606f3e237a2bc5 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 17 Oct 2013 23:03:19 -0700
Subject: [PATCH 10/27] MultiCDXInputSource: Make comparator public
---
.../java/org/archive/format/cdx/MultiCDXInputSource.java | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java
index 7f1ff002..35bb9043 100644
--- a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java
+++ b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java
@@ -41,18 +41,22 @@ public void setCdxUris(List cdxUris) throws IOException {
}
- Comparator comparator = new Comparator() {
+ public final static Comparator defaultComparator = new Comparator() {
public int compare(String s1, String s2) {
return s1.compareTo(s2);
}
};
- Comparator reverseComparator = new Comparator() {
+ public final static Comparator defaultReverseComparator = new Comparator() {
public int compare(String s1, String s2) {
return -s1.compareTo(s2);
}
};
+ protected Comparator comparator = defaultComparator;
+ protected Comparator reverseComparator = defaultReverseComparator;
+
+
public CloseableIterator getCDXIterator(String key, String prefix, boolean exact, ZipNumParams params) throws IOException {
SortedCompositeIterator scitr = new SortedCompositeIterator(cdx.size(), params.isReverse() ? reverseComparator : comparator);
From 7d4fbf451ed3ed8e09b0eb077b95411928a98a3c Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Fri, 18 Oct 2013 13:10:58 -0700
Subject: [PATCH 11/27] FIX: MultiCDXInputSource: add optimization for output
lazy initing of sequential cluster load
---
.../format/cdx/MultiCDXInputSource.java | 80 ++++++++++++++++++-
1 file changed, 78 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java
index 35bb9043..aa44a887 100644
--- a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java
+++ b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java
@@ -75,14 +75,90 @@ public CloseableIterator getCDXIterator(String key, String prefix, boole
return scitr;
}
+    /**
+     * Iterator wrapper that postpones opening the underlying CDX iterator
+     * until the first call to {@link #hasNext()} or {@link #next()}.
+     * <p>
+     * If opening the source throws an {@link IOException}, the error is logged,
+     * the iterator behaves as empty, and the load is not attempted again.
+     */
+    protected static class LazyInitIterator implements CloseableIterator<String>
+    {
+        CDXInputSource source;
+        CloseableIterator<String> iter;
+
+        // Set once opening the source has thrown, so that a broken source is
+        // not re-queried on every hasNext()/next() call.
+        boolean failed = false;
+
+        String key, start, end;
+        ZipNumParams params;
+
+        /**
+         * Stores the query arguments; nothing is loaded until first use.
+         *
+         * @param source source to query lazily
+         * @param key    passed through to {@code source.getCDXIterator(...)}
+         * @param start  passed through to {@code source.getCDXIterator(...)}
+         * @param end    passed through to {@code source.getCDXIterator(...)}
+         * @param params passed through to {@code source.getCDXIterator(...)}
+         */
+        protected LazyInitIterator(CDXInputSource source, String key, String start, String end, ZipNumParams params)
+        {
+            this.key = key;
+            this.start = start;
+            this.end = end;
+
+            this.params = params;
+
+            this.source = source;
+        }
+
+        /**
+         * Opens the underlying iterator if it has not been opened yet and no
+         * earlier attempt has failed.
+         */
+        protected void initIter()
+        {
+            if (iter != null || failed) {
+                return;
+            }
+
+            try {
+                iter = source.getCDXIterator(key, start, end, params);
+            } catch (IOException io) {
+                LOGGER.warning(io.toString());
+                iter = null;
+                // Remember the failure; from now on this iterator reports itself as empty.
+                failed = true;
+            }
+        }
+
+        /** @return false if the source could not be opened, otherwise the underlying iterator's answer */
+        @Override
+        public boolean hasNext() {
+            initIter();
+
+            if (iter == null) {
+                return false;
+            }
+
+            return iter.hasNext();
+        }
+
+        /** @return the next line, or {@code null} if the source could not be opened */
+        @Override
+        public String next() {
+            initIter();
+
+            if (iter == null) {
+                return null;
+            }
+
+            return iter.next();
+        }
+
+        /** Removal is not supported; this is deliberately a silent no-op. */
+        @Override
+        public void remove() {
+
+        }
+
+        /** Closes the underlying iterator if it was ever opened. */
+        @Override
+        public void close() throws IOException {
+            if (iter != null) {
+                iter.close();
+            }
+        }
+    }
+
public CloseableIterator createSeqIterator(String key, String start, String end, ZipNumParams params)
{
CloseableCompositeIterator composite = new CloseableCompositeIterator();
CloseableIterator iter = null;
- for (CDXInputSource cdxReader : cdx) {
+ for (int i = 0; i < cdx.size(); i++) {
try {
- iter = cdxReader.getCDXIterator(key, start, end, params);
+ CDXInputSource cdxReader = cdx.get(i);
+
+ if (i == (cdx.size() - 1)) {
+ iter = cdxReader.getCDXIterator(key, start, end, params);
+ } else {
+ iter = new LazyInitIterator(cdxReader, key, start, end, params);
+ }
if (!params.isReverse()) {
composite.addLast(iter);
From 2d2d30e75b95717605580e7e48dd2dca430f8b0e Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Wed, 23 Oct 2013 16:51:47 -0700
Subject: [PATCH 12/27] FEATURE: add getTotalCount() to cdx input sources
---
src/main/java/org/archive/format/cdx/CDXFile.java | 6 ++++++
.../java/org/archive/format/cdx/CDXInputSource.java | 2 ++
.../org/archive/format/cdx/MultiCDXInputSource.java | 11 +++++++++++
.../org/archive/format/gzip/zipnum/ZipNumCluster.java | 6 +++++-
.../org/archive/format/gzip/zipnum/ZipNumIndex.java | 8 ++++++++
5 files changed, 32 insertions(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/format/cdx/CDXFile.java b/src/main/java/org/archive/format/cdx/CDXFile.java
index 0c3a777a..7dca0464 100644
--- a/src/main/java/org/archive/format/cdx/CDXFile.java
+++ b/src/main/java/org/archive/format/cdx/CDXFile.java
@@ -97,4 +97,10 @@ public static BufferedReader createStreamingLineReader(String uri, boolean gzipp
BufferedReader reader = new BufferedReader(new InputStreamReader(input));
return reader;
}
+
+ /**
+ * Placeholder: the line count is not computed for this source yet.
+ *
+ * @return always 0 until this is implemented
+ */
+ @Override
+ public long getTotalLines() {
+ //TODO: Implement
+ return 0;
+ }
}
diff --git a/src/main/java/org/archive/format/cdx/CDXInputSource.java b/src/main/java/org/archive/format/cdx/CDXInputSource.java
index 0a926ebc..34abde53 100644
--- a/src/main/java/org/archive/format/cdx/CDXInputSource.java
+++ b/src/main/java/org/archive/format/cdx/CDXInputSource.java
@@ -9,4 +9,6 @@ public interface CDXInputSource {
public CloseableIterator getCDXIterator(String key, String prefix, boolean exact, ZipNumParams params) throws IOException;
public CloseableIterator getCDXIterator(String key, String start, String startEndUrl, ZipNumParams params) throws IOException;
+
+ /**
+ * Reports the total number of lines in this input source.
+ *
+ * @return the total line count; implementations that do not track a count
+ * may return 0 (CDXFile currently does)
+ */
+ public long getTotalLines();
}
diff --git a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java
index aa44a887..cbf70c0e 100644
--- a/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java
+++ b/src/main/java/org/archive/format/cdx/MultiCDXInputSource.java
@@ -196,4 +196,15 @@ public CloseableIterator getCDXIterator(String key, String start, String
return scitr;
}
+
+    /**
+     * Adds up the line totals reported by every underlying CDX source.
+     *
+     * @return sum of {@code getTotalLines()} over all entries in {@code cdx}
+     */
+    @Override
+    public long getTotalLines() {
+        long total = 0;
+        final int count = cdx.size();
+
+        for (int i = 0; i < count; i++) {
+            total += cdx.get(i).getTotalLines();
+        }
+
+        return total;
+    }
}
diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
index b30cf489..bc773a58 100644
--- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
+++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
@@ -190,6 +190,7 @@ public void init() throws IOException
startDate = newStartDate;
endDate = newEndDate;
locRoot = newLocRoot;
+ this.cdxLinesTotalCount = computeTotalLines();
if (!disabled) {
this.loadLastBlockSizes(blockSizesFile);
@@ -241,6 +242,8 @@ protected void syncLoad(long newModTime)
endDate = newEndDate;
disabled = newIsDisabled;
locRoot = newLocRoot;
+
+ this.cdxLinesTotalCount = computeTotalLines();
}
if (this.locCacheMap != null) {
@@ -484,8 +487,9 @@ public long getLastBlockDiff(String startKey, int startPart, int endPart) {
return diff;
}
+
// Adjust from shorter blocks, if loaded
- public long getTotalLines()
+ public long computeTotalLines()
{
long numLines = 0;
diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumIndex.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumIndex.java
index 7860be36..ad8c9297 100644
--- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumIndex.java
+++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumIndex.java
@@ -29,6 +29,9 @@ public class ZipNumIndex implements CDXInputSource {
// Used only for reference / user info
protected int cdxLinesPerBlock = 3000;
+
+ protected long cdxLinesTotalCount = 0;
+
//protected HashMap locMap = null;
protected final static boolean DEFAULT_USE_NIO = true;
@@ -528,4 +531,9 @@ public boolean isRequired() {
public void setRequired(boolean required) {
this.required = required;
}
+
+ /**
+ * Returns the stored line total without recomputing it.
+ *
+ * @return the current value of {@code cdxLinesTotalCount}; it is 0 unless a
+ * subclass assigns it (ZipNumCluster sets it from computeTotalLines()
+ * in init() and syncLoad())
+ */
+ @Override
+ public long getTotalLines() {
+ return cdxLinesTotalCount;
+ }
}
From baf1ad8921efb472be893e9ee92d48e49e56229e Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 24 Oct 2013 11:19:22 -0700
Subject: [PATCH 13/27] log RuntimeIOException
---
.../org/archive/util/binsearch/SeekableLineReaderIterator.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/util/binsearch/SeekableLineReaderIterator.java b/src/main/java/org/archive/util/binsearch/SeekableLineReaderIterator.java
index ca443ad4..991553c8 100644
--- a/src/main/java/org/archive/util/binsearch/SeekableLineReaderIterator.java
+++ b/src/main/java/org/archive/util/binsearch/SeekableLineReaderIterator.java
@@ -26,7 +26,7 @@ public String getNextInner() {
next = slr.readLine();
} catch (IOException e) {
if (propagateException) {
- throw new RuntimeIOException();
+ throw new RuntimeIOException(e.toString());
}
}
}
From 154386c3d0c741a2750bb16e5720a72468b7407e Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Fri, 1 Nov 2013 14:58:19 -0700
Subject: [PATCH 14/27] add ThreadLocalHttpConnectionManager (from
heritrix-commons) Switch to use ThreadLocalHttpConnectionManager as default
for ApacheSLR!
---
.../impl/http/ApacheHttp31SLRFactory.java | 18 +-
.../io/ThreadLocalHttpConnectionManager.java | 291 ++++++++++++++++++
2 files changed, 301 insertions(+), 8 deletions(-)
create mode 100644 src/main/java/org/archive/util/io/ThreadLocalHttpConnectionManager.java
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
index 52e73a94..1f37b365 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
@@ -3,21 +3,21 @@
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
-import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
-import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
+import org.apache.commons.httpclient.HttpConnectionManager;
import org.apache.commons.httpclient.params.HttpClientParams;
import org.archive.util.binsearch.impl.HTTPSeekableLineReader;
import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory;
+import org.archive.util.httpclient.ThreadLocalHttpConnectionManager;
public class ApacheHttp31SLRFactory extends HTTPSeekableLineReaderFactory {
private final static Logger LOGGER = Logger.getLogger(ApacheHttp31SLRFactory.class.getName());
- private MultiThreadedHttpConnectionManager connectionManager = null;
+ private HttpConnectionManager connectionManager = null;
private HostConfiguration hostConfiguration = null;
private HttpClient http = null;
@@ -26,7 +26,8 @@ public ApacheHttp31SLRFactory(String uriString) {
}
public ApacheHttp31SLRFactory() {
- connectionManager = new MultiThreadedHttpConnectionManager();
+ //connectionManager = new MultiThreadedHttpConnectionManager();
+ connectionManager = new ThreadLocalHttpConnectionManager();
hostConfiguration = new HostConfiguration();
HttpClientParams params = new HttpClientParams();
http = new HttpClient(params,connectionManager);
@@ -35,15 +36,16 @@ public ApacheHttp31SLRFactory() {
public void close() throws IOException
{
- connectionManager.deleteClosedConnections();
+ //connectionManager.deleteClosedConnections();
+ connectionManager.closeIdleConnections(0);
}
@Override
public ApacheHttp31SLR get(String url) throws IOException {
- if (LOGGER.isLoggable(Level.FINEST)) {
- LOGGER.finest("Connections: " + connectionManager.getConnectionsInPool(hostConfiguration));
- }
+// if (LOGGER.isLoggable(Level.FINEST)) {
+// LOGGER.finest("Connections: " + connectionManager.getConnectionsInPool(hostConfiguration));
+// }
return new ApacheHttp31SLR(http, url);
}
diff --git a/src/main/java/org/archive/util/io/ThreadLocalHttpConnectionManager.java b/src/main/java/org/archive/util/io/ThreadLocalHttpConnectionManager.java
new file mode 100644
index 00000000..83555584
--- /dev/null
+++ b/src/main/java/org/archive/util/io/ThreadLocalHttpConnectionManager.java
@@ -0,0 +1,291 @@
+/**
+ * ====================================================================
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ====================================================================
+ *
+ */
+package org.archive.util.io;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.commons.httpclient.HostConfiguration;
+import org.apache.commons.httpclient.HttpConnection;
+import org.apache.commons.httpclient.HttpConnectionManager;
+import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
+
+/**
+ * A simple, but thread-safe HttpClient {@link HttpConnectionManager}.
+ * Based on {@link org.apache.commons.httpclient.SimpleHttpConnectionManager}.
+ *
+ * Java >= 1.4 is recommended.
+ *
+ * @author Christian Kohlschuetter
+ */
+public final class ThreadLocalHttpConnectionManager implements
+ HttpConnectionManager {
+
+ private static final CloserThread closer = new CloserThread();
+ private static final Logger logger = Logger
+ .getLogger(ThreadLocalHttpConnectionManager.class.getName());
+
+ private final ThreadLocal tl = new ThreadLocal() {
+ protected synchronized ConnectionInfo initialValue() {
+ return new ConnectionInfo();
+ }
+ };
+
+ private ConnectionInfo getConnectionInfo() {
+ return (ConnectionInfo) tl.get();
+ }
+
+ private static final class ConnectionInfo {
+ /** The http connection */
+ private HttpConnection conn = null;
+
+ /**
+ * The time the connection was made idle.
+ */
+ private long idleStartTime = Long.MAX_VALUE;
+ }
+
+ public ThreadLocalHttpConnectionManager() {
+ }
+
+ /**
+ * Since the same connection is about to be reused, make sure the
+ * previous request was completely processed, and if not
+ * consume it now. The connection's last-response stream reference is
+ * cleared before the stream is closed.
+ * @param conn The connection
+ * @return true only if a pending response stream existed and was closed
+ * without error; false if closing failed or if no response stream
+ * was pending
+ */
+ private static boolean finishLastResponse(final HttpConnection conn) {
+ InputStream lastResponse = conn.getLastResponseInputStream();
+ if(lastResponse != null) {
+ conn.setLastResponseInputStream(null);
+ try {
+ lastResponse.close();
+ return true;
+ } catch (IOException ioe) {
+ // force reconnect.
+ return false;
+ }
+ } else {
+ // NOTE(review): "nothing pending" is reported as not reusable, so
+ // getConnectionWithTimeout() replaces the connection in this case
+ // instead of reusing it -- confirm this is intended.
+ return false;
+ }
+ }
+
+ /**
+ * Collection of parameters associated with this connection manager.
+ */
+ private HttpConnectionManagerParams params = new HttpConnectionManagerParams();
+
+ /**
+ * @see HttpConnectionManager#getConnection(HostConfiguration)
+ */
+ public HttpConnection getConnection(
+ final HostConfiguration hostConfiguration) {
+ return getConnection(hostConfiguration, 0);
+ }
+
+ /**
+ * Gets the staleCheckingEnabled value to be set on HttpConnections that are created.
+ *
+ * @return true if stale checking will be enabled on HttpConnections
+ *
+ * @see HttpConnection#isStaleCheckingEnabled()
+ *
+ * @deprecated Use {@link HttpConnectionManagerParams#isStaleCheckingEnabled()},
+ * {@link HttpConnectionManager#getParams()}.
+ */
+ public boolean isConnectionStaleCheckingEnabled() {
+ return this.params.isStaleCheckingEnabled();
+ }
+
+ /**
+ * Sets the staleCheckingEnabled value to be set on HttpConnections that are created.
+ *
+ * @param connectionStaleCheckingEnabled true if stale checking will be enabled
+ * on HttpConnections
+ *
+ * @see HttpConnection#setStaleCheckingEnabled(boolean)
+ *
+ * @deprecated Use {@link HttpConnectionManagerParams#setStaleCheckingEnabled(boolean)},
+ * {@link HttpConnectionManager#getParams()}.
+ */
+ public void setConnectionStaleCheckingEnabled(
+ final boolean connectionStaleCheckingEnabled) {
+ this.params.setStaleCheckingEnabled(connectionStaleCheckingEnabled);
+ }
+
+ /**
+ * Returns the calling thread's connection, creating a fresh one when the
+ * thread has none yet, when finishLastResponse() reports the current one
+ * as not reusable, or when its host or proxy differs from
+ * {@code hostConfiguration}. A replaced connection that is still open is
+ * handed to the closer thread rather than closed here.
+ *
+ * @param hostConfiguration target host/proxy settings for the connection
+ * @param timeout not used by this implementation
+ * @return the connection now associated with the calling thread
+ *
+ * @see HttpConnectionManager#getConnectionWithTimeout(HostConfiguration, long)
+ *
+ * @since 3.0
+ */
+ public HttpConnection getConnectionWithTimeout(
+ final HostConfiguration hostConfiguration, final long timeout) {
+
+ final ConnectionInfo ci = getConnectionInfo();
+ HttpConnection httpConnection = ci.conn;
+
+ // make sure the host and proxy are correct for this connection
+ // close it and set the values if they are not
+ if(httpConnection == null || !finishLastResponse(httpConnection)
+ || !hostConfiguration.hostEquals(httpConnection)
+ || !hostConfiguration.proxyEquals(httpConnection)) {
+
+ // closing is deferred to the background closer thread
+ if(httpConnection != null && httpConnection.isOpen()) {
+ closer.closeConnection(httpConnection);
+ }
+
+ httpConnection = new HttpConnection(hostConfiguration);
+ httpConnection.setHttpConnectionManager(this);
+ httpConnection.getParams().setDefaults(this.params);
+ ci.conn = httpConnection;
+
+ httpConnection.setHost(hostConfiguration.getHost());
+ httpConnection.setPort(hostConfiguration.getPort());
+ httpConnection.setProtocol(hostConfiguration.getProtocol());
+ httpConnection.setLocalAddress(hostConfiguration.getLocalAddress());
+
+ httpConnection.setProxyHost(hostConfiguration.getProxyHost());
+ httpConnection.setProxyPort(hostConfiguration.getProxyPort());
+ }
+
+ // remove the connection from the timeout handler
+ // (Long.MAX_VALUE marks the connection as in use, i.e. not idle)
+ ci.idleStartTime = Long.MAX_VALUE;
+
+ return httpConnection;
+ }
+
+ /**
+ * @see HttpConnectionManager#getConnection(HostConfiguration, long)
+ *
+ * @deprecated Use #getConnectionWithTimeout(HostConfiguration, long)
+ */
+ public HttpConnection getConnection(
+ final HostConfiguration hostConfiguration, final long timeout) {
+ return getConnectionWithTimeout(hostConfiguration, timeout);
+ }
+
+ /**
+ * Marks the calling thread's connection as idle. Any pending response
+ * stream is closed via finishLastResponse() and the idle start time is
+ * recorded. The connection itself is not closed here.
+ *
+ * @param conn the connection being released; must be the connection
+ * currently held for the calling thread
+ * @throws IllegalStateException if {@code conn} is not the calling
+ * thread's current connection
+ *
+ * @see HttpConnectionManager#releaseConnection(org.apache.commons.httpclient.HttpConnection)
+ */
+ public void releaseConnection(final HttpConnection conn) {
+ final ConnectionInfo ci = getConnectionInfo();
+ HttpConnection httpConnection = ci.conn;
+
+ if(conn != httpConnection) {
+ throw new IllegalStateException(
+ "Unexpected release of an unknown connection.");
+ }
+
+ finishLastResponse(httpConnection);
+
+ // track the time the connection was made idle
+ ci.idleStartTime = System.currentTimeMillis();
+ }
+
+ /**
+ * Returns {@link HttpConnectionManagerParams parameters} associated
+ * with this connection manager.
+ *
+ * @since 2.1
+ *
+ * @see HttpConnectionManagerParams
+ */
+ public HttpConnectionManagerParams getParams() {
+ return this.params;
+ }
+
+ /**
+ * Assigns {@link HttpConnectionManagerParams parameters} for this
+ * connection manager.
+ *
+ * @since 2.1
+ *
+ * @see HttpConnectionManagerParams
+ */
+ public void setParams(final HttpConnectionManagerParams p) {
+ if(p == null) {
+ throw new IllegalArgumentException("Parameters may not be null");
+ }
+ this.params = p;
+ }
+
+ /**
+ * Closes the calling thread's connection if it has been idle for at
+ * least {@code idleTimeout} milliseconds. Only the ConnectionInfo of the
+ * calling thread is examined; connections held for other threads are
+ * not touched by this call.
+ *
+ * @param idleTimeout minimum idle time, in milliseconds, before the
+ * connection is closed
+ *
+ * @since 3.0
+ */
+ public void closeIdleConnections(final long idleTimeout) {
+ long maxIdleTime = System.currentTimeMillis() - idleTimeout;
+
+ final ConnectionInfo ci = getConnectionInfo();
+
+ // idleStartTime is Long.MAX_VALUE while the connection is in use (or
+ // was never released), so this test is false in those cases
+ if(ci.idleStartTime <= maxIdleTime) {
+ ci.conn.close();
+ }
+ }
+
+    /**
+     * Background daemon thread that closes the connections handed to it via
+     * {@link #closeConnection(HttpConnection)}, so the submitting thread does
+     * not perform the close itself. Queued connections are closed in batches
+     * every {@link #SLEEP_INTERVAL} milliseconds.
+     */
+    private static final class CloserThread extends Thread {
+        // Dedicated monitor. The list reference below is swapped in run(), so
+        // locking on the list itself would let two threads hold different
+        // monitors while mutating the same ArrayList.
+        private final Object lock = new Object();
+
+        // Connections waiting to be closed; guarded by lock.
+        private List<HttpConnection> connections
+            = new ArrayList<HttpConnection>();
+
+        private static final int SLEEP_INTERVAL = 5000;
+
+        public CloserThread() {
+            super("HttpConnection closer");
+            // Make this a daemon thread so it can't be responsible for the JVM
+            // not shutting down.
+            setDaemon(true);
+            start();
+        }
+
+        /**
+         * Queues a connection to be closed on the next pass of this thread.
+         *
+         * @param conn the connection to close
+         */
+        public void closeConnection(final HttpConnection conn) {
+            synchronized (lock) {
+                connections.add(conn);
+            }
+        }
+
+        /**
+         * Sleeps, then takes the queued batch and closes it; repeats until
+         * the thread is interrupted.
+         */
+        public void run() {
+            try {
+                while (!Thread.interrupted()) {
+                    Thread.sleep(SLEEP_INTERVAL);
+
+                    final List<HttpConnection> pending;
+                    synchronized (lock) {
+                        if (connections.isEmpty()) {
+                            // Nothing queued: skip the swap and the log line.
+                            continue;
+                        }
+                        // Take ownership of the batch so closing happens outside the lock.
+                        pending = connections;
+                        connections = new ArrayList<HttpConnection>();
+                    }
+                    logger.log(Level.INFO, "Closing " + pending.size()
+                            + " HttpConnections");
+                    for (final HttpConnection conn : pending) {
+                        conn.close();
+                        conn.setHttpConnectionManager(null);
+                    }
+                }
+            } catch (InterruptedException e) {
+                return;
+            }
+        }
+    }
+}
From cd4310101050d10e00053f66aed0740b0f46c315 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Fri, 1 Nov 2013 22:03:34 +0000
Subject: [PATCH 15/27] fix path for ThreadLocalHttpConnectionManager
---
.../util/{io => httpclient}/ThreadLocalHttpConnectionManager.java | 0
1 file changed, 0 insertions(+), 0 deletions(-)
rename src/main/java/org/archive/util/{io => httpclient}/ThreadLocalHttpConnectionManager.java (100%)
diff --git a/src/main/java/org/archive/util/io/ThreadLocalHttpConnectionManager.java b/src/main/java/org/archive/util/httpclient/ThreadLocalHttpConnectionManager.java
similarity index 100%
rename from src/main/java/org/archive/util/io/ThreadLocalHttpConnectionManager.java
rename to src/main/java/org/archive/util/httpclient/ThreadLocalHttpConnectionManager.java
From 1c58cba6d8d210028187dd089528af72fec77d23 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Fri, 1 Nov 2013 22:05:56 +0000
Subject: [PATCH 16/27] fix package
---
.../util/httpclient/ThreadLocalHttpConnectionManager.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/util/httpclient/ThreadLocalHttpConnectionManager.java b/src/main/java/org/archive/util/httpclient/ThreadLocalHttpConnectionManager.java
index 83555584..d3101e5f 100644
--- a/src/main/java/org/archive/util/httpclient/ThreadLocalHttpConnectionManager.java
+++ b/src/main/java/org/archive/util/httpclient/ThreadLocalHttpConnectionManager.java
@@ -15,7 +15,7 @@
* ====================================================================
*
*/
-package org.archive.util.io;
+package org.archive.util.httpclient;
import java.io.IOException;
import java.io.InputStream;
From da379775ff2f3482c83fe730f89e6d18ab813e1c Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Fri, 1 Nov 2013 15:48:14 -0700
Subject: [PATCH 17/27] move ThreadLocalHttpConnectionManager to original
package org.archive.httpclient
---
.../{util => }/httpclient/ThreadLocalHttpConnectionManager.java | 2 +-
.../util/binsearch/impl/http/ApacheHttp31SLRFactory.java | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
rename src/main/java/org/archive/{util => }/httpclient/ThreadLocalHttpConnectionManager.java (99%)
diff --git a/src/main/java/org/archive/util/httpclient/ThreadLocalHttpConnectionManager.java b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java
similarity index 99%
rename from src/main/java/org/archive/util/httpclient/ThreadLocalHttpConnectionManager.java
rename to src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java
index d3101e5f..91e850ea 100644
--- a/src/main/java/org/archive/util/httpclient/ThreadLocalHttpConnectionManager.java
+++ b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java
@@ -15,7 +15,7 @@
* ====================================================================
*
*/
-package org.archive.util.httpclient;
+package org.archive.httpclient;
import java.io.IOException;
import java.io.InputStream;
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
index 1f37b365..cffd0ebf 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
@@ -10,9 +10,9 @@
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpConnectionManager;
import org.apache.commons.httpclient.params.HttpClientParams;
+import org.archive.httpclient.ThreadLocalHttpConnectionManager;
import org.archive.util.binsearch.impl.HTTPSeekableLineReader;
import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory;
-import org.archive.util.httpclient.ThreadLocalHttpConnectionManager;
public class ApacheHttp31SLRFactory extends HTTPSeekableLineReaderFactory {
private final static Logger LOGGER = Logger.getLogger(ApacheHttp31SLRFactory.class.getName());
From 68dab84a2d6e2a250055fb2847e7645d9555235f Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Fri, 1 Nov 2013 18:10:15 -0700
Subject: [PATCH 18/27] FEATURE: add support for httpcore 4.3 for
SeekableLineReader!
---
pom.xml | 7 +-
.../impl/HTTPSeekableLineReaderFactory.java | 6 +
.../impl/http/ApacheHttp31SLRFactory.java | 6 +-
.../binsearch/impl/http/ApacheHttp43SLR.java | 180 ++++++++++++++++++
.../impl/http/ApacheHttp43SLRFactory.java | 100 ++++++++++
5 files changed, 295 insertions(+), 4 deletions(-)
create mode 100644 src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java
create mode 100644 src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLRFactory.java
diff --git a/pom.xml b/pom.xml
index db1efecc..03b1240d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -118,7 +118,12 @@
mg4j1.0.1compile
-
+
+
+ org.apache.httpcomponents
+ httpcore
+ 4.3
+
diff --git a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java
index c1fa6fb6..b4a23db0 100644
--- a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java
+++ b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java
@@ -4,6 +4,7 @@
import org.archive.util.binsearch.SeekableLineReaderFactory;
import org.archive.util.binsearch.impl.http.ApacheHttp31SLRFactory;
+import org.archive.util.binsearch.impl.http.ApacheHttp43SLRFactory;
import org.archive.util.binsearch.impl.http.HTTPURLConnSLRFactory;
public abstract class HTTPSeekableLineReaderFactory implements SeekableLineReaderFactory {
@@ -20,6 +21,7 @@ protected HTTPSeekableLineReaderFactory()
public enum HttpLibs
{
APACHE_31,
+ APACHE_43,
URLCONN,
}
@@ -50,6 +52,10 @@ public static HTTPSeekableLineReaderFactory getHttpFactory(HttpLibs type, String
case URLCONN:
factory = new HTTPURLConnSLRFactory();
break;
+
+ case APACHE_43:
+ factory = new ApacheHttp43SLRFactory();
+ break;
}
if (factory == null) {
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
index cffd0ebf..9bd7542b 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
@@ -9,8 +9,8 @@
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpConnectionManager;
+import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.params.HttpClientParams;
-import org.archive.httpclient.ThreadLocalHttpConnectionManager;
import org.archive.util.binsearch.impl.HTTPSeekableLineReader;
import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory;
@@ -26,8 +26,8 @@ public ApacheHttp31SLRFactory(String uriString) {
}
public ApacheHttp31SLRFactory() {
- //connectionManager = new MultiThreadedHttpConnectionManager();
- connectionManager = new ThreadLocalHttpConnectionManager();
+ connectionManager = new MultiThreadedHttpConnectionManager();
+ //connectionManager = new ThreadLocalHttpConnectionManager();
hostConfiguration = new HostConfiguration();
HttpClientParams params = new HttpClientParams();
http = new HttpClient(params,connectionManager);
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java
new file mode 100644
index 00000000..85a460da
--- /dev/null
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java
@@ -0,0 +1,180 @@
+package org.archive.util.binsearch.impl.http;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.SocketAddress;
+import java.net.URL;
+
+import org.apache.http.Header;
+import org.apache.http.HttpException;
+import org.apache.http.HttpRequest;
+import org.apache.http.HttpResponse;
+import org.apache.http.HttpVersion;
+import org.apache.http.impl.DefaultBHttpClientConnection;
+import org.apache.http.message.BasicHttpRequest;
+import org.archive.util.binsearch.impl.HTTPSeekableLineReader;
+
+/**
+ * {@link HTTPSeekableLineReader} that talks HTTP over a socket it opens
+ * itself, using the HttpCore 4.3 blocking connection
+ * ({@link DefaultBHttpClientConnection}). A new socket is opened for every
+ * seek, and 3xx responses that carry a Location header are followed, up to
+ * {@link #MAX_REDIRECTS} times.
+ * <p>
+ * NOTE(review): only a plain {@link Socket} is created here, so https URLs
+ * are presumably not supported -- confirm callers never pass them.
+ */
+public class ApacheHttp43SLR extends HTTPSeekableLineReader {
+
+    private String urlString;
+
+    // Timeouts in milliseconds, handed to Socket.connect() and setSocketTimeout().
+    private int connectTimeout = 0;
+    private int readTimeout = 0;
+
+    private Socket socket = null;
+    private DefaultBHttpClientConnection activeConn = null;
+    private HttpResponse response = null;
+
+    private final static int BUFF_SIZE = 8192;
+
+    // Upper bound on followed redirects, so a redirect loop ends with an IOException.
+    private final static int MAX_REDIRECTS = 10;
+
+    /**
+     * @param url the URL to read from; both timeouts stay at 0
+     */
+    public ApacheHttp43SLR(String url)
+    {
+        urlString = url;
+    }
+
+    /**
+     * @param url            the URL to read from
+     * @param connectTimeout timeout passed to {@link Socket#connect(SocketAddress, int)}
+     * @param readTimeout    timeout passed to the connection's {@code setSocketTimeout}
+     */
+    public ApacheHttp43SLR(String url, int connectTimeout, int readTimeout)
+    {
+        this.urlString = url;
+        this.connectTimeout = connectTimeout;
+        this.readTimeout = readTimeout;
+    }
+
+    @Override
+    public String getUrl() {
+        return urlString;
+    }
+
+    /**
+     * @return the content length reported by the current response entity,
+     *         or 0 when no response is held
+     */
+    @Override
+    public long getSize() throws IOException {
+        if (response == null) {
+            return 0;
+        }
+
+        return response.getEntity().getContentLength();
+    }
+
+    /**
+     * @param headerName name of the response header to look up
+     * @return the value of the first matching header, or {@code null} when no
+     *         response is held or the header is absent
+     */
+    @Override
+    public String getHeaderValue(String headerName) {
+        if (response == null) {
+            return null;
+        }
+
+        Header header = response.getFirstHeader(headerName);
+        if (header == null) {
+            return null;
+        }
+
+        return header.getValue();
+    }
+
+    /**
+     * @return the explicit port of {@code url}, or the protocol's default
+     *         port when the URL does not specify one
+     */
+    protected static int getPort(URL url)
+    {
+        int port = url.getPort();
+
+        if (port > 0) {
+            return port;
+        }
+
+        return url.getDefaultPort();
+    }
+
+    /**
+     * Builds the request target for the request line. {@link URL#getFile()}
+     * is empty for a URL without a path and starts with '?' for a query-only
+     * URL; in both cases a leading "/" is needed to form a valid target.
+     */
+    private static String makeRequestTarget(URL url)
+    {
+        String file = url.getFile();
+
+        if (file.startsWith("/")) {
+            return file;
+        }
+
+        return "/" + file;
+    }
+
+    /**
+     * Builds the Host header value, appending the port when the URL names a
+     * port other than the protocol default.
+     */
+    private static String makeHostHeader(URL url)
+    {
+        int port = url.getPort();
+
+        if (port > 0 && port != url.getDefaultPort()) {
+            return url.getHost() + ":" + port;
+        }
+
+        return url.getHost();
+    }
+
+    /**
+     * Opens a socket to {@code url}, sends a GET (with a Range header when
+     * {@code makeRangeHeader} supplies one) and returns the response body
+     * stream. 3xx responses with a Location header are followed; the
+     * Location value is resolved against the URL that produced it, so
+     * relative redirects work.
+     *
+     * @param offset    passed to {@code makeRangeHeader}
+     * @param maxLength passed to {@code makeRangeHeader}
+     * @param url       the URL to request first
+     * @return the content stream of the 200/206 response
+     * @throws IOException on connection or protocol failure, on a status other
+     *         than 200/206, or after more than {@link #MAX_REDIRECTS}
+     *         redirects; the connection is closed before the exception
+     *         propagates
+     */
+    protected InputStream doSeekLoad(long offset, int maxLength, URL url)
+            throws IOException {
+
+        // URL of the request currently in flight; advances on each redirect.
+        URL current = url;
+
+        for (int redirects = 0; ; redirects++) {
+            try {
+                socket = new Socket();
+                SocketAddress endpoint = new InetSocketAddress(current.getHost(), getPort(current));
+                socket.connect(endpoint, connectTimeout);
+
+                activeConn = new DefaultBHttpClientConnection(BUFF_SIZE);
+                activeConn.bind(socket);
+                activeConn.setSocketTimeout(readTimeout);
+
+                HttpRequest request = new BasicHttpRequest("GET", makeRequestTarget(current), HttpVersion.HTTP_1_1);
+
+                String rangeHeader = makeRangeHeader(offset, maxLength);
+
+                if (rangeHeader != null) {
+                    request.setHeader("Range", rangeHeader);
+                }
+
+                if (this.isNoKeepAlive()) {
+                    request.setHeader("Connection", "close");
+                }
+
+                if (this.getCookie() != null) {
+                    request.setHeader("Cookie", this.getCookie());
+                }
+
+                request.setHeader("Accept", "*/*");
+                request.setHeader("Host", makeHostHeader(current));
+
+                activeConn.sendRequestHeader(request);
+                activeConn.flush();
+
+                response = activeConn.receiveResponseHeader();
+
+                int code = response.getStatusLine().getStatusCode();
+
+                connectedUrl = current.toString();
+
+                if (code > 300 && code < 400) {
+                    Header header = response.getFirstHeader("Location");
+
+                    if (header != null) {
+                        if (redirects >= MAX_REDIRECTS) {
+                            throw new IOException("Too many redirects: " + connectedUrl);
+                        }
+
+                        // Resolve against the current URL: Location may be relative.
+                        URL redirectURL = new URL(current, header.getValue());
+
+                        doClose();
+                        current = redirectURL;
+                        continue;
+                    }
+                    // No Location header: fall through to the bad-status check below.
+                }
+
+                if (code != 200 && code != 206) {
+                    throw new BadHttpStatusException(code, connectedUrl + " " + rangeHeader);
+                }
+
+                activeConn.receiveResponseEntity(response);
+
+                return response.getEntity().getContent();
+
+            } catch (HttpException e) {
+                doClose();
+                throw new IOException(e);
+
+            } catch (IOException io) {
+
+                // Capture the error header before doClose() drops the response.
+                if (saveErrHeader != null) {
+                    errHeader = getHeaderValue(saveErrHeader);
+                }
+
+                connectedUrl = current.toString();
+
+                doClose();
+                throw io;
+            }
+        }
+    }
+
+    /**
+     * Closes the active connection if there is one, otherwise the bare
+     * socket if there is one, and forgets the current response.
+     */
+    @Override
+    protected void doClose() throws IOException {
+        if (activeConn != null) {
+            activeConn.close();
+            activeConn = null;
+            socket = null;
+        } else if (socket != null) {
+            socket.close();
+            socket = null;
+        }
+        response = null;
+    }
+
+    /** Performs the seek against the URL this reader was created with. */
+    @Override
+    protected InputStream doSeekLoad(long offset, int maxLength)
+            throws IOException {
+
+        return doSeekLoad(offset, maxLength, new URL(urlString));
+    }
+}
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLRFactory.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLRFactory.java
new file mode 100644
index 00000000..5e3bb3ed
--- /dev/null
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLRFactory.java
@@ -0,0 +1,100 @@
+package org.archive.util.binsearch.impl.http;
+
+import java.io.IOException;
+
+import org.archive.util.binsearch.impl.HTTPSeekableLineReader;
+import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory;
+
+ public class ApacheHttp43SLRFactory extends HTTPSeekableLineReaderFactory {
+ // Timeouts in milliseconds, passed to each reader built by get(); both start at 0.
+ private int socketTimeoutMs;
+ private int connectTimeoutMs;
+
+ public ApacheHttp43SLRFactory()
+ {
+ // nothing to set up
+ }
+
+ @Override
+ public HTTPSeekableLineReader get(String url) throws IOException {
+ return new ApacheHttp43SLR(url, this.connectTimeoutMs, this.socketTimeoutMs);
+ }
+
+ @Override
+ public void close() throws IOException {
+ // No-op: this factory holds nothing that needs releasing.
+ }
+
+ @Override
+ public void setProxyHostPort(String hostPort) {
+ // No-op in this implementation: the value is ignored.
+
+ }
+
+ @Override
+ public void setMaxTotalConnections(int maxTotalConnections) {
+ // No-op in this implementation: the value is ignored.
+
+ }
+
+ @Override
+ public int getMaxTotalConnections() {
+ // Nothing is stored by the setter, so this is always 0.
+ return 0;
+ }
+
+ @Override
+ public void setMaxHostConnections(int maxHostConnections) {
+ // No-op in this implementation: the value is ignored.
+
+ }
+
+ @Override
+ public int getMaxHostConnections() {
+ // Nothing is stored by the setter, so this is always 0.
+ return 0;
+ }
+
+ @Override
+ public int getConnectionTimeoutMS() {
+ return this.connectTimeoutMs;
+ }
+
+ @Override
+ public void setConnectionTimeoutMS(int connectionTimeoutMS) {
+ this.connectTimeoutMs = connectionTimeoutMS;
+ // Only readers created by later get() calls see the new value.
+ }
+
+ @Override
+ public int getSocketTimeoutMS() {
+ return this.socketTimeoutMs;
+ }
+
+ @Override
+ public void setSocketTimeoutMS(int socketTimeoutMS) {
+ this.socketTimeoutMs = socketTimeoutMS;
+ }
+
+ @Override
+ public void setStaleChecking(boolean enabled) {
+ // No-op in this implementation: the flag is ignored.
+ }
+
+ @Override
+ public boolean isStaleChecking() {
+ // The setter stores nothing, so this is always false.
+ return false;
+ }
+
+ @Override
+ public long getModTime() {
+ // No modification time is tracked here; always 0.
+ return 0;
+ }
+
+ @Override
+ public void setNumRetries(int numRetries) {
+ // No-op in this implementation: the value is ignored.
+ }
+}
From 54bdd7da37c195cc1885a9e6e424a6f91555f1df Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 2 Nov 2013 10:47:35 -0700
Subject: [PATCH 19/27] apach43SLR: better reading of entire buffer
---
.../binsearch/impl/http/ApacheHttp43SLR.java | 40 +++++++++++++++++--
1 file changed, 37 insertions(+), 3 deletions(-)
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java
index 85a460da..ef206bb1 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp43SLR.java
@@ -1,5 +1,6 @@
package org.archive.util.binsearch.impl.http;
+import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetSocketAddress;
@@ -14,7 +15,9 @@
import org.apache.http.HttpVersion;
import org.apache.http.impl.DefaultBHttpClientConnection;
import org.apache.http.message.BasicHttpRequest;
+import org.apache.http.util.EntityUtils;
import org.archive.util.binsearch.impl.HTTPSeekableLineReader;
+import org.archive.util.zip.GZIPMembersInputStream;
public class ApacheHttp43SLR extends HTTPSeekableLineReader {
@@ -83,11 +86,10 @@ protected static int getPort(URL url)
protected InputStream doSeekLoad(long offset, int maxLength, URL url)
throws IOException {
- SocketAddress endpoint = null;
-
try {
+ SocketAddress endpoint = new InetSocketAddress(url.getHost(), getPort(url));
+
socket = new Socket();
- endpoint = new InetSocketAddress(url.getHost(), getPort(url));
socket.connect(endpoint, connectTimeout);
activeConn = new DefaultBHttpClientConnection(BUFF_SIZE);
@@ -104,6 +106,8 @@ protected InputStream doSeekLoad(long offset, int maxLength, URL url)
if (this.isNoKeepAlive()) {
request.setHeader("Connection", "close");
+ } else {
+ request.setHeader("Connection", "keep-alive");
}
if (this.getCookie() != null) {
@@ -157,6 +161,36 @@ protected InputStream doSeekLoad(long offset, int maxLength, URL url)
throw io;
}
}
+
+ @Override
+ public void seekWithMaxRead(long offset, boolean gzip, int maxLength) throws IOException
+ {
+ if (closed) {
+ throw new IOException("Seek after close()");
+ }
+ // Drop the old reader; presumably it wraps the stream being replaced - confirm in base class.
+ br = null;
+
+ try {
+ is = doSeekLoad(offset, maxLength); // keep the live stream: the unbuffered path reads from it
+
+ if (bufferFully && (maxLength > 0)) {
+ byte[] buffer = EntityUtils.toByteArray(response.getEntity());
+ // The whole body is now in memory, so the connection can be released right away.
+ doClose();
+
+ is = new ByteArrayInputStream(buffer);
+ }
+
+ if (gzip) {
+ is = new GZIPMembersInputStream(is, blockSize);
+ }
+
+ } catch (IOException io) {
+ doClose();
+ throw io;
+ }
+ }
@Override
protected void doClose() throws IOException {
From 1e8c1afaffc2378145f5171d6f4c4c2ac7f30c3b Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sun, 3 Nov 2013 18:38:29 -0800
Subject: [PATCH 20/27] canonicalizer: add ExtractRule and RewriteRule
---
.../java/org/archive/url/ExtractRule.java | 45 +++++++++++++++
.../java/org/archive/url/RewriteRule.java | 55 +++++++++++++++++++
.../org/archive/url/WaybackURLKeyMaker.java | 48 ++--------------
3 files changed, 104 insertions(+), 44 deletions(-)
create mode 100644 src/main/java/org/archive/url/ExtractRule.java
create mode 100644 src/main/java/org/archive/url/RewriteRule.java
diff --git a/src/main/java/org/archive/url/ExtractRule.java b/src/main/java/org/archive/url/ExtractRule.java
new file mode 100644
index 00000000..6d975b61
--- /dev/null
+++ b/src/main/java/org/archive/url/ExtractRule.java
@@ -0,0 +1,45 @@
+package org.archive.url;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+ public class ExtractRule // optional prefix filter plus a regex that is searched for in a URL
+ {
+ protected String startsWith; // when non-null, extract() rejects URLs lacking this prefix
+ protected String regex; // pattern source exactly as passed to setRegex()
+ // Compiled form of regex; stays null until setRegex() has been called.
+ protected Pattern regexPattern;
+
+ public String getStartsWith() {
+ return startsWith;
+ }
+ public void setStartsWith(String startsWith) {
+ this.startsWith = startsWith;
+ }
+ public String getRegex() {
+ return regex;
+ }
+ public void setRegex(String regex) {
+ regexPattern = Pattern.compile(regex); // compiled first: if this throws, neither field is updated
+ this.regex = regex;
+ }
+
+ public Matcher extract(String url)
+ {
+ if ((startsWith != null) && !url.startsWith(startsWith)) {
+ return null;
+ }
+
+ if (regexPattern == null) { // no regex has been set yet
+ return null;
+ }
+ // find() below searches anywhere in the URL; it is not a whole-string match.
+ Matcher match = regexPattern.matcher(url);
+
+ if (!match.find()) {
+ return null;
+ }
+ // The matcher is returned positioned on the first match; every failure path returns null.
+ return match;
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/url/RewriteRule.java b/src/main/java/org/archive/url/RewriteRule.java
new file mode 100644
index 00000000..47292686
--- /dev/null
+++ b/src/main/java/org/archive/url/RewriteRule.java
@@ -0,0 +1,55 @@
+package org.archive.url;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+ public class RewriteRule
+ {
+ protected String startsWith;
+ protected String regex;
+ protected String replace;
+ // Compiled counterpart of regex; assigned only by setRegex().
+ protected Pattern regexPattern;
+
+ // --- accessors ---
+ public String getStartsWith() { return this.startsWith; }
+ public String getRegex() { return this.regex; }
+ public String getReplace() { return this.replace; }
+
+ // --- mutators ---
+ public void setStartsWith(String startsWith) { this.startsWith = startsWith; }
+ public void setReplace(String replace) { this.replace = replace; }
+ public void setRegex(String regex) {
+ // Compile before storing so a bad pattern leaves the rule untouched.
+ this.regexPattern = Pattern.compile(regex);
+ this.regex = regex;
+ }
+
+ /**
+ * Rewrites the key held in sb, in place, when this rule applies.
+ * The rule applies only if the key has the startsWith prefix (when one
+ * is set), a regex and a replacement are both configured, and the whole
+ * key matches the regex (matches(), not find()).
+ * @return true only if the content of sb was replaced
+ */
+ public boolean rewrite(StringBuilder sb)
+ {
+ final String current = sb.toString();
+ final boolean prefixOk = (startsWith == null) || current.startsWith(startsWith);
+ if (!prefixOk || regexPattern == null || replace == null) {
+ return false;
+ }
+
+ final Matcher m = regexPattern.matcher(current);
+ if (!m.matches()) {
+ return false;
+ }
+
+ // Build the result first so sb is untouched if the replacement string is invalid.
+ final String rewritten = m.replaceAll(replace);
+ sb.setLength(0);
+ sb.append(rewritten);
+ return true;
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/url/WaybackURLKeyMaker.java b/src/main/java/org/archive/url/WaybackURLKeyMaker.java
index 23c67d06..99fb92e9 100644
--- a/src/main/java/org/archive/url/WaybackURLKeyMaker.java
+++ b/src/main/java/org/archive/url/WaybackURLKeyMaker.java
@@ -2,8 +2,6 @@
import java.net.URISyntaxException;
import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
public class WaybackURLKeyMaker implements URLKeyMaker {
// URLCanonicalizer canonicalizer = new NonMassagingIAURLCanonicalizer();
@@ -21,34 +19,6 @@ public void setCanonicalizer(URLCanonicalizer canonicalizer) {
protected List customRules;
- public static class RewriteRule
- {
- String startsWith;
- String regex;
- String replace;
- Pattern regexPattern;
-
- public String getStartsWith() {
- return startsWith;
- }
- public void setStartsWith(String startsWith) {
- this.startsWith = startsWith;
- }
- public String getRegex() {
- return regex;
- }
- public void setRegex(String regex) {
- regexPattern = Pattern.compile(regex);
- this.regex = regex;
- }
- public String getReplace() {
- return replace;
- }
- public void setReplace(String replace) {
- this.replace = replace;
- }
- }
-
public WaybackURLKeyMaker()
{
@@ -117,22 +87,12 @@ public void setCustomRules(List customRules) {
protected String applyCustomRules(String urlkey)
{
+ StringBuilder sb = new StringBuilder(urlkey);
+
for (RewriteRule rule : customRules) {
- if ((rule.startsWith != null) && !urlkey.startsWith(rule.startsWith)) {
- continue;
- }
-
- if (rule.regexPattern == null || rule.replace == null) {
- continue;
- }
-
- Matcher match = rule.regexPattern.matcher(urlkey);
-
- if (match.matches()) {
- urlkey = match.replaceAll(rule.replace);
- }
+ rule.rewrite(sb);
}
- return urlkey;
+ return sb.toString();
}
}
From 0ebbad2b2c71f1e4277105cb193985f0158b7739 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 16 Nov 2013 12:47:11 -0800
Subject: [PATCH 21/27] FIX: add cdxlinefactory doesn't require custom format
extractrule, check for empty string
---
.../archive/format/cdx/StandardCDXLineFactory.java | 11 +++++++++++
src/main/java/org/archive/url/ExtractRule.java | 2 +-
2 files changed, 12 insertions(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/format/cdx/StandardCDXLineFactory.java b/src/main/java/org/archive/format/cdx/StandardCDXLineFactory.java
index d2c299e5..33da41f1 100644
--- a/src/main/java/org/archive/format/cdx/StandardCDXLineFactory.java
+++ b/src/main/java/org/archive/format/cdx/StandardCDXLineFactory.java
@@ -39,6 +39,17 @@ public FieldSplitFormat getParseFormat()
return parseFormat;
}
+ public CDXLine createStandardCDXLine(String input)
+ {
+ if (parseFormat == cdx11) { // reference comparison against the known formats
+ return new CDX11Line(input, parseFormat);
+ }
+ if (parseFormat == cdx09) {
+ return new CDX09Line(input, parseFormat);
+ }
+ return new CDXLine(input, parseFormat); // any other format: generic line
+ }
+
public CDXLine createStandardCDXLine(String input, FieldSplitFormat exFormat)
{
if (parseFormat == cdx11) {
diff --git a/src/main/java/org/archive/url/ExtractRule.java b/src/main/java/org/archive/url/ExtractRule.java
index 6d975b61..bcfb3b2f 100644
--- a/src/main/java/org/archive/url/ExtractRule.java
+++ b/src/main/java/org/archive/url/ExtractRule.java
@@ -26,7 +26,7 @@ public void setRegex(String regex) {
public Matcher extract(String url)
{
- if ((startsWith != null) && !url.startsWith(startsWith)) {
+ if ((startsWith != null) && !startsWith.isEmpty() && !url.startsWith(startsWith)) {
return null;
}
From 5077ad8ece1d0d63948db8371e83f39045d9de2b Mon Sep 17 00:00:00 2001
From: Vinay Goel
Date: Sat, 23 Nov 2013 04:41:16 -0800
Subject: [PATCH 22/27] Extract outlinks/hopinfo from warc/metadata records
---
.../WARCMetadataRecordExtractorOutput.java | 150 ++++++++++++++++++
1 file changed, 150 insertions(+)
create mode 100644 src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java
diff --git a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java
new file mode 100644
index 00000000..0d564a6f
--- /dev/null
+++ b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java
@@ -0,0 +1,150 @@
+package org.archive.extract;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.net.MalformedURLException;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.util.List;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.archive.format.gzip.GZIPFormatException;
+import org.archive.format.json.JSONUtils;
+import org.archive.format.json.SimpleJSONPathSpec;
+import org.archive.resource.MetaData;
+import org.archive.resource.Resource;
+import org.archive.util.IAUtils;
+import org.archive.util.StreamCopy;
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+
+import com.google.common.io.CountingOutputStream;
+import com.google.common.io.NullOutputStream;
+
+public class WARCMetadataRecordExtractorOutput implements ExtractorOutput {
+ private static final Logger LOG =
+ Logger.getLogger(WARCMetadataRecordExtractorOutput.class.getName());
+
+ private PrintWriter out;
+ SimpleJSONPathSpec formatSpec = new SimpleJSONPathSpec("Envelope.Format");
+ SimpleJSONPathSpec warcURL = new SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.WARC-Target-URI");
+ SimpleJSONPathSpec warcDate = new SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.WARC-Date");
+ SimpleJSONPathSpec warcType = new SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.WARC-Type");
+ SimpleJSONPathSpec warcMetadataRecord = new SimpleJSONPathSpec("Envelope.Payload-Metadata.WARC-Metadata-Metadata.Metadata-Records");
+
+ private String outputType = "outlinks";
+
+ public WARCMetadataRecordExtractorOutput(PrintWriter out, String outputType) {
+ this.out = out;
+ this.outputType = outputType;
+ }
+
+ public WARCMetadataRecordExtractorOutput(PrintWriter out) {
+ this(out,"outlinks");
+ }
+
+ public void output(Resource resource) throws IOException {
+ NullOutputStream nullo = new NullOutputStream();
+ CountingOutputStream co = new CountingOutputStream(nullo);
+ try {
+ StreamCopy.copy(resource.getInputStream(), co);
+ } catch(GZIPFormatException e) {
+ e.printStackTrace();
+ return;
+ }
+ long bytes = co.getCount();
+ if(bytes > 0) {
+ LOG.info(bytes + " unconsumed bytes in Resource InputStream.");
+ }
+ try {
+ MetaData m = resource.getMetaData().getTopMetaData();
+ // URL DATE OURL MIME HTTP-CODE SHA1 META REDIR OFFSET LENGTH FILE
+ String format = getEnvelopeFormat(m);
+ String origUrl = "TBD";
+ String date = "TBD";
+ String canUrl = "TBD";
+
+ if(format.equals("WARC")) {
+ origUrl = getWARCURL(m);
+ date = getWARCDate(m);
+ String type = getWARCType(m);
+ if(type.equals("metadata")) {
+ String warcMetadataRecord = getWARCMetadataRecord(m);
+
+ JSONArray array = new JSONArray(warcMetadataRecord);
+ String viaUrl = "-";
+ String viaPath = "-";
+ String sourceTag = "-";
+ for(int i=0;i 2)
+ //'outlinks': 'origUrl date origOutlinkUrl linktype linktext'
+ out.format("%s\t%s\t%s\t%s\t\n",origUrl,date,linkParts[0],linkParts[2]);
+ }
+ } else if(outputType.equals("hopinfo")) {
+ String key = obj.get("Name").toString();
+ String value = obj.get("Value").toString();
+ if(key.equals("via")) {
+ viaUrl = value;
+ } else if (key.equals("hopsFromSeed")) {
+ viaPath = value;
+ } else if (key.equals("sourceTag")) {
+ sourceTag = value;
+ }
+ }
+ }
+ if(outputType.equals("hopinfo")) {
+ //'hopinfo': 'origCrawledUrl date origViaUrl hopPathFromVia sourceTag'
+ out.format("%s\t%s\t%s\t%s\t%s\n",origUrl,date,viaUrl,viaPath,sourceTag);
+ }
+ }
+ }
+
+ }
+ catch (Exception e) {
+ throw new IOException(e);
+ }
+ out.flush();
+ }
+
+ private String getEnvelopeFormat(MetaData m) {
+ return unwrapFirst(formatSpec.extract(m),"-");
+ }
+ private String getWARCURL(MetaData m) {
+ return unwrapFirst(warcURL.extract(m),"-");
+ }
+ private String getWARCDate(MetaData m) {
+ return unwrapFirst(warcDate.extract(m),"-");
+ }
+ private String getWARCType(MetaData m) {
+ return unwrapFirst(warcType.extract(m),"-");
+ }
+ private String getWARCMetadataRecord(MetaData m) {
+ return unwrapFirst(warcMetadataRecord.extract(m),"-");
+ }
+
+ /**
+ * Returns the first element of the first inner list when it is a non-empty
+ * string; otherwise (null or empty at any level) returns defaultValue.
+ * NOTE(review): element type inferred from usage (l.get(0).get(0) is a String) - confirm.
+ */
+ private String unwrapFirst(List<List<String>> l, String defaultValue) {
+ if (l == null || l.isEmpty()) {
+ return defaultValue;
+ }
+ List<String> first = l.get(0);
+ if (first == null || first.isEmpty()) {
+ return defaultValue;
+ }
+ String v = first.get(0);
+ // an empty string counts as missing
+ return (v != null && v.length() > 0) ? v : defaultValue;
+ }
+}
From 329ff22b0ab15656454ae682acb00b44beedc371 Mon Sep 17 00:00:00 2001
From: Vinay Goel
Date: Mon, 25 Nov 2013 19:10:07 +0000
Subject: [PATCH 23/27] reference pom.xml for building with CDH4
---
pom-cdh4.xml | 229 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 229 insertions(+)
create mode 100644 pom-cdh4.xml
diff --git a/pom-cdh4.xml b/pom-cdh4.xml
new file mode 100644
index 00000000..de19d8d0
--- /dev/null
+++ b/pom-cdh4.xml
@@ -0,0 +1,229 @@
+
+ 4.0.0
+
+ org.archive
+ ia-web-commons
+ 1.0-SNAPSHOT
+ jar
+
+ ia-web-commons
+ http://maven.apache.org
+
+
+ UTF-8
+ ${maven.build.timestamp}
+ yyyyMMddhhmmss
+
+
+
+
+ junit
+ junit
+ 3.8.1
+ test
+
+
+
+ com.google.guava
+ guava
+ 14.0.1
+
+
+
+ org.json
+ json
+ 20090211
+
+
+ org.htmlparser
+ htmlparser
+ 1.6
+
+
+
+ org.mozilla
+ juniversalchardet
+ 1.0.3
+
+
+
+ commons-httpclient
+ commons-httpclient
+ 3.1
+
+
+
+ org.apache.hadoop
+ hadoop-core
+ 2.0.0-mr1-cdh4.2.0
+
+
+ commons-httpclient
+ commons-httpclient
+
+
+ javax.servlet
+ servlet-api
+
+
+ javax.servlet.jsp
+ jsp-api
+
+
+ org.mortbay.jetty
+ jetty
+
+
+ org.mortbay.jetty
+ jetty-util
+
+
+ tomcat
+ jasper-runtime
+
+
+ tomcat
+ jasper-compiler
+
+
+
+
+ org.apache.hadoop
+ hadoop-common
+ 2.0.0-cdh4.2.0
+
+
+ org.apache.hadoop
+ hadoop-mapreduce-client-common
+ 2.0.0-cdh4.2.0
+
+
+ org.apache.hadoop
+ hadoop-mapreduce-client-core
+ 2.0.0-cdh4.2.0
+
+
+
+ org.apache.pig
+ pig
+ 0.11.1
+ provided
+
+
+
+ commons-lang
+ commons-lang
+ 2.5
+
+
+
+ commons-io
+ commons-io
+ 2.4
+
+
+
+ org.gnu.inet
+ libidn
+ 1.15
+
+
+ it.unimi.dsi
+ mg4j
+ 1.0.1
+ compile
+
+
+ org.apache.httpcomponents
+ httpcore
+ 4.3
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 2.3.2
+
+ 1.6
+ 1.6
+
+
+
+ maven-assembly-plugin
+ 2.4
+
+
+ jar-with-dependencies
+
+ ia-web-commons
+
+
+
+ package
+
+ single
+
+
+
+
+
+
+
+ src/main/resources
+ true
+
+
+
+
+
+
+ internetarchive
+ Internet Archive Maven Repository
+ http://builds.archive.org:8080/maven2
+ default
+
+
+ true
+ daily
+ warn
+
+
+ true
+ daily
+ warn
+
+
+
+
+ cloudera
+ Cloudera Hadoop
+ https://repository.cloudera.com/artifactory/cloudera-repos/
+ default
+
+
+ true
+ daily
+ warn
+
+
+ true
+ daily
+ warn
+
+
+
+
+
+
+
+ repository
+
+ ${repository.url}
+
+
+
+
From fc24be82f632abd7ca56337be77b7ee683368338 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Thu, 5 Dec 2013 16:13:18 -0800
Subject: [PATCH 24/27] moving org.archive.net.PublicSuffixes to ia-web-commons
---
.../java/org/archive/net/PublicSuffixes.java | 363 +
src/main/resources/effective_tld_names.dat | 7045 +++++++++++++++++
.../org/archive/net/PublicSuffixesTest.java | 193 +
3 files changed, 7601 insertions(+)
create mode 100644 src/main/java/org/archive/net/PublicSuffixes.java
create mode 100644 src/main/resources/effective_tld_names.dat
create mode 100644 src/test/java/org/archive/net/PublicSuffixesTest.java
diff --git a/src/main/java/org/archive/net/PublicSuffixes.java b/src/main/java/org/archive/net/PublicSuffixes.java
new file mode 100644
index 00000000..eab8081a
--- /dev/null
+++ b/src/main/java/org/archive/net/PublicSuffixes.java
@@ -0,0 +1,363 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.net;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileInputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.IOUtils;
+import org.archive.util.TextUtils;
+
+/**
+ * Utility class for making use of the information about 'public suffixes' at
+ * http://publicsuffix.org.
+ *
+ * The public suffix list (once known as 'effective TLDs') was motivated by the
+ * need to decide on which broader domains a subdomain was allowed to set
+ * cookies. For example, a server at 'www.example.com' can set cookies for
+ * 'www.example.com' or 'example.com' but not 'com'. 'www.example.co.uk' can set
+ * cookies for 'www.example.co.uk' or 'example.co.uk' but not 'co.uk' or 'uk'.
+ * The number of rules for all top-level-domains and 2nd- or 3rd- level domains
+ * has become quite long; essentially the broadest domain a subdomain may assign
+ * to is the one that was sold/registered to a specific name registrant.
+ *
+ * This concept should be useful in other contexts, too. Grouping URIs (or
+ * queues of URIs to crawl) together with others sharing the same registered
+ * suffix may be useful for applying the same rules to all, such as assigning
+ * them to the same queue or crawler in a multi- machine setup.
+ *
+ * As of Heritrix3, we prefer the term 'Assignment Level Domain' (ALD)
+ * for such domains, by analogy to 'Top Level Domain' (TLD) or '2nd Level
+ * Domain' (2LD), etc.
+ *
+ * @author Gojomo
+ *
+ * this version of PublicSuffixes uses suffix-tree data structure for generating less
+ * redundant regular expression. It may be even possible to write a light-weight,
+ * thread-safe matcher based on this class.
+ * @author Kenji Nagahashi
+ */
+public class PublicSuffixes {
+ protected static Pattern topmostAssignedSurtPrefixPattern;
+ protected static String topmostAssignedSurtPrefixRegex;
+
+ /**
+ * prefix tree node. each Node represents sequence of letters (prefix)
+ * and alternative sequences following it (list of Node's). Nodes in
+ * {@code branches} are sorted for skip list like lookup and for generating
+ * effective regular expression (see {@link #compareTo(Node)} and {@link #compareTo(char)}).
+ *
+ * as it is intended for internal use only, there are no accessor methods. procedures for updating
+ * prefix tree with new input are defined within this class ({@link #addBranch(CharSequence)}).
+ *
+ * terminal node could be represented in two different form: 1) Node with zero branches,
+ * or 2) Node with zero-length {@code cs}. So, root node must be initialized with empty (not null)
+ * {@code branches} unless empty string matches the overall pattern.
+ * {@code cs} must not be null except for root node.
+ */
+ public static class Node implements Comparable {
+ protected CharSequence cs;
+ protected List branches;
+ public Node() {
+ this("", null);
+ }
+ protected Node(CharSequence cs) {
+ this(cs, null);
+ }
+ protected Node(CharSequence cs, List branches) {
+ this.cs = cs;
+ this.branches = branches;
+ }
+ public void addBranch(CharSequence s) {
+ if (branches == null) {
+ branches = new ArrayList();
+ branches.add(new Node("", null));
+ }
+ for (int i = 0; i < branches.size(); i++) {
+ Node alt = branches.get(i);
+ if (alt.add(s)) return;
+ if (alt.compareTo(s.charAt(0)) > 0) {
+ Node alt1 = new Node(s, null);
+ branches.add(i, alt1);
+ return;
+ }
+ }
+ Node alt2 = new Node(s, null);
+ branches.add(alt2);
+ }
+ public boolean add(CharSequence s) {
+ int l = Math.min(s.length(), cs.length());
+ int i = 0;
+ while (i < l && s.charAt(i) == cs.charAt(i))
+ i++;
+ // zero-length match holds only when both cs and s are empty.
+ if (i == 0) return cs.length() == 0 && s.length() == 0;
+ if (i < cs.length()) {
+ CharSequence cs0 = cs.subSequence(0, i);
+ CharSequence cs1 = cs.subSequence(i, cs.length());
+ CharSequence cs2 = s.subSequence(i, s.length());
+ cs = cs0;
+ Node alt1 = new Node(cs1, branches);
+ (branches = new ArrayList()).add(alt1);
+ addBranch(cs2);
+ } else {
+ assert i == cs.length();
+ addBranch(s.subSequence(i, s.length()));
+ }
+ return true;
+ }
+ public int compareTo(Node other) {
+ if (other.cs == null || other.cs.length() == 0)
+ return (cs == null || cs.length() == 0) ? 0 : -1;
+ return compareTo(other.cs.charAt(0));
+ }
+ public int compareTo(char oc) {
+ if (cs == null || cs.length() == 0) return 1;
+ // '!' and '*' must come after ordinary letters, in this order, for regexp
+ // to work as intended.
+ char c = cs.charAt(0);
+ if (c == oc) return 0;
+ if (c == '!') return oc == '*' ? -1 : 1;
+ if (c == '*') return 1;
+ if (oc == '*' || oc == '!') return -1;
+ return Character.valueOf(c).compareTo(oc);
+ // for generating the same regexp as previous version.
+ //return Character.valueOf(oc).compareTo(c);
+ }
+ }
+
+ /**
+ * Utility method for dumping a regex String, based on a published public
+ * suffix list, which matches any SURT-form hostname up through the broadest
+ * 'private' (assigned/sold) domain-segment. That is, for any of the
+ * SURT-form hostnames...
+ *
+ * com,example, com,example,www, com,example,california,www
+ *
+ * ...the regex will match 'com,example,'.
+ *
+ * @param args
+ * @throws IOException
+ */
+ public static void main(String args[]) throws IOException {
+ InputStream is;
+ if (args.length == 0 || "=".equals(args[0])) {
+ // use bundled list
+ is = PublicSuffixes.class.getClassLoader().getResourceAsStream(
+ "effective_tld_names.dat");
+ } else {
+ is = new FileInputStream(args[0]);
+ }
+ String regex;
+ try {
+ BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
+ regex = getTopmostAssignedSurtPrefixRegex(reader);
+ } finally {
+ IOUtils.closeQuietly(is); // release the input even when parsing fails
+ }
+
+ if (args.length >= 2) {
+ // write to specified file as UTF-8, like the input (FileWriter would use the platform charset)
+ BufferedWriter writer = new BufferedWriter(
+ new OutputStreamWriter(new java.io.FileOutputStream(args[1]), "UTF-8"));
+ try { writer.append(regex); } finally { writer.close(); }
+ } else {
+ // write to stdout; flushed but deliberately left open
+ BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out));
+ writer.append(regex);
+ writer.flush();
+ }
+ }
+ /**
+ * Reads a file of the format promulgated by publicsuffix.org, skipping blank
+ * lines and '//' comments and converting domain segments to SURT-ordering.
+ * Leaves '!' exception markers and glob-style '*' wildcards in place. Returns
+ * root node of SURT-ordered prefix tree.
+ *
+ * @param reader source of the list, read line by line; not closed here
+ * @return root of prefix tree node.
+ * @throws IOException if reading from reader fails
+ */
+ protected static Node readPublishedFileToSurtTrie(BufferedReader reader) throws IOException {
+ // initializing with empty Alt list prevents empty pattern from being
+ // created for the first addBranch()
+ Node alt = new Node(null, new ArrayList<Node>());
+ String line;
+ while ((line = reader.readLine()) != null) {
+ // discard surrounding whitespace, empty lines and comments
+ line = line.trim();
+ if (line.length() == 0 || line.startsWith("//")) continue;
+ // discard utf8 notation after entry
+ line = line.split("\\s+")[0];
+ // lower-case independently of the default locale (e.g. Turkish dotless i)
+ line = line.toLowerCase(java.util.Locale.ROOT);
+ // SURT-order domain segments
+ String[] segs = line.split("\\.");
+ StringBuilder sb = new StringBuilder();
+ for (int i = segs.length - 1; i >= 0; i--) {
+ if (segs[i].length() == 0) continue;
+ sb.append(segs[i]).append(',');
+ }
+ if (sb.length() > 0) alt.addBranch(sb.toString()); // a line of only dots yields no segments
+ }
+ return alt;
+ }
+ /**
+ * utility function for dumping prefix tree structure. intended for debug use.
+ * @param alt root of prefix tree.
+ * @param lv indent level. 0 for root (no indent).
+ * @param out writer to send output to.
+ */
+ public static void dump(Node alt, int lv, PrintWriter out) {
+ for (int i = 0; i < lv; i++)
+ out.print(" ");
+ out.println(alt.cs != null ? ('"'+alt.cs.toString()+'"') : "(null)");
+ if (alt.branches != null) {
+ for (Node br : alt.branches) {
+ dump(br, lv + 1, out);
+ }
+ }
+ }
+ /**
+ * builds regular expression from prefix-tree {@code alt} into buffer {@code sb}.
+ * @param alt prefix tree root.
+ * @param sb StringBuilder to append the regular expression to.
+ */
+ protected static void buildRegex(Node alt, StringBuilder sb) {
+ String close = null;
+ if (alt.cs != null) {
+ // actually '!' will always be the first character, because it is
+ // always used along with '*'.
+ for (int i = 0; i < alt.cs.length(); i++) {
+ char c = alt.cs.charAt(i);
+ if (c == '!') {
+ if (close != null)
+ throw new RuntimeException("more than one '!'");
+ sb.append("(?="); // opens a lookahead; it is closed after this node's branches
+ close = ")";
+ } else if (c == '*') {
+ sb.append("[-\\w]+"); // wildcard: one or more word characters or hyphens
+ } else {
+ sb.append(c); // any other character is copied as-is, without escaping
+ }
+ }
+ }
+ if (alt.branches != null) {
+ // alt.branches.size() should always be > 1
+ if (alt.branches.size() > 1) {
+ sb.append("(?:");
+ }
+ String sep = "";
+ for (Node alt1 : alt.branches) {
+ sb.append(sep); sep = "|";
+ buildRegex(alt1, sb);
+ }
+ if (alt.branches.size() > 1) {
+ sb.append(")");
+ }
+ }
+ if (close != null)
+ sb.append(close);
+ }
+
+ /**
+ * Converts the SURT-ordered prefix tree of public suffixes into a Java regex
+ * which matches the public-portion "plus one" segment, giving the domain on
+ * which cookies can be set or other policy grouping should occur. Also adds
+ * a fallback branch that for any new/unknown TLDs assumes the second-level
+ * domain is assignable. (Eg: 'zzz,example,').
+ *
+ * @param trie root of the prefix tree; a "*," branch is added to it by this call
+ * @return the regex source text
+ */
+ private static String surtPrefixRegexFromTrie(Node trie) {
+ // Fallback branch for new/unknown TLDs; note that this mutates the caller's trie.
+ trie.addBranch("*,");
+ // Flags and start anchor come first, then the alternation, then the extra segment.
+ StringBuilder pattern = new StringBuilder("(?ix)^\n");
+ buildRegex(trie, pattern);
+ return pattern.append("\n([-\\w]+,)").toString();
+ }
+
+ public static synchronized Pattern getTopmostAssignedSurtPrefixPattern() {
+ if (topmostAssignedSurtPrefixPattern == null) {
+ topmostAssignedSurtPrefixPattern = Pattern
+ .compile(getTopmostAssignedSurtPrefixRegex());
+ }
+ return topmostAssignedSurtPrefixPattern;
+ }
+
+ public static synchronized String getTopmostAssignedSurtPrefixRegex() {
+ if (topmostAssignedSurtPrefixRegex == null) {
+ // use bundled list; the stream is closed even when parsing throws
+ InputStream in = PublicSuffixes.class.getClassLoader().getResourceAsStream("effective_tld_names.dat");
+ try {
+ topmostAssignedSurtPrefixRegex = getTopmostAssignedSurtPrefixRegex(
+ new BufferedReader(new InputStreamReader(in, "UTF-8")));
+ } catch (UnsupportedEncodingException ex) {
+ // should never happen
+ throw new RuntimeException(ex);
+ } finally {
+ IOUtils.closeQuietly(in);
+ }
+ }
+ return topmostAssignedSurtPrefixRegex;
+ }
+
+ public static String getTopmostAssignedSurtPrefixRegex(BufferedReader reader) {
+ // Parse the list into a prefix tree, then render the tree as a regex.
+ try {
+ return surtPrefixRegexFromTrie(readPublishedFileToSurtTrie(reader));
+ } catch (IOException readFailure) {
+ throw new RuntimeException(readFailure);
+ }
+ }
+
+ /**
+ * Truncate SURT to its topmost assigned domain segment; that is,
+ * the public suffix plus one segment, but as a SURT-ordered prefix.
+ *
+ * if the pattern doesn't match, the passed-in SURT is returned.
+ *
+ * @param surt SURT to truncate
+ * @return truncated-to-topmost-assigned SURT prefix
+ */
+ public static String reduceSurtToAssignmentLevel(String surt) {
+ final String regex = getTopmostAssignedSurtPrefixRegex();
+ final Matcher m = TextUtils.getMatcher(regex, surt);
+ // Fall back to the input itself when the pattern finds nothing.
+ final String reduced = m.find() ? m.group() : surt;
+ // Hand the matcher back to TextUtils once the result has been copied out.
+ TextUtils.recycleMatcher(m);
+ return reduced;
+ }
+}
diff --git a/src/main/resources/effective_tld_names.dat b/src/main/resources/effective_tld_names.dat
new file mode 100644
index 00000000..7c4a0860
--- /dev/null
+++ b/src/main/resources/effective_tld_names.dat
@@ -0,0 +1,7045 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// ===BEGIN ICANN DOMAINS===
+
+// ac : http://en.wikipedia.org/wiki/.ac
+ac
+com.ac
+edu.ac
+gov.ac
+net.ac
+mil.ac
+org.ac
+
+// ad : http://en.wikipedia.org/wiki/.ad
+ad
+nom.ad
+
+// ae : http://en.wikipedia.org/wiki/.ae
+// see also: "Domain Name Eligibility Policy" at http://www.aeda.ae/eng/aepolicy.php
+ae
+co.ae
+net.ae
+org.ae
+sch.ae
+ac.ae
+gov.ae
+mil.ae
+
+// aero : see http://www.information.aero/index.php?id=66
+aero
+accident-investigation.aero
+accident-prevention.aero
+aerobatic.aero
+aeroclub.aero
+aerodrome.aero
+agents.aero
+aircraft.aero
+airline.aero
+airport.aero
+air-surveillance.aero
+airtraffic.aero
+air-traffic-control.aero
+ambulance.aero
+amusement.aero
+association.aero
+author.aero
+ballooning.aero
+broker.aero
+caa.aero
+cargo.aero
+catering.aero
+certification.aero
+championship.aero
+charter.aero
+civilaviation.aero
+club.aero
+conference.aero
+consultant.aero
+consulting.aero
+control.aero
+council.aero
+crew.aero
+design.aero
+dgca.aero
+educator.aero
+emergency.aero
+engine.aero
+engineer.aero
+entertainment.aero
+equipment.aero
+exchange.aero
+express.aero
+federation.aero
+flight.aero
+freight.aero
+fuel.aero
+gliding.aero
+government.aero
+groundhandling.aero
+group.aero
+hanggliding.aero
+homebuilt.aero
+insurance.aero
+journal.aero
+journalist.aero
+leasing.aero
+logistics.aero
+magazine.aero
+maintenance.aero
+marketplace.aero
+media.aero
+microlight.aero
+modelling.aero
+navigation.aero
+parachuting.aero
+paragliding.aero
+passenger-association.aero
+pilot.aero
+press.aero
+production.aero
+recreation.aero
+repbody.aero
+res.aero
+research.aero
+rotorcraft.aero
+safety.aero
+scientist.aero
+services.aero
+show.aero
+skydiving.aero
+software.aero
+student.aero
+taxi.aero
+trader.aero
+trading.aero
+trainer.aero
+union.aero
+workinggroup.aero
+works.aero
+
+// af : http://www.nic.af/help.jsp
+af
+gov.af
+com.af
+org.af
+net.af
+edu.af
+
+// ag : http://www.nic.ag/prices.htm
+ag
+com.ag
+org.ag
+net.ag
+co.ag
+nom.ag
+
+// ai : http://nic.com.ai/
+ai
+off.ai
+com.ai
+net.ai
+org.ai
+
+// al : http://www.ert.gov.al/ert_alb/faq_det.html?Id=31
+al
+com.al
+edu.al
+gov.al
+mil.al
+net.al
+org.al
+
+// am : http://en.wikipedia.org/wiki/.am
+am
+
+// an : http://www.una.an/an_domreg/default.asp
+an
+com.an
+net.an
+org.an
+edu.an
+
+// ao : http://en.wikipedia.org/wiki/.ao
+// http://www.dns.ao/REGISTR.DOC
+ao
+ed.ao
+gv.ao
+og.ao
+co.ao
+pb.ao
+it.ao
+
+// aq : http://en.wikipedia.org/wiki/.aq
+aq
+
+// ar : http://en.wikipedia.org/wiki/.ar
+*.ar
+!congresodelalengua3.ar
+!educ.ar
+!gobiernoelectronico.ar
+!mecon.ar
+!nacion.ar
+!nic.ar
+!promocion.ar
+!retina.ar
+!uba.ar
+
+// arpa : http://en.wikipedia.org/wiki/.arpa
+// Confirmed by registry 2008-06-18
+e164.arpa
+in-addr.arpa
+ip6.arpa
+iris.arpa
+uri.arpa
+urn.arpa
+
+// as : http://en.wikipedia.org/wiki/.as
+as
+gov.as
+
+// asia : http://en.wikipedia.org/wiki/.asia
+asia
+
+// at : http://en.wikipedia.org/wiki/.at
+// Confirmed by registry 2008-06-17
+at
+ac.at
+co.at
+gv.at
+or.at
+
+// au : http://en.wikipedia.org/wiki/.au
+// http://www.auda.org.au/
+// 2LDs
+com.au
+net.au
+org.au
+edu.au
+gov.au
+asn.au
+id.au
+csiro.au
+// Historic 2LDs (closed to new registration, but sites still exist)
+info.au
+conf.au
+oz.au
+// CGDNs - http://www.cgdn.org.au/
+act.au
+nsw.au
+nt.au
+qld.au
+sa.au
+tas.au
+vic.au
+wa.au
+// 3LDs
+act.edu.au
+nsw.edu.au
+nt.edu.au
+qld.edu.au
+sa.edu.au
+tas.edu.au
+vic.edu.au
+wa.edu.au
+act.gov.au
+// Removed at request of Shae.Donelan@services.nsw.gov.au, 2010-03-04
+// nsw.gov.au
+nt.gov.au
+qld.gov.au
+sa.gov.au
+tas.gov.au
+vic.gov.au
+wa.gov.au
+
+// aw : http://en.wikipedia.org/wiki/.aw
+aw
+com.aw
+
+// ax : http://en.wikipedia.org/wiki/.ax
+ax
+
+// az : http://en.wikipedia.org/wiki/.az
+az
+com.az
+net.az
+int.az
+gov.az
+org.az
+edu.az
+info.az
+pp.az
+mil.az
+name.az
+pro.az
+biz.az
+
+// ba : http://en.wikipedia.org/wiki/.ba
+ba
+org.ba
+net.ba
+edu.ba
+gov.ba
+mil.ba
+unsa.ba
+unbi.ba
+co.ba
+com.ba
+rs.ba
+
+// bb : http://en.wikipedia.org/wiki/.bb
+bb
+biz.bb
+com.bb
+edu.bb
+gov.bb
+info.bb
+net.bb
+org.bb
+store.bb
+
+// bd : http://en.wikipedia.org/wiki/.bd
+*.bd
+
+// be : http://en.wikipedia.org/wiki/.be
+// Confirmed by registry 2008-06-08
+be
+ac.be
+
+// bf : http://en.wikipedia.org/wiki/.bf
+bf
+gov.bf
+
+// bg : http://en.wikipedia.org/wiki/.bg
+// https://www.register.bg/user/static/rules/en/index.html
+bg
+a.bg
+b.bg
+c.bg
+d.bg
+e.bg
+f.bg
+g.bg
+h.bg
+i.bg
+j.bg
+k.bg
+l.bg
+m.bg
+n.bg
+o.bg
+p.bg
+q.bg
+r.bg
+s.bg
+t.bg
+u.bg
+v.bg
+w.bg
+x.bg
+y.bg
+z.bg
+0.bg
+1.bg
+2.bg
+3.bg
+4.bg
+5.bg
+6.bg
+7.bg
+8.bg
+9.bg
+
+// bh : http://en.wikipedia.org/wiki/.bh
+bh
+com.bh
+edu.bh
+net.bh
+org.bh
+gov.bh
+
+// bi : http://en.wikipedia.org/wiki/.bi
+// http://whois.nic.bi/
+bi
+co.bi
+com.bi
+edu.bi
+or.bi
+org.bi
+
+// biz : http://en.wikipedia.org/wiki/.biz
+biz
+
+// bj : http://en.wikipedia.org/wiki/.bj
+bj
+asso.bj
+barreau.bj
+gouv.bj
+
+// bm : http://www.bermudanic.bm/dnr-text.txt
+bm
+com.bm
+edu.bm
+gov.bm
+net.bm
+org.bm
+
+// bn : http://en.wikipedia.org/wiki/.bn
+*.bn
+
+// bo : http://www.nic.bo/
+bo
+com.bo
+edu.bo
+gov.bo
+gob.bo
+int.bo
+org.bo
+net.bo
+mil.bo
+tv.bo
+
+// br : http://registro.br/dominio/dpn.html
+// Updated by registry 2011-03-01
+br
+adm.br
+adv.br
+agr.br
+am.br
+arq.br
+art.br
+ato.br
+b.br
+bio.br
+blog.br
+bmd.br
+cim.br
+cng.br
+cnt.br
+com.br
+coop.br
+ecn.br
+eco.br
+edu.br
+emp.br
+eng.br
+esp.br
+etc.br
+eti.br
+far.br
+flog.br
+fm.br
+fnd.br
+fot.br
+fst.br
+g12.br
+ggf.br
+gov.br
+imb.br
+ind.br
+inf.br
+jor.br
+jus.br
+leg.br
+lel.br
+mat.br
+med.br
+mil.br
+mus.br
+net.br
+nom.br
+not.br
+ntr.br
+odo.br
+org.br
+ppg.br
+pro.br
+psc.br
+psi.br
+qsl.br
+radio.br
+rec.br
+slg.br
+srv.br
+taxi.br
+teo.br
+tmp.br
+trd.br
+tur.br
+tv.br
+vet.br
+vlog.br
+wiki.br
+zlg.br
+
+// bs : http://www.nic.bs/rules.html
+bs
+com.bs
+net.bs
+org.bs
+edu.bs
+gov.bs
+
+// bt : http://en.wikipedia.org/wiki/.bt
+bt
+com.bt
+edu.bt
+gov.bt
+net.bt
+org.bt
+
+// bv : No registrations at this time.
+// Submitted by registry 2006-06-16
+
+// bw : http://en.wikipedia.org/wiki/.bw
+// http://www.gobin.info/domainname/bw.doc
+// list of other 2nd level tlds ?
+bw
+co.bw
+org.bw
+
+// by : http://en.wikipedia.org/wiki/.by
+// http://tld.by/rules_2006_en.html
+// list of other 2nd level tlds ?
+by
+gov.by
+mil.by
+// Official information does not indicate that com.by is a reserved
+// second-level domain, but it's being used as one (see www.google.com.by and
+// www.yahoo.com.by, for example), so we list it here for safety's sake.
+com.by
+
+// http://hoster.by/
+of.by
+
+// bz : http://en.wikipedia.org/wiki/.bz
+// http://www.belizenic.bz/
+bz
+com.bz
+net.bz
+org.bz
+edu.bz
+gov.bz
+
+// ca : http://en.wikipedia.org/wiki/.ca
+ca
+// ca geographical names
+ab.ca
+bc.ca
+mb.ca
+nb.ca
+nf.ca
+nl.ca
+ns.ca
+nt.ca
+nu.ca
+on.ca
+pe.ca
+qc.ca
+sk.ca
+yk.ca
+// gc.ca: http://en.wikipedia.org/wiki/.gc.ca
+// see also: http://registry.gc.ca/en/SubdomainFAQ
+gc.ca
+
+// cat : http://en.wikipedia.org/wiki/.cat
+cat
+
+// cc : http://en.wikipedia.org/wiki/.cc
+cc
+
+// cd : http://en.wikipedia.org/wiki/.cd
+// see also: https://www.nic.cd/domain/insertDomain_2.jsp?act=1
+cd
+gov.cd
+
+// cf : http://en.wikipedia.org/wiki/.cf
+cf
+
+// cg : http://en.wikipedia.org/wiki/.cg
+cg
+
+// ch : http://en.wikipedia.org/wiki/.ch
+ch
+
+// ci : http://en.wikipedia.org/wiki/.ci
+// http://www.nic.ci/index.php?page=charte
+ci
+org.ci
+or.ci
+com.ci
+co.ci
+edu.ci
+ed.ci
+ac.ci
+net.ci
+go.ci
+asso.ci
+aéroport.ci
+int.ci
+presse.ci
+md.ci
+gouv.ci
+
+// ck : http://en.wikipedia.org/wiki/.ck
+*.ck
+!www.ck
+
+// cl : http://en.wikipedia.org/wiki/.cl
+cl
+gov.cl
+gob.cl
+co.cl
+mil.cl
+
+// cm : http://en.wikipedia.org/wiki/.cm
+cm
+gov.cm
+
+// cn : http://en.wikipedia.org/wiki/.cn
+// Submitted by registry 2008-06-11
+cn
+ac.cn
+com.cn
+edu.cn
+gov.cn
+net.cn
+org.cn
+mil.cn
+公司.cn
+网络.cn
+網絡.cn
+// cn geographic names
+ah.cn
+bj.cn
+cq.cn
+fj.cn
+gd.cn
+gs.cn
+gz.cn
+gx.cn
+ha.cn
+hb.cn
+he.cn
+hi.cn
+hl.cn
+hn.cn
+jl.cn
+js.cn
+jx.cn
+ln.cn
+nm.cn
+nx.cn
+qh.cn
+sc.cn
+sd.cn
+sh.cn
+sn.cn
+sx.cn
+tj.cn
+xj.cn
+xz.cn
+yn.cn
+zj.cn
+hk.cn
+mo.cn
+tw.cn
+
+// co : http://en.wikipedia.org/wiki/.co
+// Submitted by registry 2008-06-11
+co
+arts.co
+com.co
+edu.co
+firm.co
+gov.co
+info.co
+int.co
+mil.co
+net.co
+nom.co
+org.co
+rec.co
+web.co
+
+// com : http://en.wikipedia.org/wiki/.com
+com
+
+// coop : http://en.wikipedia.org/wiki/.coop
+coop
+
+// cr : http://www.nic.cr/niccr_publico/showRegistroDominiosScreen.do
+cr
+ac.cr
+co.cr
+ed.cr
+fi.cr
+go.cr
+or.cr
+sa.cr
+
+// cu : http://en.wikipedia.org/wiki/.cu
+cu
+com.cu
+edu.cu
+org.cu
+net.cu
+gov.cu
+inf.cu
+
+// cv : http://en.wikipedia.org/wiki/.cv
+cv
+
+// cw : http://www.una.cw/cw_registry/
+// Confirmed by registry 2013-03-26
+cw
+com.cw
+edu.cw
+net.cw
+org.cw
+
+// cx : http://en.wikipedia.org/wiki/.cx
+// list of other 2nd level tlds ?
+cx
+gov.cx
+
+// cy : http://en.wikipedia.org/wiki/.cy
+*.cy
+
+// cz : http://en.wikipedia.org/wiki/.cz
+cz
+
+// de : http://en.wikipedia.org/wiki/.de
+// Confirmed by registry (with technical
+// reservations) 2008-07-01
+de
+
+// dj : http://en.wikipedia.org/wiki/.dj
+dj
+
+// dk : http://en.wikipedia.org/wiki/.dk
+// Confirmed by registry 2008-06-17
+dk
+
+// dm : http://en.wikipedia.org/wiki/.dm
+dm
+com.dm
+net.dm
+org.dm
+edu.dm
+gov.dm
+
+// do : http://en.wikipedia.org/wiki/.do
+do
+art.do
+com.do
+edu.do
+gob.do
+gov.do
+mil.do
+net.do
+org.do
+sld.do
+web.do
+
+// dz : http://en.wikipedia.org/wiki/.dz
+dz
+com.dz
+org.dz
+net.dz
+gov.dz
+edu.dz
+asso.dz
+pol.dz
+art.dz
+
+// ec : http://www.nic.ec/reg/paso1.asp
+// Submitted by registry 2008-07-04
+ec
+com.ec
+info.ec
+net.ec
+fin.ec
+k12.ec
+med.ec
+pro.ec
+org.ec
+edu.ec
+gov.ec
+gob.ec
+mil.ec
+
+// edu : http://en.wikipedia.org/wiki/.edu
+edu
+
+// ee : http://www.eenet.ee/EENet/dom_reeglid.html#lisa_B
+ee
+edu.ee
+gov.ee
+riik.ee
+lib.ee
+med.ee
+com.ee
+pri.ee
+aip.ee
+org.ee
+fie.ee
+
+// eg : http://en.wikipedia.org/wiki/.eg
+eg
+com.eg
+edu.eg
+eun.eg
+gov.eg
+mil.eg
+name.eg
+net.eg
+org.eg
+sci.eg
+
+// er : http://en.wikipedia.org/wiki/.er
+*.er
+
+// es : https://www.nic.es/site_ingles/ingles/dominios/index.html
+es
+com.es
+nom.es
+org.es
+gob.es
+edu.es
+
+// et : http://en.wikipedia.org/wiki/.et
+*.et
+
+// eu : http://en.wikipedia.org/wiki/.eu
+eu
+
+// fi : http://en.wikipedia.org/wiki/.fi
+fi
+// aland.fi : http://en.wikipedia.org/wiki/.ax
+// This domain is being phased out in favor of .ax. As there are still many
+// domains under aland.fi, we still keep it on the list until aland.fi is
+// completely removed.
+// TODO: Check for updates (expected to be phased out around Q1/2009)
+aland.fi
+
+// fj : http://en.wikipedia.org/wiki/.fj
+*.fj
+
+// fk : http://en.wikipedia.org/wiki/.fk
+*.fk
+
+// fm : http://en.wikipedia.org/wiki/.fm
+fm
+
+// fo : http://en.wikipedia.org/wiki/.fo
+fo
+
+// fr : http://www.afnic.fr/
+// domaines descriptifs : http://www.afnic.fr/obtenir/chartes/nommage-fr/annexe-descriptifs
+fr
+com.fr
+asso.fr
+nom.fr
+prd.fr
+presse.fr
+tm.fr
+// domaines sectoriels : http://www.afnic.fr/obtenir/chartes/nommage-fr/annexe-sectoriels
+aeroport.fr
+assedic.fr
+avocat.fr
+avoues.fr
+cci.fr
+chambagri.fr
+chirurgiens-dentistes.fr
+experts-comptables.fr
+geometre-expert.fr
+gouv.fr
+greta.fr
+huissier-justice.fr
+medecin.fr
+notaires.fr
+pharmacien.fr
+port.fr
+veterinaire.fr
+
+// ga : http://en.wikipedia.org/wiki/.ga
+ga
+
+// gb : This registry is effectively dormant
+// Submitted by registry 2008-06-12
+
+// gd : http://en.wikipedia.org/wiki/.gd
+gd
+
+// ge : http://www.nic.net.ge/policy_en.pdf
+ge
+com.ge
+edu.ge
+gov.ge
+org.ge
+mil.ge
+net.ge
+pvt.ge
+
+// gf : http://en.wikipedia.org/wiki/.gf
+gf
+
+// gg : http://www.channelisles.net/applic/avextn.shtml
+gg
+co.gg
+org.gg
+net.gg
+sch.gg
+gov.gg
+
+// gh : http://en.wikipedia.org/wiki/.gh
+// see also: http://www.nic.gh/reg_now.php
+// Although domains directly at second level are not possible at the moment,
+// they have been possible for some time and may come back.
+gh
+com.gh
+edu.gh
+gov.gh
+org.gh
+mil.gh
+
+// gi : http://www.nic.gi/rules.html
+gi
+com.gi
+ltd.gi
+gov.gi
+mod.gi
+edu.gi
+org.gi
+
+// gl : http://en.wikipedia.org/wiki/.gl
+// http://nic.gl
+gl
+
+// gm : http://www.nic.gm/htmlpages%5Cgm-policy.htm
+gm
+
+// gn : http://psg.com/dns/gn/gn.txt
+// Submitted by registry 2008-06-17
+ac.gn
+com.gn
+edu.gn
+gov.gn
+org.gn
+net.gn
+
+// gov : http://en.wikipedia.org/wiki/.gov
+gov
+
+// gp : http://www.nic.gp/index.php?lang=en
+gp
+com.gp
+net.gp
+mobi.gp
+edu.gp
+org.gp
+asso.gp
+
+// gq : http://en.wikipedia.org/wiki/.gq
+gq
+
+// gr : https://grweb.ics.forth.gr/english/1617-B-2005.html
+// Submitted by registry 2008-06-09
+gr
+com.gr
+edu.gr
+net.gr
+org.gr
+gov.gr
+
+// gs : http://en.wikipedia.org/wiki/.gs
+gs
+
+// gt : http://www.gt/politicas_de_registro.html
+gt
+com.gt
+edu.gt
+gob.gt
+ind.gt
+mil.gt
+net.gt
+org.gt
+
+// gu : http://gadao.gov.gu/registration.txt
+*.gu
+
+// gw : http://en.wikipedia.org/wiki/.gw
+gw
+
+// gy : http://en.wikipedia.org/wiki/.gy
+// http://registry.gy/
+gy
+co.gy
+com.gy
+net.gy
+
+// hk : https://www.hkdnr.hk
+// Submitted by registry 2008-06-11
+hk
+com.hk
+edu.hk
+gov.hk
+idv.hk
+net.hk
+org.hk
+公司.hk
+教育.hk
+敎育.hk
+政府.hk
+個人.hk
+个人.hk
+箇人.hk
+網络.hk
+网络.hk
+组織.hk
+網絡.hk
+网絡.hk
+组织.hk
+組織.hk
+組织.hk
+
+// hm : http://en.wikipedia.org/wiki/.hm
+hm
+
+// hn : http://www.nic.hn/politicas/ps02,,05.html
+hn
+com.hn
+edu.hn
+org.hn
+net.hn
+mil.hn
+gob.hn
+
+// hr : http://www.dns.hr/documents/pdf/HRTLD-regulations.pdf
+hr
+iz.hr
+from.hr
+name.hr
+com.hr
+
+// ht : http://www.nic.ht/info/charte.cfm
+ht
+com.ht
+shop.ht
+firm.ht
+info.ht
+adult.ht
+net.ht
+pro.ht
+org.ht
+med.ht
+art.ht
+coop.ht
+pol.ht
+asso.ht
+edu.ht
+rel.ht
+gouv.ht
+perso.ht
+
+// hu : http://www.domain.hu/domain/English/sld.html
+// Confirmed by registry 2008-06-12
+hu
+co.hu
+info.hu
+org.hu
+priv.hu
+sport.hu
+tm.hu
+2000.hu
+agrar.hu
+bolt.hu
+casino.hu
+city.hu
+erotica.hu
+erotika.hu
+film.hu
+forum.hu
+games.hu
+hotel.hu
+ingatlan.hu
+jogasz.hu
+konyvelo.hu
+lakas.hu
+media.hu
+news.hu
+reklam.hu
+sex.hu
+shop.hu
+suli.hu
+szex.hu
+tozsde.hu
+utazas.hu
+video.hu
+
+// id : https://register.pandi.or.id/
+id
+ac.id
+biz.id
+co.id
+go.id
+mil.id
+my.id
+net.id
+or.id
+sch.id
+web.id
+
+// ie : http://en.wikipedia.org/wiki/.ie
+ie
+gov.ie
+
+// il : http://en.wikipedia.org/wiki/.il
+*.il
+
+// im : https://www.nic.im/pdfs/imfaqs.pdf
+im
+co.im
+ltd.co.im
+plc.co.im
+net.im
+gov.im
+org.im
+nic.im
+ac.im
+
+// in : http://en.wikipedia.org/wiki/.in
+// see also: http://www.inregistry.in/policies/
+// Please note that nic.in is not an official eTLD, but is used by most
+// government institutions.
+in
+co.in
+firm.in
+net.in
+org.in
+gen.in
+ind.in
+nic.in
+ac.in
+edu.in
+res.in
+gov.in
+mil.in
+
+// info : http://en.wikipedia.org/wiki/.info
+info
+
+// int : http://en.wikipedia.org/wiki/.int
+// Confirmed by registry 2008-06-18
+int
+eu.int
+
+// io : http://www.nic.io/rules.html
+// list of other 2nd level tlds ?
+io
+com.io
+
+// iq : http://www.cmc.iq/english/iq/iqregister1.htm
+iq
+gov.iq
+edu.iq
+mil.iq
+com.iq
+org.iq
+net.iq
+
+// ir : http://www.nic.ir/Terms_and_Conditions_ir,_Appendix_1_Domain_Rules
+// Also see http://www.nic.ir/Internationalized_Domain_Names
+// Two .ir entries added at request of , 2010-04-16
+ir
+ac.ir
+co.ir
+gov.ir
+id.ir
+net.ir
+org.ir
+sch.ir
+// xn--mgba3a4f16a.ir (.ir, Persian YEH)
+ایران.ir
+// xn--mgba3a4fra.ir (.ir, Arabic YEH)
+ايران.ir
+
+// is : http://www.isnic.is/domain/rules.php
+// Confirmed by registry 2008-12-06
+is
+net.is
+com.is
+edu.is
+gov.is
+org.is
+int.is
+
+// it : http://en.wikipedia.org/wiki/.it
+it
+gov.it
+edu.it
+// list of reserved geo-names :
+// http://www.nic.it/documenti/regolamenti-e-linee-guida/regolamento-assegnazione-versione-6.0.pdf
+// (There is also a list of reserved geo-names corresponding to Italian
+// municipalities : http://www.nic.it/documenti/appendice-c.pdf , but it is
+// not included here.)
+agrigento.it
+ag.it
+alessandria.it
+al.it
+ancona.it
+an.it
+aosta.it
+aoste.it
+ao.it
+arezzo.it
+ar.it
+ascoli-piceno.it
+ascolipiceno.it
+ap.it
+asti.it
+at.it
+avellino.it
+av.it
+bari.it
+ba.it
+andria-barletta-trani.it
+andriabarlettatrani.it
+trani-barletta-andria.it
+tranibarlettaandria.it
+barletta-trani-andria.it
+barlettatraniandria.it
+andria-trani-barletta.it
+andriatranibarletta.it
+trani-andria-barletta.it
+traniandriabarletta.it
+bt.it
+belluno.it
+bl.it
+benevento.it
+bn.it
+bergamo.it
+bg.it
+biella.it
+bi.it
+bologna.it
+bo.it
+bolzano.it
+bozen.it
+balsan.it
+alto-adige.it
+altoadige.it
+suedtirol.it
+bz.it
+brescia.it
+bs.it
+brindisi.it
+br.it
+cagliari.it
+ca.it
+caltanissetta.it
+cl.it
+campobasso.it
+cb.it
+carboniaiglesias.it
+carbonia-iglesias.it
+iglesias-carbonia.it
+iglesiascarbonia.it
+ci.it
+caserta.it
+ce.it
+catania.it
+ct.it
+catanzaro.it
+cz.it
+chieti.it
+ch.it
+como.it
+co.it
+cosenza.it
+cs.it
+cremona.it
+cr.it
+crotone.it
+kr.it
+cuneo.it
+cn.it
+dell-ogliastra.it
+dellogliastra.it
+ogliastra.it
+og.it
+enna.it
+en.it
+ferrara.it
+fe.it
+fermo.it
+fm.it
+firenze.it
+florence.it
+fi.it
+foggia.it
+fg.it
+forli-cesena.it
+forlicesena.it
+cesena-forli.it
+cesenaforli.it
+fc.it
+frosinone.it
+fr.it
+genova.it
+genoa.it
+ge.it
+gorizia.it
+go.it
+grosseto.it
+gr.it
+imperia.it
+im.it
+isernia.it
+is.it
+laquila.it
+aquila.it
+aq.it
+la-spezia.it
+laspezia.it
+sp.it
+latina.it
+lt.it
+lecce.it
+le.it
+lecco.it
+lc.it
+livorno.it
+li.it
+lodi.it
+lo.it
+lucca.it
+lu.it
+macerata.it
+mc.it
+mantova.it
+mn.it
+massa-carrara.it
+massacarrara.it
+carrara-massa.it
+carraramassa.it
+ms.it
+matera.it
+mt.it
+medio-campidano.it
+mediocampidano.it
+campidano-medio.it
+campidanomedio.it
+vs.it
+messina.it
+me.it
+milano.it
+milan.it
+mi.it
+modena.it
+mo.it
+monza.it
+monza-brianza.it
+monzabrianza.it
+monzaebrianza.it
+monzaedellabrianza.it
+monza-e-della-brianza.it
+mb.it
+napoli.it
+naples.it
+na.it
+novara.it
+no.it
+nuoro.it
+nu.it
+oristano.it
+or.it
+padova.it
+padua.it
+pd.it
+palermo.it
+pa.it
+parma.it
+pr.it
+pavia.it
+pv.it
+perugia.it
+pg.it
+pescara.it
+pe.it
+pesaro-urbino.it
+pesarourbino.it
+urbino-pesaro.it
+urbinopesaro.it
+pu.it
+piacenza.it
+pc.it
+pisa.it
+pi.it
+pistoia.it
+pt.it
+pordenone.it
+pn.it
+potenza.it
+pz.it
+prato.it
+po.it
+ragusa.it
+rg.it
+ravenna.it
+ra.it
+reggio-calabria.it
+reggiocalabria.it
+rc.it
+reggio-emilia.it
+reggioemilia.it
+re.it
+rieti.it
+ri.it
+rimini.it
+rn.it
+roma.it
+rome.it
+rm.it
+rovigo.it
+ro.it
+salerno.it
+sa.it
+sassari.it
+ss.it
+savona.it
+sv.it
+siena.it
+si.it
+siracusa.it
+sr.it
+sondrio.it
+so.it
+taranto.it
+ta.it
+tempio-olbia.it
+tempioolbia.it
+olbia-tempio.it
+olbiatempio.it
+ot.it
+teramo.it
+te.it
+terni.it
+tr.it
+torino.it
+turin.it
+to.it
+trapani.it
+tp.it
+trento.it
+trentino.it
+tn.it
+treviso.it
+tv.it
+trieste.it
+ts.it
+udine.it
+ud.it
+varese.it
+va.it
+venezia.it
+venice.it
+ve.it
+verbania.it
+vb.it
+vercelli.it
+vc.it
+verona.it
+vr.it
+vibo-valentia.it
+vibovalentia.it
+vv.it
+vicenza.it
+vi.it
+viterbo.it
+vt.it
+
+// je : http://www.channelisles.net/applic/avextn.shtml
+je
+co.je
+org.je
+net.je
+sch.je
+gov.je
+
+// jm : http://www.com.jm/register.html
+*.jm
+
+// jo : http://www.dns.jo/Registration_policy.aspx
+jo
+com.jo
+org.jo
+net.jo
+edu.jo
+sch.jo
+gov.jo
+mil.jo
+name.jo
+
+// jobs : http://en.wikipedia.org/wiki/.jobs
+jobs
+
+// jp : http://en.wikipedia.org/wiki/.jp
+// http://jprs.co.jp/en/jpdomain.html
+// Updated by registry 2012-05-28
+jp
+// jp organizational type names
+ac.jp
+ad.jp
+co.jp
+ed.jp
+go.jp
+gr.jp
+lg.jp
+ne.jp
+or.jp
+// jp prefecture type names
+aichi.jp
+akita.jp
+aomori.jp
+chiba.jp
+ehime.jp
+fukui.jp
+fukuoka.jp
+fukushima.jp
+gifu.jp
+gunma.jp
+hiroshima.jp
+hokkaido.jp
+hyogo.jp
+ibaraki.jp
+ishikawa.jp
+iwate.jp
+kagawa.jp
+kagoshima.jp
+kanagawa.jp
+kochi.jp
+kumamoto.jp
+kyoto.jp
+mie.jp
+miyagi.jp
+miyazaki.jp
+nagano.jp
+nagasaki.jp
+nara.jp
+niigata.jp
+oita.jp
+okayama.jp
+okinawa.jp
+osaka.jp
+saga.jp
+saitama.jp
+shiga.jp
+shimane.jp
+shizuoka.jp
+tochigi.jp
+tokushima.jp
+tokyo.jp
+tottori.jp
+toyama.jp
+wakayama.jp
+yamagata.jp
+yamaguchi.jp
+yamanashi.jp
+// jp geographic type names
+// http://jprs.jp/doc/rule/saisoku-1.html
+*.kawasaki.jp
+*.kitakyushu.jp
+*.kobe.jp
+*.nagoya.jp
+*.sapporo.jp
+*.sendai.jp
+*.yokohama.jp
+!city.kawasaki.jp
+!city.kitakyushu.jp
+!city.kobe.jp
+!city.nagoya.jp
+!city.sapporo.jp
+!city.sendai.jp
+!city.yokohama.jp
+// 4th level registration
+aisai.aichi.jp
+ama.aichi.jp
+anjo.aichi.jp
+asuke.aichi.jp
+chiryu.aichi.jp
+chita.aichi.jp
+fuso.aichi.jp
+gamagori.aichi.jp
+handa.aichi.jp
+hazu.aichi.jp
+hekinan.aichi.jp
+higashiura.aichi.jp
+ichinomiya.aichi.jp
+inazawa.aichi.jp
+inuyama.aichi.jp
+isshiki.aichi.jp
+iwakura.aichi.jp
+kanie.aichi.jp
+kariya.aichi.jp
+kasugai.aichi.jp
+kira.aichi.jp
+kiyosu.aichi.jp
+komaki.aichi.jp
+konan.aichi.jp
+kota.aichi.jp
+mihama.aichi.jp
+miyoshi.aichi.jp
+nagakute.aichi.jp
+nishio.aichi.jp
+nisshin.aichi.jp
+obu.aichi.jp
+oguchi.aichi.jp
+oharu.aichi.jp
+okazaki.aichi.jp
+owariasahi.aichi.jp
+seto.aichi.jp
+shikatsu.aichi.jp
+shinshiro.aichi.jp
+shitara.aichi.jp
+tahara.aichi.jp
+takahama.aichi.jp
+tobishima.aichi.jp
+toei.aichi.jp
+togo.aichi.jp
+tokai.aichi.jp
+tokoname.aichi.jp
+toyoake.aichi.jp
+toyohashi.aichi.jp
+toyokawa.aichi.jp
+toyone.aichi.jp
+toyota.aichi.jp
+tsushima.aichi.jp
+yatomi.aichi.jp
+akita.akita.jp
+daisen.akita.jp
+fujisato.akita.jp
+gojome.akita.jp
+hachirogata.akita.jp
+happou.akita.jp
+higashinaruse.akita.jp
+honjo.akita.jp
+honjyo.akita.jp
+ikawa.akita.jp
+kamikoani.akita.jp
+kamioka.akita.jp
+katagami.akita.jp
+kazuno.akita.jp
+kitaakita.akita.jp
+kosaka.akita.jp
+kyowa.akita.jp
+misato.akita.jp
+mitane.akita.jp
+moriyoshi.akita.jp
+nikaho.akita.jp
+noshiro.akita.jp
+odate.akita.jp
+oga.akita.jp
+ogata.akita.jp
+semboku.akita.jp
+yokote.akita.jp
+yurihonjo.akita.jp
+aomori.aomori.jp
+gonohe.aomori.jp
+hachinohe.aomori.jp
+hashikami.aomori.jp
+hiranai.aomori.jp
+hirosaki.aomori.jp
+itayanagi.aomori.jp
+kuroishi.aomori.jp
+misawa.aomori.jp
+mutsu.aomori.jp
+nakadomari.aomori.jp
+noheji.aomori.jp
+oirase.aomori.jp
+owani.aomori.jp
+rokunohe.aomori.jp
+sannohe.aomori.jp
+shichinohe.aomori.jp
+shingo.aomori.jp
+takko.aomori.jp
+towada.aomori.jp
+tsugaru.aomori.jp
+tsuruta.aomori.jp
+abiko.chiba.jp
+asahi.chiba.jp
+chonan.chiba.jp
+chosei.chiba.jp
+choshi.chiba.jp
+chuo.chiba.jp
+funabashi.chiba.jp
+futtsu.chiba.jp
+hanamigawa.chiba.jp
+ichihara.chiba.jp
+ichikawa.chiba.jp
+ichinomiya.chiba.jp
+inzai.chiba.jp
+isumi.chiba.jp
+kamagaya.chiba.jp
+kamogawa.chiba.jp
+kashiwa.chiba.jp
+katori.chiba.jp
+katsuura.chiba.jp
+kimitsu.chiba.jp
+kisarazu.chiba.jp
+kozaki.chiba.jp
+kujukuri.chiba.jp
+kyonan.chiba.jp
+matsudo.chiba.jp
+midori.chiba.jp
+mihama.chiba.jp
+minamiboso.chiba.jp
+mobara.chiba.jp
+mutsuzawa.chiba.jp
+nagara.chiba.jp
+nagareyama.chiba.jp
+narashino.chiba.jp
+narita.chiba.jp
+noda.chiba.jp
+oamishirasato.chiba.jp
+omigawa.chiba.jp
+onjuku.chiba.jp
+otaki.chiba.jp
+sakae.chiba.jp
+sakura.chiba.jp
+shimofusa.chiba.jp
+shirako.chiba.jp
+shiroi.chiba.jp
+shisui.chiba.jp
+sodegaura.chiba.jp
+sosa.chiba.jp
+tako.chiba.jp
+tateyama.chiba.jp
+togane.chiba.jp
+tohnosho.chiba.jp
+tomisato.chiba.jp
+urayasu.chiba.jp
+yachimata.chiba.jp
+yachiyo.chiba.jp
+yokaichiba.chiba.jp
+yokoshibahikari.chiba.jp
+yotsukaido.chiba.jp
+ainan.ehime.jp
+honai.ehime.jp
+ikata.ehime.jp
+imabari.ehime.jp
+iyo.ehime.jp
+kamijima.ehime.jp
+kihoku.ehime.jp
+kumakogen.ehime.jp
+masaki.ehime.jp
+matsuno.ehime.jp
+matsuyama.ehime.jp
+namikata.ehime.jp
+niihama.ehime.jp
+ozu.ehime.jp
+saijo.ehime.jp
+seiyo.ehime.jp
+shikokuchuo.ehime.jp
+tobe.ehime.jp
+toon.ehime.jp
+uchiko.ehime.jp
+uwajima.ehime.jp
+yawatahama.ehime.jp
+echizen.fukui.jp
+eiheiji.fukui.jp
+fukui.fukui.jp
+ikeda.fukui.jp
+katsuyama.fukui.jp
+mihama.fukui.jp
+minamiechizen.fukui.jp
+obama.fukui.jp
+ohi.fukui.jp
+ono.fukui.jp
+sabae.fukui.jp
+sakai.fukui.jp
+takahama.fukui.jp
+tsuruga.fukui.jp
+wakasa.fukui.jp
+ashiya.fukuoka.jp
+buzen.fukuoka.jp
+chikugo.fukuoka.jp
+chikuho.fukuoka.jp
+chikujo.fukuoka.jp
+chikushino.fukuoka.jp
+chikuzen.fukuoka.jp
+chuo.fukuoka.jp
+dazaifu.fukuoka.jp
+fukuchi.fukuoka.jp
+hakata.fukuoka.jp
+higashi.fukuoka.jp
+hirokawa.fukuoka.jp
+hisayama.fukuoka.jp
+iizuka.fukuoka.jp
+inatsuki.fukuoka.jp
+kaho.fukuoka.jp
+kasuga.fukuoka.jp
+kasuya.fukuoka.jp
+kawara.fukuoka.jp
+keisen.fukuoka.jp
+koga.fukuoka.jp
+kurate.fukuoka.jp
+kurogi.fukuoka.jp
+kurume.fukuoka.jp
+minami.fukuoka.jp
+miyako.fukuoka.jp
+miyama.fukuoka.jp
+miyawaka.fukuoka.jp
+mizumaki.fukuoka.jp
+munakata.fukuoka.jp
+nakagawa.fukuoka.jp
+nakama.fukuoka.jp
+nishi.fukuoka.jp
+nogata.fukuoka.jp
+ogori.fukuoka.jp
+okagaki.fukuoka.jp
+okawa.fukuoka.jp
+oki.fukuoka.jp
+omuta.fukuoka.jp
+onga.fukuoka.jp
+onojo.fukuoka.jp
+oto.fukuoka.jp
+saigawa.fukuoka.jp
+sasaguri.fukuoka.jp
+shingu.fukuoka.jp
+shinyoshitomi.fukuoka.jp
+shonai.fukuoka.jp
+soeda.fukuoka.jp
+sue.fukuoka.jp
+tachiarai.fukuoka.jp
+tagawa.fukuoka.jp
+takata.fukuoka.jp
+toho.fukuoka.jp
+toyotsu.fukuoka.jp
+tsuiki.fukuoka.jp
+ukiha.fukuoka.jp
+umi.fukuoka.jp
+usui.fukuoka.jp
+yamada.fukuoka.jp
+yame.fukuoka.jp
+yanagawa.fukuoka.jp
+yukuhashi.fukuoka.jp
+aizubange.fukushima.jp
+aizumisato.fukushima.jp
+aizuwakamatsu.fukushima.jp
+asakawa.fukushima.jp
+bandai.fukushima.jp
+date.fukushima.jp
+fukushima.fukushima.jp
+furudono.fukushima.jp
+futaba.fukushima.jp
+hanawa.fukushima.jp
+higashi.fukushima.jp
+hirata.fukushima.jp
+hirono.fukushima.jp
+iitate.fukushima.jp
+inawashiro.fukushima.jp
+ishikawa.fukushima.jp
+iwaki.fukushima.jp
+izumizaki.fukushima.jp
+kagamiishi.fukushima.jp
+kaneyama.fukushima.jp
+kawamata.fukushima.jp
+kitakata.fukushima.jp
+kitashiobara.fukushima.jp
+koori.fukushima.jp
+koriyama.fukushima.jp
+kunimi.fukushima.jp
+miharu.fukushima.jp
+mishima.fukushima.jp
+namie.fukushima.jp
+nango.fukushima.jp
+nishiaizu.fukushima.jp
+nishigo.fukushima.jp
+okuma.fukushima.jp
+omotego.fukushima.jp
+ono.fukushima.jp
+otama.fukushima.jp
+samegawa.fukushima.jp
+shimogo.fukushima.jp
+shirakawa.fukushima.jp
+showa.fukushima.jp
+soma.fukushima.jp
+sukagawa.fukushima.jp
+taishin.fukushima.jp
+tamakawa.fukushima.jp
+tanagura.fukushima.jp
+tenei.fukushima.jp
+yabuki.fukushima.jp
+yamato.fukushima.jp
+yamatsuri.fukushima.jp
+yanaizu.fukushima.jp
+yugawa.fukushima.jp
+anpachi.gifu.jp
+ena.gifu.jp
+gifu.gifu.jp
+ginan.gifu.jp
+godo.gifu.jp
+gujo.gifu.jp
+hashima.gifu.jp
+hichiso.gifu.jp
+hida.gifu.jp
+higashishirakawa.gifu.jp
+ibigawa.gifu.jp
+ikeda.gifu.jp
+kakamigahara.gifu.jp
+kani.gifu.jp
+kasahara.gifu.jp
+kasamatsu.gifu.jp
+kawaue.gifu.jp
+kitagata.gifu.jp
+mino.gifu.jp
+minokamo.gifu.jp
+mitake.gifu.jp
+mizunami.gifu.jp
+motosu.gifu.jp
+nakatsugawa.gifu.jp
+ogaki.gifu.jp
+sakahogi.gifu.jp
+seki.gifu.jp
+sekigahara.gifu.jp
+shirakawa.gifu.jp
+tajimi.gifu.jp
+takayama.gifu.jp
+tarui.gifu.jp
+toki.gifu.jp
+tomika.gifu.jp
+wanouchi.gifu.jp
+yamagata.gifu.jp
+yaotsu.gifu.jp
+yoro.gifu.jp
+annaka.gunma.jp
+chiyoda.gunma.jp
+fujioka.gunma.jp
+higashiagatsuma.gunma.jp
+isesaki.gunma.jp
+itakura.gunma.jp
+kanna.gunma.jp
+kanra.gunma.jp
+katashina.gunma.jp
+kawaba.gunma.jp
+kiryu.gunma.jp
+kusatsu.gunma.jp
+maebashi.gunma.jp
+meiwa.gunma.jp
+midori.gunma.jp
+minakami.gunma.jp
+naganohara.gunma.jp
+nakanojo.gunma.jp
+nanmoku.gunma.jp
+numata.gunma.jp
+oizumi.gunma.jp
+ora.gunma.jp
+ota.gunma.jp
+shibukawa.gunma.jp
+shimonita.gunma.jp
+shinto.gunma.jp
+showa.gunma.jp
+takasaki.gunma.jp
+takayama.gunma.jp
+tamamura.gunma.jp
+tatebayashi.gunma.jp
+tomioka.gunma.jp
+tsukiyono.gunma.jp
+tsumagoi.gunma.jp
+ueno.gunma.jp
+yoshioka.gunma.jp
+asaminami.hiroshima.jp
+daiwa.hiroshima.jp
+etajima.hiroshima.jp
+fuchu.hiroshima.jp
+fukuyama.hiroshima.jp
+hatsukaichi.hiroshima.jp
+higashihiroshima.hiroshima.jp
+hongo.hiroshima.jp
+jinsekikogen.hiroshima.jp
+kaita.hiroshima.jp
+kui.hiroshima.jp
+kumano.hiroshima.jp
+kure.hiroshima.jp
+mihara.hiroshima.jp
+miyoshi.hiroshima.jp
+naka.hiroshima.jp
+onomichi.hiroshima.jp
+osakikamijima.hiroshima.jp
+otake.hiroshima.jp
+saka.hiroshima.jp
+sera.hiroshima.jp
+seranishi.hiroshima.jp
+shinichi.hiroshima.jp
+shobara.hiroshima.jp
+takehara.hiroshima.jp
+abashiri.hokkaido.jp
+abira.hokkaido.jp
+aibetsu.hokkaido.jp
+akabira.hokkaido.jp
+akkeshi.hokkaido.jp
+asahikawa.hokkaido.jp
+ashibetsu.hokkaido.jp
+ashoro.hokkaido.jp
+assabu.hokkaido.jp
+atsuma.hokkaido.jp
+bibai.hokkaido.jp
+biei.hokkaido.jp
+bifuka.hokkaido.jp
+bihoro.hokkaido.jp
+biratori.hokkaido.jp
+chippubetsu.hokkaido.jp
+chitose.hokkaido.jp
+date.hokkaido.jp
+ebetsu.hokkaido.jp
+embetsu.hokkaido.jp
+eniwa.hokkaido.jp
+erimo.hokkaido.jp
+esan.hokkaido.jp
+esashi.hokkaido.jp
+fukagawa.hokkaido.jp
+fukushima.hokkaido.jp
+furano.hokkaido.jp
+furubira.hokkaido.jp
+haboro.hokkaido.jp
+hakodate.hokkaido.jp
+hamatonbetsu.hokkaido.jp
+hidaka.hokkaido.jp
+higashikagura.hokkaido.jp
+higashikawa.hokkaido.jp
+hiroo.hokkaido.jp
+hokuryu.hokkaido.jp
+hokuto.hokkaido.jp
+honbetsu.hokkaido.jp
+horokanai.hokkaido.jp
+horonobe.hokkaido.jp
+ikeda.hokkaido.jp
+imakane.hokkaido.jp
+ishikari.hokkaido.jp
+iwamizawa.hokkaido.jp
+iwanai.hokkaido.jp
+kamifurano.hokkaido.jp
+kamikawa.hokkaido.jp
+kamishihoro.hokkaido.jp
+kamisunagawa.hokkaido.jp
+kamoenai.hokkaido.jp
+kayabe.hokkaido.jp
+kembuchi.hokkaido.jp
+kikonai.hokkaido.jp
+kimobetsu.hokkaido.jp
+kitahiroshima.hokkaido.jp
+kitami.hokkaido.jp
+kiyosato.hokkaido.jp
+koshimizu.hokkaido.jp
+kunneppu.hokkaido.jp
+kuriyama.hokkaido.jp
+kuromatsunai.hokkaido.jp
+kushiro.hokkaido.jp
+kutchan.hokkaido.jp
+kyowa.hokkaido.jp
+mashike.hokkaido.jp
+matsumae.hokkaido.jp
+mikasa.hokkaido.jp
+minamifurano.hokkaido.jp
+mombetsu.hokkaido.jp
+moseushi.hokkaido.jp
+mukawa.hokkaido.jp
+muroran.hokkaido.jp
+naie.hokkaido.jp
+nakagawa.hokkaido.jp
+nakasatsunai.hokkaido.jp
+nakatombetsu.hokkaido.jp
+nanae.hokkaido.jp
+nanporo.hokkaido.jp
+nayoro.hokkaido.jp
+nemuro.hokkaido.jp
+niikappu.hokkaido.jp
+niki.hokkaido.jp
+nishiokoppe.hokkaido.jp
+noboribetsu.hokkaido.jp
+numata.hokkaido.jp
+obihiro.hokkaido.jp
+obira.hokkaido.jp
+oketo.hokkaido.jp
+okoppe.hokkaido.jp
+otaru.hokkaido.jp
+otobe.hokkaido.jp
+otofuke.hokkaido.jp
+otoineppu.hokkaido.jp
+oumu.hokkaido.jp
+ozora.hokkaido.jp
+pippu.hokkaido.jp
+rankoshi.hokkaido.jp
+rebun.hokkaido.jp
+rikubetsu.hokkaido.jp
+rishiri.hokkaido.jp
+rishirifuji.hokkaido.jp
+saroma.hokkaido.jp
+sarufutsu.hokkaido.jp
+shakotan.hokkaido.jp
+shari.hokkaido.jp
+shibecha.hokkaido.jp
+shibetsu.hokkaido.jp
+shikabe.hokkaido.jp
+shikaoi.hokkaido.jp
+shimamaki.hokkaido.jp
+shimizu.hokkaido.jp
+shimokawa.hokkaido.jp
+shinshinotsu.hokkaido.jp
+shintoku.hokkaido.jp
+shiranuka.hokkaido.jp
+shiraoi.hokkaido.jp
+shiriuchi.hokkaido.jp
+sobetsu.hokkaido.jp
+sunagawa.hokkaido.jp
+taiki.hokkaido.jp
+takasu.hokkaido.jp
+takikawa.hokkaido.jp
+takinoue.hokkaido.jp
+teshikaga.hokkaido.jp
+tobetsu.hokkaido.jp
+tohma.hokkaido.jp
+tomakomai.hokkaido.jp
+tomari.hokkaido.jp
+toya.hokkaido.jp
+toyako.hokkaido.jp
+toyotomi.hokkaido.jp
+toyoura.hokkaido.jp
+tsubetsu.hokkaido.jp
+tsukigata.hokkaido.jp
+urakawa.hokkaido.jp
+urausu.hokkaido.jp
+uryu.hokkaido.jp
+utashinai.hokkaido.jp
+wakkanai.hokkaido.jp
+wassamu.hokkaido.jp
+yakumo.hokkaido.jp
+yoichi.hokkaido.jp
+aioi.hyogo.jp
+akashi.hyogo.jp
+ako.hyogo.jp
+amagasaki.hyogo.jp
+aogaki.hyogo.jp
+asago.hyogo.jp
+ashiya.hyogo.jp
+awaji.hyogo.jp
+fukusaki.hyogo.jp
+goshiki.hyogo.jp
+harima.hyogo.jp
+himeji.hyogo.jp
+ichikawa.hyogo.jp
+inagawa.hyogo.jp
+itami.hyogo.jp
+kakogawa.hyogo.jp
+kamigori.hyogo.jp
+kamikawa.hyogo.jp
+kasai.hyogo.jp
+kasuga.hyogo.jp
+kawanishi.hyogo.jp
+miki.hyogo.jp
+minamiawaji.hyogo.jp
+nishinomiya.hyogo.jp
+nishiwaki.hyogo.jp
+ono.hyogo.jp
+sanda.hyogo.jp
+sannan.hyogo.jp
+sasayama.hyogo.jp
+sayo.hyogo.jp
+shingu.hyogo.jp
+shinonsen.hyogo.jp
+shiso.hyogo.jp
+sumoto.hyogo.jp
+taishi.hyogo.jp
+taka.hyogo.jp
+takarazuka.hyogo.jp
+takasago.hyogo.jp
+takino.hyogo.jp
+tamba.hyogo.jp
+tatsuno.hyogo.jp
+toyooka.hyogo.jp
+yabu.hyogo.jp
+yashiro.hyogo.jp
+yoka.hyogo.jp
+yokawa.hyogo.jp
+ami.ibaraki.jp
+asahi.ibaraki.jp
+bando.ibaraki.jp
+chikusei.ibaraki.jp
+daigo.ibaraki.jp
+fujishiro.ibaraki.jp
+hitachi.ibaraki.jp
+hitachinaka.ibaraki.jp
+hitachiomiya.ibaraki.jp
+hitachiota.ibaraki.jp
+ibaraki.ibaraki.jp
+ina.ibaraki.jp
+inashiki.ibaraki.jp
+itako.ibaraki.jp
+iwama.ibaraki.jp
+joso.ibaraki.jp
+kamisu.ibaraki.jp
+kasama.ibaraki.jp
+kashima.ibaraki.jp
+kasumigaura.ibaraki.jp
+koga.ibaraki.jp
+miho.ibaraki.jp
+mito.ibaraki.jp
+moriya.ibaraki.jp
+naka.ibaraki.jp
+namegata.ibaraki.jp
+oarai.ibaraki.jp
+ogawa.ibaraki.jp
+omitama.ibaraki.jp
+ryugasaki.ibaraki.jp
+sakai.ibaraki.jp
+sakuragawa.ibaraki.jp
+shimodate.ibaraki.jp
+shimotsuma.ibaraki.jp
+shirosato.ibaraki.jp
+sowa.ibaraki.jp
+suifu.ibaraki.jp
+takahagi.ibaraki.jp
+tamatsukuri.ibaraki.jp
+tokai.ibaraki.jp
+tomobe.ibaraki.jp
+tone.ibaraki.jp
+toride.ibaraki.jp
+tsuchiura.ibaraki.jp
+tsukuba.ibaraki.jp
+uchihara.ibaraki.jp
+ushiku.ibaraki.jp
+yachiyo.ibaraki.jp
+yamagata.ibaraki.jp
+yawara.ibaraki.jp
+yuki.ibaraki.jp
+anamizu.ishikawa.jp
+hakui.ishikawa.jp
+hakusan.ishikawa.jp
+kaga.ishikawa.jp
+kahoku.ishikawa.jp
+kanazawa.ishikawa.jp
+kawakita.ishikawa.jp
+komatsu.ishikawa.jp
+nakanoto.ishikawa.jp
+nanao.ishikawa.jp
+nomi.ishikawa.jp
+nonoichi.ishikawa.jp
+noto.ishikawa.jp
+shika.ishikawa.jp
+suzu.ishikawa.jp
+tsubata.ishikawa.jp
+tsurugi.ishikawa.jp
+uchinada.ishikawa.jp
+wajima.ishikawa.jp
+fudai.iwate.jp
+fujisawa.iwate.jp
+hanamaki.iwate.jp
+hiraizumi.iwate.jp
+hirono.iwate.jp
+ichinohe.iwate.jp
+ichinoseki.iwate.jp
+iwaizumi.iwate.jp
+iwate.iwate.jp
+joboji.iwate.jp
+kamaishi.iwate.jp
+kanegasaki.iwate.jp
+karumai.iwate.jp
+kawai.iwate.jp
+kitakami.iwate.jp
+kuji.iwate.jp
+kunohe.iwate.jp
+kuzumaki.iwate.jp
+miyako.iwate.jp
+mizusawa.iwate.jp
+morioka.iwate.jp
+ninohe.iwate.jp
+noda.iwate.jp
+ofunato.iwate.jp
+oshu.iwate.jp
+otsuchi.iwate.jp
+rikuzentakata.iwate.jp
+shiwa.iwate.jp
+shizukuishi.iwate.jp
+sumita.iwate.jp
+takizawa.iwate.jp
+tanohata.iwate.jp
+tono.iwate.jp
+yahaba.iwate.jp
+yamada.iwate.jp
+ayagawa.kagawa.jp
+higashikagawa.kagawa.jp
+kanonji.kagawa.jp
+kotohira.kagawa.jp
+manno.kagawa.jp
+marugame.kagawa.jp
+mitoyo.kagawa.jp
+naoshima.kagawa.jp
+sanuki.kagawa.jp
+tadotsu.kagawa.jp
+takamatsu.kagawa.jp
+tonosho.kagawa.jp
+uchinomi.kagawa.jp
+utazu.kagawa.jp
+zentsuji.kagawa.jp
+akune.kagoshima.jp
+amami.kagoshima.jp
+hioki.kagoshima.jp
+isa.kagoshima.jp
+isen.kagoshima.jp
+izumi.kagoshima.jp
+kagoshima.kagoshima.jp
+kanoya.kagoshima.jp
+kawanabe.kagoshima.jp
+kinko.kagoshima.jp
+kouyama.kagoshima.jp
+makurazaki.kagoshima.jp
+matsumoto.kagoshima.jp
+minamitane.kagoshima.jp
+nakatane.kagoshima.jp
+nishinoomote.kagoshima.jp
+satsumasendai.kagoshima.jp
+soo.kagoshima.jp
+tarumizu.kagoshima.jp
+yusui.kagoshima.jp
+aikawa.kanagawa.jp
+atsugi.kanagawa.jp
+ayase.kanagawa.jp
+chigasaki.kanagawa.jp
+ebina.kanagawa.jp
+fujisawa.kanagawa.jp
+hadano.kanagawa.jp
+hakone.kanagawa.jp
+hiratsuka.kanagawa.jp
+isehara.kanagawa.jp
+kaisei.kanagawa.jp
+kamakura.kanagawa.jp
+kiyokawa.kanagawa.jp
+matsuda.kanagawa.jp
+minamiashigara.kanagawa.jp
+miura.kanagawa.jp
+nakai.kanagawa.jp
+ninomiya.kanagawa.jp
+odawara.kanagawa.jp
+oi.kanagawa.jp
+oiso.kanagawa.jp
+sagamihara.kanagawa.jp
+samukawa.kanagawa.jp
+tsukui.kanagawa.jp
+yamakita.kanagawa.jp
+yamato.kanagawa.jp
+yokosuka.kanagawa.jp
+yugawara.kanagawa.jp
+zama.kanagawa.jp
+zushi.kanagawa.jp
+aki.kochi.jp
+geisei.kochi.jp
+hidaka.kochi.jp
+higashitsuno.kochi.jp
+ino.kochi.jp
+kagami.kochi.jp
+kami.kochi.jp
+kitagawa.kochi.jp
+kochi.kochi.jp
+mihara.kochi.jp
+motoyama.kochi.jp
+muroto.kochi.jp
+nahari.kochi.jp
+nakamura.kochi.jp
+nankoku.kochi.jp
+nishitosa.kochi.jp
+niyodogawa.kochi.jp
+ochi.kochi.jp
+okawa.kochi.jp
+otoyo.kochi.jp
+otsuki.kochi.jp
+sakawa.kochi.jp
+sukumo.kochi.jp
+susaki.kochi.jp
+tosa.kochi.jp
+tosashimizu.kochi.jp
+toyo.kochi.jp
+tsuno.kochi.jp
+umaji.kochi.jp
+yasuda.kochi.jp
+yusuhara.kochi.jp
+amakusa.kumamoto.jp
+arao.kumamoto.jp
+aso.kumamoto.jp
+choyo.kumamoto.jp
+gyokuto.kumamoto.jp
+hitoyoshi.kumamoto.jp
+kamiamakusa.kumamoto.jp
+kashima.kumamoto.jp
+kikuchi.kumamoto.jp
+kosa.kumamoto.jp
+kumamoto.kumamoto.jp
+mashiki.kumamoto.jp
+mifune.kumamoto.jp
+minamata.kumamoto.jp
+minamioguni.kumamoto.jp
+nagasu.kumamoto.jp
+nishihara.kumamoto.jp
+oguni.kumamoto.jp
+ozu.kumamoto.jp
+sumoto.kumamoto.jp
+takamori.kumamoto.jp
+uki.kumamoto.jp
+uto.kumamoto.jp
+yamaga.kumamoto.jp
+yamato.kumamoto.jp
+yatsushiro.kumamoto.jp
+ayabe.kyoto.jp
+fukuchiyama.kyoto.jp
+higashiyama.kyoto.jp
+ide.kyoto.jp
+ine.kyoto.jp
+joyo.kyoto.jp
+kameoka.kyoto.jp
+kamo.kyoto.jp
+kita.kyoto.jp
+kizu.kyoto.jp
+kumiyama.kyoto.jp
+kyotamba.kyoto.jp
+kyotanabe.kyoto.jp
+kyotango.kyoto.jp
+maizuru.kyoto.jp
+minami.kyoto.jp
+minamiyamashiro.kyoto.jp
+miyazu.kyoto.jp
+muko.kyoto.jp
+nagaokakyo.kyoto.jp
+nakagyo.kyoto.jp
+nantan.kyoto.jp
+oyamazaki.kyoto.jp
+sakyo.kyoto.jp
+seika.kyoto.jp
+tanabe.kyoto.jp
+uji.kyoto.jp
+ujitawara.kyoto.jp
+wazuka.kyoto.jp
+yamashina.kyoto.jp
+yawata.kyoto.jp
+asahi.mie.jp
+inabe.mie.jp
+ise.mie.jp
+kameyama.mie.jp
+kawagoe.mie.jp
+kiho.mie.jp
+kisosaki.mie.jp
+kiwa.mie.jp
+komono.mie.jp
+kumano.mie.jp
+kuwana.mie.jp
+matsusaka.mie.jp
+meiwa.mie.jp
+mihama.mie.jp
+minamiise.mie.jp
+misugi.mie.jp
+miyama.mie.jp
+nabari.mie.jp
+shima.mie.jp
+suzuka.mie.jp
+tado.mie.jp
+taiki.mie.jp
+taki.mie.jp
+tamaki.mie.jp
+toba.mie.jp
+tsu.mie.jp
+udono.mie.jp
+ureshino.mie.jp
+watarai.mie.jp
+yokkaichi.mie.jp
+furukawa.miyagi.jp
+higashimatsushima.miyagi.jp
+ishinomaki.miyagi.jp
+iwanuma.miyagi.jp
+kakuda.miyagi.jp
+kami.miyagi.jp
+kawasaki.miyagi.jp
+kesennuma.miyagi.jp
+marumori.miyagi.jp
+matsushima.miyagi.jp
+minamisanriku.miyagi.jp
+misato.miyagi.jp
+murata.miyagi.jp
+natori.miyagi.jp
+ogawara.miyagi.jp
+ohira.miyagi.jp
+onagawa.miyagi.jp
+osaki.miyagi.jp
+rifu.miyagi.jp
+semine.miyagi.jp
+shibata.miyagi.jp
+shichikashuku.miyagi.jp
+shikama.miyagi.jp
+shiogama.miyagi.jp
+shiroishi.miyagi.jp
+tagajo.miyagi.jp
+taiwa.miyagi.jp
+tome.miyagi.jp
+tomiya.miyagi.jp
+wakuya.miyagi.jp
+watari.miyagi.jp
+yamamoto.miyagi.jp
+zao.miyagi.jp
+aya.miyazaki.jp
+ebino.miyazaki.jp
+gokase.miyazaki.jp
+hyuga.miyazaki.jp
+kadogawa.miyazaki.jp
+kawaminami.miyazaki.jp
+kijo.miyazaki.jp
+kitagawa.miyazaki.jp
+kitakata.miyazaki.jp
+kitaura.miyazaki.jp
+kobayashi.miyazaki.jp
+kunitomi.miyazaki.jp
+kushima.miyazaki.jp
+mimata.miyazaki.jp
+miyakonojo.miyazaki.jp
+miyazaki.miyazaki.jp
+morotsuka.miyazaki.jp
+nichinan.miyazaki.jp
+nishimera.miyazaki.jp
+nobeoka.miyazaki.jp
+saito.miyazaki.jp
+shiiba.miyazaki.jp
+shintomi.miyazaki.jp
+takaharu.miyazaki.jp
+takanabe.miyazaki.jp
+takazaki.miyazaki.jp
+tsuno.miyazaki.jp
+achi.nagano.jp
+agematsu.nagano.jp
+anan.nagano.jp
+aoki.nagano.jp
+asahi.nagano.jp
+azumino.nagano.jp
+chikuhoku.nagano.jp
+chikuma.nagano.jp
+chino.nagano.jp
+fujimi.nagano.jp
+hakuba.nagano.jp
+hara.nagano.jp
+hiraya.nagano.jp
+iida.nagano.jp
+iijima.nagano.jp
+iiyama.nagano.jp
+iizuna.nagano.jp
+ikeda.nagano.jp
+ikusaka.nagano.jp
+ina.nagano.jp
+karuizawa.nagano.jp
+kawakami.nagano.jp
+kiso.nagano.jp
+kisofukushima.nagano.jp
+kitaaiki.nagano.jp
+komagane.nagano.jp
+komoro.nagano.jp
+matsukawa.nagano.jp
+matsumoto.nagano.jp
+miasa.nagano.jp
+minamiaiki.nagano.jp
+minamimaki.nagano.jp
+minamiminowa.nagano.jp
+minowa.nagano.jp
+miyada.nagano.jp
+miyota.nagano.jp
+mochizuki.nagano.jp
+nagano.nagano.jp
+nagawa.nagano.jp
+nagiso.nagano.jp
+nakagawa.nagano.jp
+nakano.nagano.jp
+nozawaonsen.nagano.jp
+obuse.nagano.jp
+ogawa.nagano.jp
+okaya.nagano.jp
+omachi.nagano.jp
+omi.nagano.jp
+ookuwa.nagano.jp
+ooshika.nagano.jp
+otaki.nagano.jp
+otari.nagano.jp
+sakae.nagano.jp
+sakaki.nagano.jp
+saku.nagano.jp
+sakuho.nagano.jp
+shimosuwa.nagano.jp
+shinanomachi.nagano.jp
+shiojiri.nagano.jp
+suwa.nagano.jp
+suzaka.nagano.jp
+takagi.nagano.jp
+takamori.nagano.jp
+takayama.nagano.jp
+tateshina.nagano.jp
+tatsuno.nagano.jp
+togakushi.nagano.jp
+togura.nagano.jp
+tomi.nagano.jp
+ueda.nagano.jp
+wada.nagano.jp
+yamagata.nagano.jp
+yamanouchi.nagano.jp
+yasaka.nagano.jp
+yasuoka.nagano.jp
+chijiwa.nagasaki.jp
+futsu.nagasaki.jp
+goto.nagasaki.jp
+hasami.nagasaki.jp
+hirado.nagasaki.jp
+iki.nagasaki.jp
+isahaya.nagasaki.jp
+kawatana.nagasaki.jp
+kuchinotsu.nagasaki.jp
+matsuura.nagasaki.jp
+nagasaki.nagasaki.jp
+obama.nagasaki.jp
+omura.nagasaki.jp
+oseto.nagasaki.jp
+saikai.nagasaki.jp
+sasebo.nagasaki.jp
+seihi.nagasaki.jp
+shimabara.nagasaki.jp
+shinkamigoto.nagasaki.jp
+togitsu.nagasaki.jp
+tsushima.nagasaki.jp
+unzen.nagasaki.jp
+ando.nara.jp
+gose.nara.jp
+heguri.nara.jp
+higashiyoshino.nara.jp
+ikaruga.nara.jp
+ikoma.nara.jp
+kamikitayama.nara.jp
+kanmaki.nara.jp
+kashiba.nara.jp
+kashihara.nara.jp
+katsuragi.nara.jp
+kawai.nara.jp
+kawakami.nara.jp
+kawanishi.nara.jp
+koryo.nara.jp
+kurotaki.nara.jp
+mitsue.nara.jp
+miyake.nara.jp
+nara.nara.jp
+nosegawa.nara.jp
+oji.nara.jp
+ouda.nara.jp
+oyodo.nara.jp
+sakurai.nara.jp
+sango.nara.jp
+shimoichi.nara.jp
+shimokitayama.nara.jp
+shinjo.nara.jp
+soni.nara.jp
+takatori.nara.jp
+tawaramoto.nara.jp
+tenkawa.nara.jp
+tenri.nara.jp
+uda.nara.jp
+yamatokoriyama.nara.jp
+yamatotakada.nara.jp
+yamazoe.nara.jp
+yoshino.nara.jp
+aga.niigata.jp
+agano.niigata.jp
+gosen.niigata.jp
+itoigawa.niigata.jp
+izumozaki.niigata.jp
+joetsu.niigata.jp
+kamo.niigata.jp
+kariwa.niigata.jp
+kashiwazaki.niigata.jp
+minamiuonuma.niigata.jp
+mitsuke.niigata.jp
+muika.niigata.jp
+murakami.niigata.jp
+myoko.niigata.jp
+nagaoka.niigata.jp
+niigata.niigata.jp
+ojiya.niigata.jp
+omi.niigata.jp
+sado.niigata.jp
+sanjo.niigata.jp
+seiro.niigata.jp
+seirou.niigata.jp
+sekikawa.niigata.jp
+shibata.niigata.jp
+tagami.niigata.jp
+tainai.niigata.jp
+tochio.niigata.jp
+tokamachi.niigata.jp
+tsubame.niigata.jp
+tsunan.niigata.jp
+uonuma.niigata.jp
+yahiko.niigata.jp
+yoita.niigata.jp
+yuzawa.niigata.jp
+beppu.oita.jp
+bungoono.oita.jp
+bungotakada.oita.jp
+hasama.oita.jp
+hiji.oita.jp
+himeshima.oita.jp
+hita.oita.jp
+kamitsue.oita.jp
+kokonoe.oita.jp
+kuju.oita.jp
+kunisaki.oita.jp
+kusu.oita.jp
+oita.oita.jp
+saiki.oita.jp
+taketa.oita.jp
+tsukumi.oita.jp
+usa.oita.jp
+usuki.oita.jp
+yufu.oita.jp
+akaiwa.okayama.jp
+asakuchi.okayama.jp
+bizen.okayama.jp
+hayashima.okayama.jp
+ibara.okayama.jp
+kagamino.okayama.jp
+kasaoka.okayama.jp
+kibichuo.okayama.jp
+kumenan.okayama.jp
+kurashiki.okayama.jp
+maniwa.okayama.jp
+misaki.okayama.jp
+nagi.okayama.jp
+niimi.okayama.jp
+nishiawakura.okayama.jp
+okayama.okayama.jp
+satosho.okayama.jp
+setouchi.okayama.jp
+shinjo.okayama.jp
+shoo.okayama.jp
+soja.okayama.jp
+takahashi.okayama.jp
+tamano.okayama.jp
+tsuyama.okayama.jp
+wake.okayama.jp
+yakage.okayama.jp
+aguni.okinawa.jp
+ginowan.okinawa.jp
+ginoza.okinawa.jp
+gushikami.okinawa.jp
+haebaru.okinawa.jp
+higashi.okinawa.jp
+hirara.okinawa.jp
+iheya.okinawa.jp
+ishigaki.okinawa.jp
+ishikawa.okinawa.jp
+itoman.okinawa.jp
+izena.okinawa.jp
+kadena.okinawa.jp
+kin.okinawa.jp
+kitadaito.okinawa.jp
+kitanakagusuku.okinawa.jp
+kumejima.okinawa.jp
+kunigami.okinawa.jp
+minamidaito.okinawa.jp
+motobu.okinawa.jp
+nago.okinawa.jp
+naha.okinawa.jp
+nakagusuku.okinawa.jp
+nakijin.okinawa.jp
+nanjo.okinawa.jp
+nishihara.okinawa.jp
+ogimi.okinawa.jp
+okinawa.okinawa.jp
+onna.okinawa.jp
+shimoji.okinawa.jp
+taketomi.okinawa.jp
+tarama.okinawa.jp
+tokashiki.okinawa.jp
+tomigusuku.okinawa.jp
+tonaki.okinawa.jp
+urasoe.okinawa.jp
+uruma.okinawa.jp
+yaese.okinawa.jp
+yomitan.okinawa.jp
+yonabaru.okinawa.jp
+yonaguni.okinawa.jp
+zamami.okinawa.jp
+abeno.osaka.jp
+chihayaakasaka.osaka.jp
+chuo.osaka.jp
+daito.osaka.jp
+fujiidera.osaka.jp
+habikino.osaka.jp
+hannan.osaka.jp
+higashiosaka.osaka.jp
+higashisumiyoshi.osaka.jp
+higashiyodogawa.osaka.jp
+hirakata.osaka.jp
+ibaraki.osaka.jp
+ikeda.osaka.jp
+izumi.osaka.jp
+izumiotsu.osaka.jp
+izumisano.osaka.jp
+kadoma.osaka.jp
+kaizuka.osaka.jp
+kanan.osaka.jp
+kashiwara.osaka.jp
+katano.osaka.jp
+kawachinagano.osaka.jp
+kishiwada.osaka.jp
+kita.osaka.jp
+kumatori.osaka.jp
+matsubara.osaka.jp
+minato.osaka.jp
+minoh.osaka.jp
+misaki.osaka.jp
+moriguchi.osaka.jp
+neyagawa.osaka.jp
+nishi.osaka.jp
+nose.osaka.jp
+osakasayama.osaka.jp
+sakai.osaka.jp
+sayama.osaka.jp
+sennan.osaka.jp
+settsu.osaka.jp
+shijonawate.osaka.jp
+shimamoto.osaka.jp
+suita.osaka.jp
+tadaoka.osaka.jp
+taishi.osaka.jp
+tajiri.osaka.jp
+takaishi.osaka.jp
+takatsuki.osaka.jp
+tondabayashi.osaka.jp
+toyonaka.osaka.jp
+toyono.osaka.jp
+yao.osaka.jp
+ariake.saga.jp
+arita.saga.jp
+fukudomi.saga.jp
+genkai.saga.jp
+hamatama.saga.jp
+hizen.saga.jp
+imari.saga.jp
+kamimine.saga.jp
+kanzaki.saga.jp
+karatsu.saga.jp
+kashima.saga.jp
+kitagata.saga.jp
+kitahata.saga.jp
+kiyama.saga.jp
+kouhoku.saga.jp
+kyuragi.saga.jp
+nishiarita.saga.jp
+ogi.saga.jp
+omachi.saga.jp
+ouchi.saga.jp
+saga.saga.jp
+shiroishi.saga.jp
+taku.saga.jp
+tara.saga.jp
+tosu.saga.jp
+yoshinogari.saga.jp
+arakawa.saitama.jp
+asaka.saitama.jp
+chichibu.saitama.jp
+fujimi.saitama.jp
+fujimino.saitama.jp
+fukaya.saitama.jp
+hanno.saitama.jp
+hanyu.saitama.jp
+hasuda.saitama.jp
+hatogaya.saitama.jp
+hatoyama.saitama.jp
+hidaka.saitama.jp
+higashichichibu.saitama.jp
+higashimatsuyama.saitama.jp
+honjo.saitama.jp
+ina.saitama.jp
+iruma.saitama.jp
+iwatsuki.saitama.jp
+kamiizumi.saitama.jp
+kamikawa.saitama.jp
+kamisato.saitama.jp
+kasukabe.saitama.jp
+kawagoe.saitama.jp
+kawaguchi.saitama.jp
+kawajima.saitama.jp
+kazo.saitama.jp
+kitamoto.saitama.jp
+koshigaya.saitama.jp
+kounosu.saitama.jp
+kuki.saitama.jp
+kumagaya.saitama.jp
+matsubushi.saitama.jp
+minano.saitama.jp
+misato.saitama.jp
+miyashiro.saitama.jp
+miyoshi.saitama.jp
+moroyama.saitama.jp
+nagatoro.saitama.jp
+namegawa.saitama.jp
+niiza.saitama.jp
+ogano.saitama.jp
+ogawa.saitama.jp
+ogose.saitama.jp
+okegawa.saitama.jp
+omiya.saitama.jp
+otaki.saitama.jp
+ranzan.saitama.jp
+ryokami.saitama.jp
+saitama.saitama.jp
+sakado.saitama.jp
+satte.saitama.jp
+sayama.saitama.jp
+shiki.saitama.jp
+shiraoka.saitama.jp
+soka.saitama.jp
+sugito.saitama.jp
+toda.saitama.jp
+tokigawa.saitama.jp
+tokorozawa.saitama.jp
+tsurugashima.saitama.jp
+urawa.saitama.jp
+warabi.saitama.jp
+yashio.saitama.jp
+yokoze.saitama.jp
+yono.saitama.jp
+yorii.saitama.jp
+yoshida.saitama.jp
+yoshikawa.saitama.jp
+yoshimi.saitama.jp
+aisho.shiga.jp
+gamo.shiga.jp
+higashiomi.shiga.jp
+hikone.shiga.jp
+koka.shiga.jp
+konan.shiga.jp
+kosei.shiga.jp
+koto.shiga.jp
+kusatsu.shiga.jp
+maibara.shiga.jp
+moriyama.shiga.jp
+nagahama.shiga.jp
+nishiazai.shiga.jp
+notogawa.shiga.jp
+omihachiman.shiga.jp
+otsu.shiga.jp
+ritto.shiga.jp
+ryuoh.shiga.jp
+takashima.shiga.jp
+takatsuki.shiga.jp
+torahime.shiga.jp
+toyosato.shiga.jp
+yasu.shiga.jp
+akagi.shimane.jp
+ama.shimane.jp
+gotsu.shimane.jp
+hamada.shimane.jp
+higashiizumo.shimane.jp
+hikawa.shimane.jp
+hikimi.shimane.jp
+izumo.shimane.jp
+kakinoki.shimane.jp
+masuda.shimane.jp
+matsue.shimane.jp
+misato.shimane.jp
+nishinoshima.shimane.jp
+ohda.shimane.jp
+okinoshima.shimane.jp
+okuizumo.shimane.jp
+shimane.shimane.jp
+tamayu.shimane.jp
+tsuwano.shimane.jp
+unnan.shimane.jp
+yakumo.shimane.jp
+yasugi.shimane.jp
+yatsuka.shimane.jp
+arai.shizuoka.jp
+atami.shizuoka.jp
+fuji.shizuoka.jp
+fujieda.shizuoka.jp
+fujikawa.shizuoka.jp
+fujinomiya.shizuoka.jp
+fukuroi.shizuoka.jp
+gotemba.shizuoka.jp
+haibara.shizuoka.jp
+hamamatsu.shizuoka.jp
+higashiizu.shizuoka.jp
+ito.shizuoka.jp
+iwata.shizuoka.jp
+izu.shizuoka.jp
+izunokuni.shizuoka.jp
+kakegawa.shizuoka.jp
+kannami.shizuoka.jp
+kawanehon.shizuoka.jp
+kawazu.shizuoka.jp
+kikugawa.shizuoka.jp
+kosai.shizuoka.jp
+makinohara.shizuoka.jp
+matsuzaki.shizuoka.jp
+minamiizu.shizuoka.jp
+mishima.shizuoka.jp
+morimachi.shizuoka.jp
+nishiizu.shizuoka.jp
+numazu.shizuoka.jp
+omaezaki.shizuoka.jp
+shimada.shizuoka.jp
+shimizu.shizuoka.jp
+shimoda.shizuoka.jp
+shizuoka.shizuoka.jp
+susono.shizuoka.jp
+yaizu.shizuoka.jp
+yoshida.shizuoka.jp
+ashikaga.tochigi.jp
+bato.tochigi.jp
+haga.tochigi.jp
+ichikai.tochigi.jp
+iwafune.tochigi.jp
+kaminokawa.tochigi.jp
+kanuma.tochigi.jp
+karasuyama.tochigi.jp
+kuroiso.tochigi.jp
+mashiko.tochigi.jp
+mibu.tochigi.jp
+moka.tochigi.jp
+motegi.tochigi.jp
+nasu.tochigi.jp
+nasushiobara.tochigi.jp
+nikko.tochigi.jp
+nishikata.tochigi.jp
+nogi.tochigi.jp
+ohira.tochigi.jp
+ohtawara.tochigi.jp
+oyama.tochigi.jp
+sakura.tochigi.jp
+sano.tochigi.jp
+shimotsuke.tochigi.jp
+shioya.tochigi.jp
+takanezawa.tochigi.jp
+tochigi.tochigi.jp
+tsuga.tochigi.jp
+ujiie.tochigi.jp
+utsunomiya.tochigi.jp
+yaita.tochigi.jp
+aizumi.tokushima.jp
+anan.tokushima.jp
+ichiba.tokushima.jp
+itano.tokushima.jp
+kainan.tokushima.jp
+komatsushima.tokushima.jp
+matsushige.tokushima.jp
+mima.tokushima.jp
+minami.tokushima.jp
+miyoshi.tokushima.jp
+mugi.tokushima.jp
+nakagawa.tokushima.jp
+naruto.tokushima.jp
+sanagochi.tokushima.jp
+shishikui.tokushima.jp
+tokushima.tokushima.jp
+wajiki.tokushima.jp
+adachi.tokyo.jp
+akiruno.tokyo.jp
+akishima.tokyo.jp
+aogashima.tokyo.jp
+arakawa.tokyo.jp
+bunkyo.tokyo.jp
+chiyoda.tokyo.jp
+chofu.tokyo.jp
+chuo.tokyo.jp
+edogawa.tokyo.jp
+fuchu.tokyo.jp
+fussa.tokyo.jp
+hachijo.tokyo.jp
+hachioji.tokyo.jp
+hamura.tokyo.jp
+higashikurume.tokyo.jp
+higashimurayama.tokyo.jp
+higashiyamato.tokyo.jp
+hino.tokyo.jp
+hinode.tokyo.jp
+hinohara.tokyo.jp
+inagi.tokyo.jp
+itabashi.tokyo.jp
+katsushika.tokyo.jp
+kita.tokyo.jp
+kiyose.tokyo.jp
+kodaira.tokyo.jp
+koganei.tokyo.jp
+kokubunji.tokyo.jp
+komae.tokyo.jp
+koto.tokyo.jp
+kouzushima.tokyo.jp
+kunitachi.tokyo.jp
+machida.tokyo.jp
+meguro.tokyo.jp
+minato.tokyo.jp
+mitaka.tokyo.jp
+mizuho.tokyo.jp
+musashimurayama.tokyo.jp
+musashino.tokyo.jp
+nakano.tokyo.jp
+nerima.tokyo.jp
+ogasawara.tokyo.jp
+okutama.tokyo.jp
+ome.tokyo.jp
+oshima.tokyo.jp
+ota.tokyo.jp
+setagaya.tokyo.jp
+shibuya.tokyo.jp
+shinagawa.tokyo.jp
+shinjuku.tokyo.jp
+suginami.tokyo.jp
+sumida.tokyo.jp
+tachikawa.tokyo.jp
+taito.tokyo.jp
+tama.tokyo.jp
+toshima.tokyo.jp
+chizu.tottori.jp
+hino.tottori.jp
+kawahara.tottori.jp
+koge.tottori.jp
+kotoura.tottori.jp
+misasa.tottori.jp
+nanbu.tottori.jp
+nichinan.tottori.jp
+sakaiminato.tottori.jp
+tottori.tottori.jp
+wakasa.tottori.jp
+yazu.tottori.jp
+yonago.tottori.jp
+asahi.toyama.jp
+fuchu.toyama.jp
+fukumitsu.toyama.jp
+funahashi.toyama.jp
+himi.toyama.jp
+imizu.toyama.jp
+inami.toyama.jp
+johana.toyama.jp
+kamiichi.toyama.jp
+kurobe.toyama.jp
+nakaniikawa.toyama.jp
+namerikawa.toyama.jp
+nanto.toyama.jp
+nyuzen.toyama.jp
+oyabe.toyama.jp
+taira.toyama.jp
+takaoka.toyama.jp
+tateyama.toyama.jp
+toga.toyama.jp
+tonami.toyama.jp
+toyama.toyama.jp
+unazuki.toyama.jp
+uozu.toyama.jp
+yamada.toyama.jp
+arida.wakayama.jp
+aridagawa.wakayama.jp
+gobo.wakayama.jp
+hashimoto.wakayama.jp
+hidaka.wakayama.jp
+hirogawa.wakayama.jp
+inami.wakayama.jp
+iwade.wakayama.jp
+kainan.wakayama.jp
+kamitonda.wakayama.jp
+katsuragi.wakayama.jp
+kimino.wakayama.jp
+kinokawa.wakayama.jp
+kitayama.wakayama.jp
+koya.wakayama.jp
+koza.wakayama.jp
+kozagawa.wakayama.jp
+kudoyama.wakayama.jp
+kushimoto.wakayama.jp
+mihama.wakayama.jp
+misato.wakayama.jp
+nachikatsuura.wakayama.jp
+shingu.wakayama.jp
+shirahama.wakayama.jp
+taiji.wakayama.jp
+tanabe.wakayama.jp
+wakayama.wakayama.jp
+yuasa.wakayama.jp
+yura.wakayama.jp
+asahi.yamagata.jp
+funagata.yamagata.jp
+higashine.yamagata.jp
+iide.yamagata.jp
+kahoku.yamagata.jp
+kaminoyama.yamagata.jp
+kaneyama.yamagata.jp
+kawanishi.yamagata.jp
+mamurogawa.yamagata.jp
+mikawa.yamagata.jp
+murayama.yamagata.jp
+nagai.yamagata.jp
+nakayama.yamagata.jp
+nanyo.yamagata.jp
+nishikawa.yamagata.jp
+obanazawa.yamagata.jp
+oe.yamagata.jp
+oguni.yamagata.jp
+ohkura.yamagata.jp
+oishida.yamagata.jp
+sagae.yamagata.jp
+sakata.yamagata.jp
+sakegawa.yamagata.jp
+shinjo.yamagata.jp
+shirataka.yamagata.jp
+shonai.yamagata.jp
+takahata.yamagata.jp
+tendo.yamagata.jp
+tozawa.yamagata.jp
+tsuruoka.yamagata.jp
+yamagata.yamagata.jp
+yamanobe.yamagata.jp
+yonezawa.yamagata.jp
+yuza.yamagata.jp
+abu.yamaguchi.jp
+hagi.yamaguchi.jp
+hikari.yamaguchi.jp
+hofu.yamaguchi.jp
+iwakuni.yamaguchi.jp
+kudamatsu.yamaguchi.jp
+mitou.yamaguchi.jp
+nagato.yamaguchi.jp
+oshima.yamaguchi.jp
+shimonoseki.yamaguchi.jp
+shunan.yamaguchi.jp
+tabuse.yamaguchi.jp
+tokuyama.yamaguchi.jp
+toyota.yamaguchi.jp
+ube.yamaguchi.jp
+yuu.yamaguchi.jp
+chuo.yamanashi.jp
+doshi.yamanashi.jp
+fuefuki.yamanashi.jp
+fujikawa.yamanashi.jp
+fujikawaguchiko.yamanashi.jp
+fujiyoshida.yamanashi.jp
+hayakawa.yamanashi.jp
+hokuto.yamanashi.jp
+ichikawamisato.yamanashi.jp
+kai.yamanashi.jp
+kofu.yamanashi.jp
+koshu.yamanashi.jp
+kosuge.yamanashi.jp
+minami-alps.yamanashi.jp
+minobu.yamanashi.jp
+nakamichi.yamanashi.jp
+nanbu.yamanashi.jp
+narusawa.yamanashi.jp
+nirasaki.yamanashi.jp
+nishikatsura.yamanashi.jp
+oshino.yamanashi.jp
+otsuki.yamanashi.jp
+showa.yamanashi.jp
+tabayama.yamanashi.jp
+tsuru.yamanashi.jp
+uenohara.yamanashi.jp
+yamanakako.yamanashi.jp
+yamanashi.yamanashi.jp
+
+// ke : http://www.kenic.or.ke/index.php?option=com_content&task=view&id=117&Itemid=145
+*.ke
+
+// kg : http://www.domain.kg/dmn_n.html
+kg
+org.kg
+net.kg
+com.kg
+edu.kg
+gov.kg
+mil.kg
+
+// kh : http://www.mptc.gov.kh/dns_registration.htm
+*.kh
+
+// ki : http://www.ki/dns/index.html
+ki
+edu.ki
+biz.ki
+net.ki
+org.ki
+gov.ki
+info.ki
+com.ki
+
+// km : http://en.wikipedia.org/wiki/.km
+// http://www.domaine.km/documents/charte.doc
+km
+org.km
+nom.km
+gov.km
+prd.km
+tm.km
+edu.km
+mil.km
+ass.km
+com.km
+// These are only mentioned as proposed suggestions at domaine.km, but
+// http://en.wikipedia.org/wiki/.km says they're available for registration:
+coop.km
+asso.km
+presse.km
+medecin.km
+notaires.km
+pharmaciens.km
+veterinaire.km
+gouv.km
+
+// kn : http://en.wikipedia.org/wiki/.kn
+// http://www.dot.kn/domainRules.html
+kn
+net.kn
+org.kn
+edu.kn
+gov.kn
+
+// kp : http://www.kcce.kp/en_index.php
+com.kp
+edu.kp
+gov.kp
+org.kp
+rep.kp
+tra.kp
+
+// kr : http://en.wikipedia.org/wiki/.kr
+// see also: http://domain.nida.or.kr/eng/registration.jsp
+kr
+ac.kr
+co.kr
+es.kr
+go.kr
+hs.kr
+kg.kr
+mil.kr
+ms.kr
+ne.kr
+or.kr
+pe.kr
+re.kr
+sc.kr
+// kr geographical names
+busan.kr
+chungbuk.kr
+chungnam.kr
+daegu.kr
+daejeon.kr
+gangwon.kr
+gwangju.kr
+gyeongbuk.kr
+gyeonggi.kr
+gyeongnam.kr
+incheon.kr
+jeju.kr
+jeonbuk.kr
+jeonnam.kr
+seoul.kr
+ulsan.kr
+
+// kw : http://en.wikipedia.org/wiki/.kw
+*.kw
+
+// ky : http://www.icta.ky/da_ky_reg_dom.php
+// Confirmed by registry 2008-06-17
+ky
+edu.ky
+gov.ky
+com.ky
+org.ky
+net.ky
+
+// kz : http://en.wikipedia.org/wiki/.kz
+// see also: http://www.nic.kz/rules/index.jsp
+kz
+org.kz
+edu.kz
+net.kz
+gov.kz
+mil.kz
+com.kz
+
+// la : http://en.wikipedia.org/wiki/.la
+// Submitted by registry 2008-06-10
+la
+int.la
+net.la
+info.la
+edu.la
+gov.la
+per.la
+com.la
+org.la
+
+// lb : http://en.wikipedia.org/wiki/.lb
+// Submitted by registry 2008-06-17
+com.lb
+edu.lb
+gov.lb
+net.lb
+org.lb
+
+// lc : http://en.wikipedia.org/wiki/.lc
+// see also: http://www.nic.lc/rules.htm
+lc
+com.lc
+net.lc
+co.lc
+org.lc
+edu.lc
+gov.lc
+
+// li : http://en.wikipedia.org/wiki/.li
+li
+
+// lk : http://www.nic.lk/seclevpr.html
+lk
+gov.lk
+sch.lk
+net.lk
+int.lk
+com.lk
+org.lk
+edu.lk
+ngo.lk
+soc.lk
+web.lk
+ltd.lk
+assn.lk
+grp.lk
+hotel.lk
+
+// lr : http://psg.com/dns/lr/lr.txt
+// Submitted by registry 2008-06-17
+com.lr
+edu.lr
+gov.lr
+org.lr
+net.lr
+
+// ls : http://en.wikipedia.org/wiki/.ls
+ls
+co.ls
+org.ls
+
+// lt : http://en.wikipedia.org/wiki/.lt
+lt
+// gov.lt : http://www.gov.lt/index_en.php
+gov.lt
+
+// lu : http://www.dns.lu/en/
+lu
+
+// lv : http://www.nic.lv/DNS/En/generic.php
+lv
+com.lv
+edu.lv
+gov.lv
+org.lv
+mil.lv
+id.lv
+net.lv
+asn.lv
+conf.lv
+
+// ly : http://www.nic.ly/regulations.php
+ly
+com.ly
+net.ly
+gov.ly
+plc.ly
+edu.ly
+sch.ly
+med.ly
+org.ly
+id.ly
+
+// ma : http://en.wikipedia.org/wiki/.ma
+// http://www.anrt.ma/fr/admin/download/upload/file_fr782.pdf
+ma
+co.ma
+net.ma
+gov.ma
+org.ma
+ac.ma
+press.ma
+
+// mc : http://www.nic.mc/
+mc
+tm.mc
+asso.mc
+
+// md : http://en.wikipedia.org/wiki/.md
+md
+
+// me : http://en.wikipedia.org/wiki/.me
+me
+co.me
+net.me
+org.me
+edu.me
+ac.me
+gov.me
+its.me
+priv.me
+
+// mg : http://www.nic.mg/tarif.htm
+mg
+org.mg
+nom.mg
+gov.mg
+prd.mg
+tm.mg
+edu.mg
+mil.mg
+com.mg
+
+// mh : http://en.wikipedia.org/wiki/.mh
+mh
+
+// mil : http://en.wikipedia.org/wiki/.mil
+mil
+
+// mk : http://en.wikipedia.org/wiki/.mk
+// see also: http://dns.marnet.net.mk/postapka.php
+mk
+com.mk
+org.mk
+net.mk
+edu.mk
+gov.mk
+inf.mk
+name.mk
+
+// ml : http://www.gobin.info/domainname/ml-template.doc
+// see also: http://en.wikipedia.org/wiki/.ml
+ml
+com.ml
+edu.ml
+gouv.ml
+gov.ml
+net.ml
+org.ml
+presse.ml
+
+// mm : http://en.wikipedia.org/wiki/.mm
+*.mm
+
+// mn : http://en.wikipedia.org/wiki/.mn
+mn
+gov.mn
+edu.mn
+org.mn
+
+// mo : http://www.monic.net.mo/
+mo
+com.mo
+net.mo
+org.mo
+edu.mo
+gov.mo
+
+// mobi : http://en.wikipedia.org/wiki/.mobi
+mobi
+
+// mp : http://www.dot.mp/
+// Confirmed by registry 2008-06-17
+mp
+
+// mq : http://en.wikipedia.org/wiki/.mq
+mq
+
+// mr : http://en.wikipedia.org/wiki/.mr
+mr
+gov.mr
+
+// ms : http://en.wikipedia.org/wiki/.ms
+ms
+
+// mt : https://www.nic.org.mt/dotmt/
+*.mt
+
+// mu : http://en.wikipedia.org/wiki/.mu
+mu
+com.mu
+net.mu
+org.mu
+gov.mu
+ac.mu
+co.mu
+or.mu
+
+// museum : http://about.museum/naming/
+// http://index.museum/
+museum
+academy.museum
+agriculture.museum
+air.museum
+airguard.museum
+alabama.museum
+alaska.museum
+amber.museum
+ambulance.museum
+american.museum
+americana.museum
+americanantiques.museum
+americanart.museum
+amsterdam.museum
+and.museum
+annefrank.museum
+anthro.museum
+anthropology.museum
+antiques.museum
+aquarium.museum
+arboretum.museum
+archaeological.museum
+archaeology.museum
+architecture.museum
+art.museum
+artanddesign.museum
+artcenter.museum
+artdeco.museum
+arteducation.museum
+artgallery.museum
+arts.museum
+artsandcrafts.museum
+asmatart.museum
+assassination.museum
+assisi.museum
+association.museum
+astronomy.museum
+atlanta.museum
+austin.museum
+australia.museum
+automotive.museum
+aviation.museum
+axis.museum
+badajoz.museum
+baghdad.museum
+bahn.museum
+bale.museum
+baltimore.museum
+barcelona.museum
+baseball.museum
+basel.museum
+baths.museum
+bauern.museum
+beauxarts.museum
+beeldengeluid.museum
+bellevue.museum
+bergbau.museum
+berkeley.museum
+berlin.museum
+bern.museum
+bible.museum
+bilbao.museum
+bill.museum
+birdart.museum
+birthplace.museum
+bonn.museum
+boston.museum
+botanical.museum
+botanicalgarden.museum
+botanicgarden.museum
+botany.museum
+brandywinevalley.museum
+brasil.museum
+bristol.museum
+british.museum
+britishcolumbia.museum
+broadcast.museum
+brunel.museum
+brussel.museum
+brussels.museum
+bruxelles.museum
+building.museum
+burghof.museum
+bus.museum
+bushey.museum
+cadaques.museum
+california.museum
+cambridge.museum
+can.museum
+canada.museum
+capebreton.museum
+carrier.museum
+cartoonart.museum
+casadelamoneda.museum
+castle.museum
+castres.museum
+celtic.museum
+center.museum
+chattanooga.museum
+cheltenham.museum
+chesapeakebay.museum
+chicago.museum
+children.museum
+childrens.museum
+childrensgarden.museum
+chiropractic.museum
+chocolate.museum
+christiansburg.museum
+cincinnati.museum
+cinema.museum
+circus.museum
+civilisation.museum
+civilization.museum
+civilwar.museum
+clinton.museum
+clock.museum
+coal.museum
+coastaldefence.museum
+cody.museum
+coldwar.museum
+collection.museum
+colonialwilliamsburg.museum
+coloradoplateau.museum
+columbia.museum
+columbus.museum
+communication.museum
+communications.museum
+community.museum
+computer.museum
+computerhistory.museum
+comunicações.museum
+contemporary.museum
+contemporaryart.museum
+convent.museum
+copenhagen.museum
+corporation.museum
+correios-e-telecomunicações.museum
+corvette.museum
+costume.museum
+countryestate.museum
+county.museum
+crafts.museum
+cranbrook.museum
+creation.museum
+cultural.museum
+culturalcenter.museum
+culture.museum
+cyber.museum
+cymru.museum
+dali.museum
+dallas.museum
+database.museum
+ddr.museum
+decorativearts.museum
+delaware.museum
+delmenhorst.museum
+denmark.museum
+depot.museum
+design.museum
+detroit.museum
+dinosaur.museum
+discovery.museum
+dolls.museum
+donostia.museum
+durham.museum
+eastafrica.museum
+eastcoast.museum
+education.museum
+educational.museum
+egyptian.museum
+eisenbahn.museum
+elburg.museum
+elvendrell.museum
+embroidery.museum
+encyclopedic.museum
+england.museum
+entomology.museum
+environment.museum
+environmentalconservation.museum
+epilepsy.museum
+essex.museum
+estate.museum
+ethnology.museum
+exeter.museum
+exhibition.museum
+family.museum
+farm.museum
+farmequipment.museum
+farmers.museum
+farmstead.museum
+field.museum
+figueres.museum
+filatelia.museum
+film.museum
+fineart.museum
+finearts.museum
+finland.museum
+flanders.museum
+florida.museum
+force.museum
+fortmissoula.museum
+fortworth.museum
+foundation.museum
+francaise.museum
+frankfurt.museum
+franziskaner.museum
+freemasonry.museum
+freiburg.museum
+fribourg.museum
+frog.museum
+fundacio.museum
+furniture.museum
+gallery.museum
+garden.museum
+gateway.museum
+geelvinck.museum
+gemological.museum
+geology.museum
+georgia.museum
+giessen.museum
+glas.museum
+glass.museum
+gorge.museum
+grandrapids.museum
+graz.museum
+guernsey.museum
+halloffame.museum
+hamburg.museum
+handson.museum
+harvestcelebration.museum
+hawaii.museum
+health.museum
+heimatunduhren.museum
+hellas.museum
+helsinki.museum
+hembygdsforbund.museum
+heritage.museum
+histoire.museum
+historical.museum
+historicalsociety.museum
+historichouses.museum
+historisch.museum
+historisches.museum
+history.museum
+historyofscience.museum
+horology.museum
+house.museum
+humanities.museum
+illustration.museum
+imageandsound.museum
+indian.museum
+indiana.museum
+indianapolis.museum
+indianmarket.museum
+intelligence.museum
+interactive.museum
+iraq.museum
+iron.museum
+isleofman.museum
+jamison.museum
+jefferson.museum
+jerusalem.museum
+jewelry.museum
+jewish.museum
+jewishart.museum
+jfk.museum
+journalism.museum
+judaica.museum
+judygarland.museum
+juedisches.museum
+juif.museum
+karate.museum
+karikatur.museum
+kids.museum
+koebenhavn.museum
+koeln.museum
+kunst.museum
+kunstsammlung.museum
+kunstunddesign.museum
+labor.museum
+labour.museum
+lajolla.museum
+lancashire.museum
+landes.museum
+lans.museum
+läns.museum
+larsson.museum
+lewismiller.museum
+lincoln.museum
+linz.museum
+living.museum
+livinghistory.museum
+localhistory.museum
+london.museum
+losangeles.museum
+louvre.museum
+loyalist.museum
+lucerne.museum
+luxembourg.museum
+luzern.museum
+mad.museum
+madrid.museum
+mallorca.museum
+manchester.museum
+mansion.museum
+mansions.museum
+manx.museum
+marburg.museum
+maritime.museum
+maritimo.museum
+maryland.museum
+marylhurst.museum
+media.museum
+medical.museum
+medizinhistorisches.museum
+meeres.museum
+memorial.museum
+mesaverde.museum
+michigan.museum
+midatlantic.museum
+military.museum
+mill.museum
+miners.museum
+mining.museum
+minnesota.museum
+missile.museum
+missoula.museum
+modern.museum
+moma.museum
+money.museum
+monmouth.museum
+monticello.museum
+montreal.museum
+moscow.museum
+motorcycle.museum
+muenchen.museum
+muenster.museum
+mulhouse.museum
+muncie.museum
+museet.museum
+museumcenter.museum
+museumvereniging.museum
+music.museum
+national.museum
+nationalfirearms.museum
+nationalheritage.museum
+nativeamerican.museum
+naturalhistory.museum
+naturalhistorymuseum.museum
+naturalsciences.museum
+nature.museum
+naturhistorisches.museum
+natuurwetenschappen.museum
+naumburg.museum
+naval.museum
+nebraska.museum
+neues.museum
+newhampshire.museum
+newjersey.museum
+newmexico.museum
+newport.museum
+newspaper.museum
+newyork.museum
+niepce.museum
+norfolk.museum
+north.museum
+nrw.museum
+nuernberg.museum
+nuremberg.museum
+nyc.museum
+nyny.museum
+oceanographic.museum
+oceanographique.museum
+omaha.museum
+online.museum
+ontario.museum
+openair.museum
+oregon.museum
+oregontrail.museum
+otago.museum
+oxford.museum
+pacific.museum
+paderborn.museum
+palace.museum
+paleo.museum
+palmsprings.museum
+panama.museum
+paris.museum
+pasadena.museum
+pharmacy.museum
+philadelphia.museum
+philadelphiaarea.museum
+philately.museum
+phoenix.museum
+photography.museum
+pilots.museum
+pittsburgh.museum
+planetarium.museum
+plantation.museum
+plants.museum
+plaza.museum
+portal.museum
+portland.museum
+portlligat.museum
+posts-and-telecommunications.museum
+preservation.museum
+presidio.museum
+press.museum
+project.museum
+public.museum
+pubol.museum
+quebec.museum
+railroad.museum
+railway.museum
+research.museum
+resistance.museum
+riodejaneiro.museum
+rochester.museum
+rockart.museum
+roma.museum
+russia.museum
+saintlouis.museum
+salem.museum
+salvadordali.museum
+salzburg.museum
+sandiego.museum
+sanfrancisco.museum
+santabarbara.museum
+santacruz.museum
+santafe.museum
+saskatchewan.museum
+satx.museum
+savannahga.museum
+schlesisches.museum
+schoenbrunn.museum
+schokoladen.museum
+school.museum
+schweiz.museum
+science.museum
+scienceandhistory.museum
+scienceandindustry.museum
+sciencecenter.museum
+sciencecenters.museum
+science-fiction.museum
+sciencehistory.museum
+sciences.museum
+sciencesnaturelles.museum
+scotland.museum
+seaport.museum
+settlement.museum
+settlers.museum
+shell.museum
+sherbrooke.museum
+sibenik.museum
+silk.museum
+ski.museum
+skole.museum
+society.museum
+sologne.museum
+soundandvision.museum
+southcarolina.museum
+southwest.museum
+space.museum
+spy.museum
+square.museum
+stadt.museum
+stalbans.museum
+starnberg.museum
+state.museum
+stateofdelaware.museum
+station.museum
+steam.museum
+steiermark.museum
+stjohn.museum
+stockholm.museum
+stpetersburg.museum
+stuttgart.museum
+suisse.museum
+surgeonshall.museum
+surrey.museum
+svizzera.museum
+sweden.museum
+sydney.museum
+tank.museum
+tcm.museum
+technology.museum
+telekommunikation.museum
+television.museum
+texas.museum
+textile.museum
+theater.museum
+time.museum
+timekeeping.museum
+topology.museum
+torino.museum
+touch.museum
+town.museum
+transport.museum
+tree.museum
+trolley.museum
+trust.museum
+trustee.museum
+uhren.museum
+ulm.museum
+undersea.museum
+university.museum
+usa.museum
+usantiques.museum
+usarts.museum
+uscountryestate.museum
+usculture.museum
+usdecorativearts.museum
+usgarden.museum
+ushistory.museum
+ushuaia.museum
+uslivinghistory.museum
+utah.museum
+uvic.museum
+valley.museum
+vantaa.museum
+versailles.museum
+viking.museum
+village.museum
+virginia.museum
+virtual.museum
+virtuel.museum
+vlaanderen.museum
+volkenkunde.museum
+wales.museum
+wallonie.museum
+war.museum
+washingtondc.museum
+watchandclock.museum
+watch-and-clock.museum
+western.museum
+westfalen.museum
+whaling.museum
+wildlife.museum
+williamsburg.museum
+windmill.museum
+workshop.museum
+york.museum
+yorkshire.museum
+yosemite.museum
+youth.museum
+zoological.museum
+zoology.museum
+ירושלים.museum
+иком.museum
+
+// mv : http://en.wikipedia.org/wiki/.mv
+// "mv" included because, contra Wikipedia, google.mv exists.
+mv
+aero.mv
+biz.mv
+com.mv
+coop.mv
+edu.mv
+gov.mv
+info.mv
+int.mv
+mil.mv
+museum.mv
+name.mv
+net.mv
+org.mv
+pro.mv
+
+// mw : http://www.registrar.mw/
+mw
+ac.mw
+biz.mw
+co.mw
+com.mw
+coop.mw
+edu.mw
+gov.mw
+int.mw
+museum.mw
+net.mw
+org.mw
+
+// mx : http://www.nic.mx/
+// Submitted by registry 2008-06-19
+mx
+com.mx
+org.mx
+gob.mx
+edu.mx
+net.mx
+
+// my : http://www.mynic.net.my/
+my
+com.my
+net.my
+org.my
+gov.my
+edu.my
+mil.my
+name.my
+
+// mz : http://www.gobin.info/domainname/mz-template.doc
+*.mz
+!teledata.mz
+
+// na : http://www.na-nic.com.na/
+// http://www.info.na/domain/
+na
+info.na
+pro.na
+name.na
+school.na
+or.na
+dr.na
+us.na
+mx.na
+ca.na
+in.na
+cc.na
+tv.na
+ws.na
+mobi.na
+co.na
+com.na
+org.na
+
+// name : has 2nd-level tlds, but there's no list of them
+name
+
+// nc : http://www.cctld.nc/
+nc
+asso.nc
+
+// ne : http://en.wikipedia.org/wiki/.ne
+ne
+
+// net : http://en.wikipedia.org/wiki/.net
+net
+
+// nf : http://en.wikipedia.org/wiki/.nf
+nf
+com.nf
+net.nf
+per.nf
+rec.nf
+web.nf
+arts.nf
+firm.nf
+info.nf
+other.nf
+store.nf
+
+// ng : http://psg.com/dns/ng/
+// Submitted by registry 2008-06-17
+ac.ng
+com.ng
+edu.ng
+gov.ng
+net.ng
+org.ng
+
+// ni : http://www.nic.ni/dominios.htm
+*.ni
+
+// nl : http://www.domain-registry.nl/ace.php/c,728,122,,,,Home.html
+// Confirmed by registry (with technical
+// reservations) 2008-06-08
+nl
+
+// BV.nl will be a registry for dutch BV's (besloten vennootschap)
+bv.nl
+
+// no : http://www.norid.no/regelverk/index.en.html
+// The Norwegian registry has declined to notify us of updates. The web pages
+// referenced below are the official source of the data. There is also an
+// announce mailing list:
+// https://postlister.uninett.no/sympa/info/norid-diskusjon
+no
+// Norid generic domains : http://www.norid.no/regelverk/vedlegg-c.en.html
+fhs.no
+vgs.no
+fylkesbibl.no
+folkebibl.no
+museum.no
+idrett.no
+priv.no
+// Non-Norid generic domains : http://www.norid.no/regelverk/vedlegg-d.en.html
+mil.no
+stat.no
+dep.no
+kommune.no
+herad.no
+// no geographical names : http://www.norid.no/regelverk/vedlegg-b.en.html
+// counties
+aa.no
+ah.no
+bu.no
+fm.no
+hl.no
+hm.no
+jan-mayen.no
+mr.no
+nl.no
+nt.no
+of.no
+ol.no
+oslo.no
+rl.no
+sf.no
+st.no
+svalbard.no
+tm.no
+tr.no
+va.no
+vf.no
+// primary and lower secondary schools per county
+gs.aa.no
+gs.ah.no
+gs.bu.no
+gs.fm.no
+gs.hl.no
+gs.hm.no
+gs.jan-mayen.no
+gs.mr.no
+gs.nl.no
+gs.nt.no
+gs.of.no
+gs.ol.no
+gs.oslo.no
+gs.rl.no
+gs.sf.no
+gs.st.no
+gs.svalbard.no
+gs.tm.no
+gs.tr.no
+gs.va.no
+gs.vf.no
+// cities
+akrehamn.no
+åkrehamn.no
+algard.no
+ålgård.no
+arna.no
+brumunddal.no
+bryne.no
+bronnoysund.no
+brønnøysund.no
+drobak.no
+drøbak.no
+egersund.no
+fetsund.no
+floro.no
+florø.no
+fredrikstad.no
+hokksund.no
+honefoss.no
+hønefoss.no
+jessheim.no
+jorpeland.no
+jørpeland.no
+kirkenes.no
+kopervik.no
+krokstadelva.no
+langevag.no
+langevåg.no
+leirvik.no
+mjondalen.no
+mjøndalen.no
+mo-i-rana.no
+mosjoen.no
+mosjøen.no
+nesoddtangen.no
+orkanger.no
+osoyro.no
+osøyro.no
+raholt.no
+råholt.no
+sandnessjoen.no
+sandnessjøen.no
+skedsmokorset.no
+slattum.no
+spjelkavik.no
+stathelle.no
+stavern.no
+stjordalshalsen.no
+stjørdalshalsen.no
+tananger.no
+tranby.no
+vossevangen.no
+// communities
+afjord.no
+åfjord.no
+agdenes.no
+al.no
+ål.no
+alesund.no
+ålesund.no
+alstahaug.no
+alta.no
+áltá.no
+alaheadju.no
+álaheadju.no
+alvdal.no
+amli.no
+åmli.no
+amot.no
+åmot.no
+andebu.no
+andoy.no
+andøy.no
+andasuolo.no
+ardal.no
+årdal.no
+aremark.no
+arendal.no
+ås.no
+aseral.no
+åseral.no
+asker.no
+askim.no
+askvoll.no
+askoy.no
+askøy.no
+asnes.no
+åsnes.no
+audnedaln.no
+aukra.no
+aure.no
+aurland.no
+aurskog-holand.no
+aurskog-høland.no
+austevoll.no
+austrheim.no
+averoy.no
+averøy.no
+balestrand.no
+ballangen.no
+balat.no
+bálát.no
+balsfjord.no
+bahccavuotna.no
+báhccavuotna.no
+bamble.no
+bardu.no
+beardu.no
+beiarn.no
+bajddar.no
+bájddar.no
+baidar.no
+báidár.no
+berg.no
+bergen.no
+berlevag.no
+berlevåg.no
+bearalvahki.no
+bearalváhki.no
+bindal.no
+birkenes.no
+bjarkoy.no
+bjarkøy.no
+bjerkreim.no
+bjugn.no
+bodo.no
+bodø.no
+badaddja.no
+bådåddjå.no
+budejju.no
+bokn.no
+bremanger.no
+bronnoy.no
+brønnøy.no
+bygland.no
+bykle.no
+barum.no
+bærum.no
+bo.telemark.no
+bø.telemark.no
+bo.nordland.no
+bø.nordland.no
+bievat.no
+bievát.no
+bomlo.no
+bømlo.no
+batsfjord.no
+båtsfjord.no
+bahcavuotna.no
+báhcavuotna.no
+dovre.no
+drammen.no
+drangedal.no
+dyroy.no
+dyrøy.no
+donna.no
+dønna.no
+eid.no
+eidfjord.no
+eidsberg.no
+eidskog.no
+eidsvoll.no
+eigersund.no
+elverum.no
+enebakk.no
+engerdal.no
+etne.no
+etnedal.no
+evenes.no
+evenassi.no
+evenášši.no
+evje-og-hornnes.no
+farsund.no
+fauske.no
+fuossko.no
+fuoisku.no
+fedje.no
+fet.no
+finnoy.no
+finnøy.no
+fitjar.no
+fjaler.no
+fjell.no
+flakstad.no
+flatanger.no
+flekkefjord.no
+flesberg.no
+flora.no
+fla.no
+flå.no
+folldal.no
+forsand.no
+fosnes.no
+frei.no
+frogn.no
+froland.no
+frosta.no
+frana.no
+fræna.no
+froya.no
+frøya.no
+fusa.no
+fyresdal.no
+forde.no
+førde.no
+gamvik.no
+gangaviika.no
+gáŋgaviika.no
+gaular.no
+gausdal.no
+gildeskal.no
+gildeskål.no
+giske.no
+gjemnes.no
+gjerdrum.no
+gjerstad.no
+gjesdal.no
+gjovik.no
+gjøvik.no
+gloppen.no
+gol.no
+gran.no
+grane.no
+granvin.no
+gratangen.no
+grimstad.no
+grong.no
+kraanghke.no
+kråanghke.no
+grue.no
+gulen.no
+hadsel.no
+halden.no
+halsa.no
+hamar.no
+hamaroy.no
+habmer.no
+hábmer.no
+hapmir.no
+hápmir.no
+hammerfest.no
+hammarfeasta.no
+hámmárfeasta.no
+haram.no
+hareid.no
+harstad.no
+hasvik.no
+aknoluokta.no
+ákŋoluokta.no
+hattfjelldal.no
+aarborte.no
+haugesund.no
+hemne.no
+hemnes.no
+hemsedal.no
+heroy.more-og-romsdal.no
+herøy.møre-og-romsdal.no
+heroy.nordland.no
+herøy.nordland.no
+hitra.no
+hjartdal.no
+hjelmeland.no
+hobol.no
+hobøl.no
+hof.no
+hol.no
+hole.no
+holmestrand.no
+holtalen.no
+holtålen.no
+hornindal.no
+horten.no
+hurdal.no
+hurum.no
+hvaler.no
+hyllestad.no
+hagebostad.no
+hægebostad.no
+hoyanger.no
+høyanger.no
+hoylandet.no
+høylandet.no
+ha.no
+hå.no
+ibestad.no
+inderoy.no
+inderøy.no
+iveland.no
+jevnaker.no
+jondal.no
+jolster.no
+jølster.no
+karasjok.no
+karasjohka.no
+kárášjohka.no
+karlsoy.no
+galsa.no
+gálsá.no
+karmoy.no
+karmøy.no
+kautokeino.no
+guovdageaidnu.no
+klepp.no
+klabu.no
+klæbu.no
+kongsberg.no
+kongsvinger.no
+kragero.no
+kragerø.no
+kristiansand.no
+kristiansund.no
+krodsherad.no
+krødsherad.no
+kvalsund.no
+rahkkeravju.no
+ráhkkerávju.no
+kvam.no
+kvinesdal.no
+kvinnherad.no
+kviteseid.no
+kvitsoy.no
+kvitsøy.no
+kvafjord.no
+kvæfjord.no
+giehtavuoatna.no
+kvanangen.no
+kvænangen.no
+navuotna.no
+návuotna.no
+kafjord.no
+kåfjord.no
+gaivuotna.no
+gáivuotna.no
+larvik.no
+lavangen.no
+lavagis.no
+loabat.no
+loabát.no
+lebesby.no
+davvesiida.no
+leikanger.no
+leirfjord.no
+leka.no
+leksvik.no
+lenvik.no
+leangaviika.no
+leaŋgaviika.no
+lesja.no
+levanger.no
+lier.no
+lierne.no
+lillehammer.no
+lillesand.no
+lindesnes.no
+lindas.no
+lindås.no
+lom.no
+loppa.no
+lahppi.no
+láhppi.no
+lund.no
+lunner.no
+luroy.no
+lurøy.no
+luster.no
+lyngdal.no
+lyngen.no
+ivgu.no
+lardal.no
+lerdal.no
+lærdal.no
+lodingen.no
+lødingen.no
+lorenskog.no
+lørenskog.no
+loten.no
+løten.no
+malvik.no
+masoy.no
+måsøy.no
+muosat.no
+muosát.no
+mandal.no
+marker.no
+marnardal.no
+masfjorden.no
+meland.no
+meldal.no
+melhus.no
+meloy.no
+meløy.no
+meraker.no
+meråker.no
+moareke.no
+moåreke.no
+midsund.no
+midtre-gauldal.no
+modalen.no
+modum.no
+molde.no
+moskenes.no
+moss.no
+mosvik.no
+malselv.no
+målselv.no
+malatvuopmi.no
+málatvuopmi.no
+namdalseid.no
+aejrie.no
+namsos.no
+namsskogan.no
+naamesjevuemie.no
+nååmesjevuemie.no
+laakesvuemie.no
+nannestad.no
+narvik.no
+narviika.no
+naustdal.no
+nedre-eiker.no
+nes.akershus.no
+nes.buskerud.no
+nesna.no
+nesodden.no
+nesseby.no
+unjarga.no
+unjárga.no
+nesset.no
+nissedal.no
+nittedal.no
+nord-aurdal.no
+nord-fron.no
+nord-odal.no
+norddal.no
+nordkapp.no
+davvenjarga.no
+davvenjárga.no
+nordre-land.no
+nordreisa.no
+raisa.no
+ráisa.no
+nore-og-uvdal.no
+notodden.no
+naroy.no
+nærøy.no
+notteroy.no
+nøtterøy.no
+odda.no
+oksnes.no
+øksnes.no
+oppdal.no
+oppegard.no
+oppegård.no
+orkdal.no
+orland.no
+ørland.no
+orskog.no
+ørskog.no
+orsta.no
+ørsta.no
+os.hedmark.no
+os.hordaland.no
+osen.no
+osteroy.no
+osterøy.no
+ostre-toten.no
+østre-toten.no
+overhalla.no
+ovre-eiker.no
+øvre-eiker.no
+oyer.no
+øyer.no
+oygarden.no
+øygarden.no
+oystre-slidre.no
+øystre-slidre.no
+porsanger.no
+porsangu.no
+porsáŋgu.no
+porsgrunn.no
+radoy.no
+radøy.no
+rakkestad.no
+rana.no
+ruovat.no
+randaberg.no
+rauma.no
+rendalen.no
+rennebu.no
+rennesoy.no
+rennesøy.no
+rindal.no
+ringebu.no
+ringerike.no
+ringsaker.no
+rissa.no
+risor.no
+risør.no
+roan.no
+rollag.no
+rygge.no
+ralingen.no
+rælingen.no
+rodoy.no
+rødøy.no
+romskog.no
+rømskog.no
+roros.no
+røros.no
+rost.no
+røst.no
+royken.no
+røyken.no
+royrvik.no
+røyrvik.no
+rade.no
+råde.no
+salangen.no
+siellak.no
+saltdal.no
+salat.no
+sálát.no
+sálat.no
+samnanger.no
+sande.more-og-romsdal.no
+sande.møre-og-romsdal.no
+sande.vestfold.no
+sandefjord.no
+sandnes.no
+sandoy.no
+sandøy.no
+sarpsborg.no
+sauda.no
+sauherad.no
+sel.no
+selbu.no
+selje.no
+seljord.no
+sigdal.no
+siljan.no
+sirdal.no
+skaun.no
+skedsmo.no
+ski.no
+skien.no
+skiptvet.no
+skjervoy.no
+skjervøy.no
+skierva.no
+skiervá.no
+skjak.no
+skjåk.no
+skodje.no
+skanland.no
+skånland.no
+skanit.no
+skánit.no
+smola.no
+smøla.no
+snillfjord.no
+snasa.no
+snåsa.no
+snoasa.no
+snaase.no
+snåase.no
+sogndal.no
+sokndal.no
+sola.no
+solund.no
+songdalen.no
+sortland.no
+spydeberg.no
+stange.no
+stavanger.no
+steigen.no
+steinkjer.no
+stjordal.no
+stjørdal.no
+stokke.no
+stor-elvdal.no
+stord.no
+stordal.no
+storfjord.no
+omasvuotna.no
+strand.no
+stranda.no
+stryn.no
+sula.no
+suldal.no
+sund.no
+sunndal.no
+surnadal.no
+sveio.no
+svelvik.no
+sykkylven.no
+sogne.no
+søgne.no
+somna.no
+sømna.no
+sondre-land.no
+søndre-land.no
+sor-aurdal.no
+sør-aurdal.no
+sor-fron.no
+sør-fron.no
+sor-odal.no
+sør-odal.no
+sor-varanger.no
+sør-varanger.no
+matta-varjjat.no
+mátta-várjjat.no
+sorfold.no
+sørfold.no
+sorreisa.no
+sørreisa.no
+sorum.no
+sørum.no
+tana.no
+deatnu.no
+time.no
+tingvoll.no
+tinn.no
+tjeldsund.no
+dielddanuorri.no
+tjome.no
+tjøme.no
+tokke.no
+tolga.no
+torsken.no
+tranoy.no
+tranøy.no
+tromso.no
+tromsø.no
+tromsa.no
+romsa.no
+trondheim.no
+troandin.no
+trysil.no
+trana.no
+træna.no
+trogstad.no
+trøgstad.no
+tvedestrand.no
+tydal.no
+tynset.no
+tysfjord.no
+divtasvuodna.no
+divttasvuotna.no
+tysnes.no
+tysvar.no
+tysvær.no
+tonsberg.no
+tønsberg.no
+ullensaker.no
+ullensvang.no
+ulvik.no
+utsira.no
+vadso.no
+vadsø.no
+cahcesuolo.no
+čáhcesuolo.no
+vaksdal.no
+valle.no
+vang.no
+vanylven.no
+vardo.no
+vardø.no
+varggat.no
+várggát.no
+vefsn.no
+vaapste.no
+vega.no
+vegarshei.no
+vegårshei.no
+vennesla.no
+verdal.no
+verran.no
+vestby.no
+vestnes.no
+vestre-slidre.no
+vestre-toten.no
+vestvagoy.no
+vestvågøy.no
+vevelstad.no
+vik.no
+vikna.no
+vindafjord.no
+volda.no
+voss.no
+varoy.no
+værøy.no
+vagan.no
+vågan.no
+voagat.no
+vagsoy.no
+vågsøy.no
+vaga.no
+vågå.no
+valer.ostfold.no
+våler.østfold.no
+valer.hedmark.no
+våler.hedmark.no
+
+// np : http://www.mos.com.np/register.html
+*.np
+
+// nr : http://cenpac.net.nr/dns/index.html
+// Confirmed by registry 2008-06-17
+nr
+biz.nr
+info.nr
+gov.nr
+edu.nr
+org.nr
+net.nr
+com.nr
+
+// nu : http://en.wikipedia.org/wiki/.nu
+nu
+
+// nz : http://en.wikipedia.org/wiki/.nz
+*.nz
+
+// om : http://en.wikipedia.org/wiki/.om
+*.om
+!mediaphone.om
+!nawrastelecom.om
+!nawras.om
+!omanmobile.om
+!omanpost.om
+!omantel.om
+!rakpetroleum.om
+!siemens.om
+!songfest.om
+!statecouncil.om
+
+// org : http://en.wikipedia.org/wiki/.org
+org
+
+// pa : http://www.nic.pa/
+// Some additional second level "domains" resolve directly as hostnames, such as
+// pannet.pa, so we add a rule for "pa".
+pa
+ac.pa
+gob.pa
+com.pa
+org.pa
+sld.pa
+edu.pa
+net.pa
+ing.pa
+abo.pa
+med.pa
+nom.pa
+
+// pe : https://www.nic.pe/InformeFinalComision.pdf
+pe
+edu.pe
+gob.pe
+nom.pe
+mil.pe
+org.pe
+com.pe
+net.pe
+
+// pf : http://www.gobin.info/domainname/formulaire-pf.pdf
+pf
+com.pf
+org.pf
+edu.pf
+
+// pg : http://en.wikipedia.org/wiki/.pg
+*.pg
+
+// ph : http://www.domains.ph/FAQ2.asp
+// Submitted by registry 2008-06-13
+ph
+com.ph
+net.ph
+org.ph
+gov.ph
+edu.ph
+ngo.ph
+mil.ph
+i.ph
+
+// pk : http://pk5.pknic.net.pk/pk5/msgNamepk.PK
+pk
+com.pk
+net.pk
+edu.pk
+org.pk
+fam.pk
+biz.pk
+web.pk
+gov.pk
+gob.pk
+gok.pk
+gon.pk
+gop.pk
+gos.pk
+info.pk
+
+// pl : http://www.dns.pl/english/
+pl
+// NASK functional domains (nask.pl / dns.pl) : http://www.dns.pl/english/dns-funk.html
+aid.pl
+agro.pl
+atm.pl
+auto.pl
+biz.pl
+com.pl
+edu.pl
+gmina.pl
+gsm.pl
+info.pl
+mail.pl
+miasta.pl
+media.pl
+mil.pl
+net.pl
+nieruchomosci.pl
+nom.pl
+org.pl
+pc.pl
+powiat.pl
+priv.pl
+realestate.pl
+rel.pl
+sex.pl
+shop.pl
+sklep.pl
+sos.pl
+szkola.pl
+targi.pl
+tm.pl
+tourism.pl
+travel.pl
+turystyka.pl
+// ICM functional domains (icm.edu.pl)
+6bone.pl
+art.pl
+mbone.pl
+// Government domains (administered by ippt.gov.pl)
+gov.pl
+uw.gov.pl
+um.gov.pl
+ug.gov.pl
+upow.gov.pl
+starostwo.gov.pl
+so.gov.pl
+sr.gov.pl
+po.gov.pl
+pa.gov.pl
+// other functional domains
+ngo.pl
+irc.pl
+usenet.pl
+// NASK geographical domains : http://www.dns.pl/english/dns-regiony.html
+augustow.pl
+babia-gora.pl
+bedzin.pl
+beskidy.pl
+bialowieza.pl
+bialystok.pl
+bielawa.pl
+bieszczady.pl
+boleslawiec.pl
+bydgoszcz.pl
+bytom.pl
+cieszyn.pl
+czeladz.pl
+czest.pl
+dlugoleka.pl
+elblag.pl
+elk.pl
+glogow.pl
+gniezno.pl
+gorlice.pl
+grajewo.pl
+ilawa.pl
+jaworzno.pl
+jelenia-gora.pl
+jgora.pl
+kalisz.pl
+kazimierz-dolny.pl
+karpacz.pl
+kartuzy.pl
+kaszuby.pl
+katowice.pl
+kepno.pl
+ketrzyn.pl
+klodzko.pl
+kobierzyce.pl
+kolobrzeg.pl
+konin.pl
+konskowola.pl
+kutno.pl
+lapy.pl
+lebork.pl
+legnica.pl
+lezajsk.pl
+limanowa.pl
+lomza.pl
+lowicz.pl
+lubin.pl
+lukow.pl
+malbork.pl
+malopolska.pl
+mazowsze.pl
+mazury.pl
+mielec.pl
+mielno.pl
+mragowo.pl
+naklo.pl
+nowaruda.pl
+nysa.pl
+olawa.pl
+olecko.pl
+olkusz.pl
+olsztyn.pl
+opoczno.pl
+opole.pl
+ostroda.pl
+ostroleka.pl
+ostrowiec.pl
+ostrowwlkp.pl
+pila.pl
+pisz.pl
+podhale.pl
+podlasie.pl
+polkowice.pl
+pomorze.pl
+pomorskie.pl
+prochowice.pl
+pruszkow.pl
+przeworsk.pl
+pulawy.pl
+radom.pl
+rawa-maz.pl
+rybnik.pl
+rzeszow.pl
+sanok.pl
+sejny.pl
+siedlce.pl
+slask.pl
+slupsk.pl
+sosnowiec.pl
+stalowa-wola.pl
+skoczow.pl
+starachowice.pl
+stargard.pl
+suwalki.pl
+swidnica.pl
+swiebodzin.pl
+swinoujscie.pl
+szczecin.pl
+szczytno.pl
+tarnobrzeg.pl
+tgory.pl
+turek.pl
+tychy.pl
+ustka.pl
+walbrzych.pl
+warmia.pl
+warszawa.pl
+waw.pl
+wegrow.pl
+wielun.pl
+wlocl.pl
+wloclawek.pl
+wodzislaw.pl
+wolomin.pl
+wroclaw.pl
+zachpomor.pl
+zagan.pl
+zarow.pl
+zgora.pl
+zgorzelec.pl
+// TASK geographical domains (www.task.gda.pl/uslugi/dns)
+gda.pl
+gdansk.pl
+gdynia.pl
+med.pl
+sopot.pl
+// other geographical domains
+gliwice.pl
+krakow.pl
+poznan.pl
+wroc.pl
+zakopane.pl
+
+// pm : http://www.afnic.fr/medias/documents/AFNIC-naming-policy2012.pdf
+pm
+
+// pn : http://www.government.pn/PnRegistry/policies.htm
+pn
+gov.pn
+co.pn
+org.pn
+edu.pn
+net.pn
+
+// post : http://en.wikipedia.org/wiki/.post
+post
+
+// pr : http://www.nic.pr/index.asp?f=1
+pr
+com.pr
+net.pr
+org.pr
+gov.pr
+edu.pr
+isla.pr
+pro.pr
+biz.pr
+info.pr
+name.pr
+// these aren't mentioned on nic.pr, but on http://en.wikipedia.org/wiki/.pr
+est.pr
+prof.pr
+ac.pr
+
+// pro : http://www.nic.pro/support_faq.htm
+pro
+aca.pro
+bar.pro
+cpa.pro
+jur.pro
+law.pro
+med.pro
+eng.pro
+
+// ps : http://en.wikipedia.org/wiki/.ps
+// http://www.nic.ps/registration/policy.html#reg
+ps
+edu.ps
+gov.ps
+sec.ps
+plo.ps
+com.ps
+org.ps
+net.ps
+
+// pt : http://online.dns.pt/dns/start_dns
+pt
+net.pt
+gov.pt
+org.pt
+edu.pt
+int.pt
+publ.pt
+com.pt
+nome.pt
+
+// pw : http://en.wikipedia.org/wiki/.pw
+pw
+co.pw
+ne.pw
+or.pw
+ed.pw
+go.pw
+belau.pw
+
+// py : http://www.nic.py/pautas.html#seccion_9
+// Confirmed by registry 2012-10-03
+py
+com.py
+coop.py
+edu.py
+gov.py
+mil.py
+net.py
+org.py
+
+// qa : http://domains.qa/en/
+qa
+com.qa
+edu.qa
+gov.qa
+mil.qa
+name.qa
+net.qa
+org.qa
+sch.qa
+
+// re : http://www.afnic.re/obtenir/chartes/nommage-re/annexe-descriptifs
+re
+com.re
+asso.re
+nom.re
+
+// ro : http://www.rotld.ro/
+ro
+com.ro
+org.ro
+tm.ro
+nt.ro
+nom.ro
+info.ro
+rec.ro
+arts.ro
+firm.ro
+store.ro
+www.ro
+
+// rs : http://en.wikipedia.org/wiki/.rs
+rs
+co.rs
+org.rs
+edu.rs
+ac.rs
+gov.rs
+in.rs
+
+// ru : http://www.cctld.ru/ru/docs/aktiv_8.php
+// Industry domains
+ru
+ac.ru
+com.ru
+edu.ru
+int.ru
+net.ru
+org.ru
+pp.ru
+// Geographical domains
+adygeya.ru
+altai.ru
+amur.ru
+arkhangelsk.ru
+astrakhan.ru
+bashkiria.ru
+belgorod.ru
+bir.ru
+bryansk.ru
+buryatia.ru
+cbg.ru
+chel.ru
+chelyabinsk.ru
+chita.ru
+chukotka.ru
+chuvashia.ru
+dagestan.ru
+dudinka.ru
+e-burg.ru
+grozny.ru
+irkutsk.ru
+ivanovo.ru
+izhevsk.ru
+jar.ru
+joshkar-ola.ru
+kalmykia.ru
+kaluga.ru
+kamchatka.ru
+karelia.ru
+kazan.ru
+kchr.ru
+kemerovo.ru
+khabarovsk.ru
+khakassia.ru
+khv.ru
+kirov.ru
+koenig.ru
+komi.ru
+kostroma.ru
+krasnoyarsk.ru
+kuban.ru
+kurgan.ru
+kursk.ru
+lipetsk.ru
+magadan.ru
+mari.ru
+mari-el.ru
+marine.ru
+mordovia.ru
+mosreg.ru
+msk.ru
+murmansk.ru
+nalchik.ru
+nnov.ru
+nov.ru
+novosibirsk.ru
+nsk.ru
+omsk.ru
+orenburg.ru
+oryol.ru
+palana.ru
+penza.ru
+perm.ru
+pskov.ru
+ptz.ru
+rnd.ru
+ryazan.ru
+sakhalin.ru
+samara.ru
+saratov.ru
+simbirsk.ru
+smolensk.ru
+spb.ru
+stavropol.ru
+stv.ru
+surgut.ru
+tambov.ru
+tatarstan.ru
+tom.ru
+tomsk.ru
+tsaritsyn.ru
+tsk.ru
+tula.ru
+tuva.ru
+tver.ru
+tyumen.ru
+udm.ru
+udmurtia.ru
+ulan-ude.ru
+vladikavkaz.ru
+vladimir.ru
+vladivostok.ru
+volgograd.ru
+vologda.ru
+voronezh.ru
+vrn.ru
+vyatka.ru
+yakutia.ru
+yamal.ru
+yaroslavl.ru
+yekaterinburg.ru
+yuzhno-sakhalinsk.ru
+// More geographical domains
+amursk.ru
+baikal.ru
+cmw.ru
+fareast.ru
+jamal.ru
+kms.ru
+k-uralsk.ru
+kustanai.ru
+kuzbass.ru
+magnitka.ru
+mytis.ru
+nakhodka.ru
+nkz.ru
+norilsk.ru
+oskol.ru
+pyatigorsk.ru
+rubtsovsk.ru
+snz.ru
+syzran.ru
+vdonsk.ru
+zgrad.ru
+// State domains
+gov.ru
+mil.ru
+// Technical domains
+test.ru
+
+// rw : http://www.nic.rw/cgi-bin/policy.pl
+rw
+gov.rw
+net.rw
+edu.rw
+ac.rw
+com.rw
+co.rw
+int.rw
+mil.rw
+gouv.rw
+
+// sa : http://www.nic.net.sa/
+sa
+com.sa
+net.sa
+org.sa
+gov.sa
+med.sa
+pub.sa
+edu.sa
+sch.sa
+
+// sb : http://www.sbnic.net.sb/
+// Submitted by registry 2008-06-08
+sb
+com.sb
+edu.sb
+gov.sb
+net.sb
+org.sb
+
+// sc : http://www.nic.sc/
+sc
+com.sc
+gov.sc
+net.sc
+org.sc
+edu.sc
+
+// sd : http://www.isoc.sd/sudanic.isoc.sd/billing_pricing.htm
+// Submitted by registry 2008-06-17
+sd
+com.sd
+net.sd
+org.sd
+edu.sd
+med.sd
+tv.sd
+gov.sd
+info.sd
+
+// se : http://en.wikipedia.org/wiki/.se
+// Submitted by registry 2008-06-24
+se
+a.se
+ac.se
+b.se
+bd.se
+brand.se
+c.se
+d.se
+e.se
+f.se
+fh.se
+fhsk.se
+fhv.se
+g.se
+h.se
+i.se
+k.se
+komforb.se
+kommunalforbund.se
+komvux.se
+l.se
+lanbib.se
+m.se
+n.se
+naturbruksgymn.se
+o.se
+org.se
+p.se
+parti.se
+pp.se
+press.se
+r.se
+s.se
+sshn.se
+t.se
+tm.se
+u.se
+w.se
+x.se
+y.se
+z.se
+
+// sg : http://www.nic.net.sg/page/registration-policies-procedures-and-guidelines
+sg
+com.sg
+net.sg
+org.sg
+gov.sg
+edu.sg
+per.sg
+
+// sh : http://www.nic.sh/registrar.html
+sh
+com.sh
+net.sh
+gov.sh
+org.sh
+mil.sh
+
+// si : http://en.wikipedia.org/wiki/.si
+si
+
+// sj : No registrations at this time.
+// Submitted by registry 2008-06-16
+
+// sk : http://en.wikipedia.org/wiki/.sk
+// list of 2nd level domains ?
+sk
+
+// sl : http://www.nic.sl
+// Submitted by registry 2008-06-12
+sl
+com.sl
+net.sl
+edu.sl
+gov.sl
+org.sl
+
+// sm : http://en.wikipedia.org/wiki/.sm
+sm
+
+// sn : http://en.wikipedia.org/wiki/.sn
+sn
+art.sn
+com.sn
+edu.sn
+gouv.sn
+org.sn
+perso.sn
+univ.sn
+
+// so : http://www.soregistry.com/
+so
+com.so
+net.so
+org.so
+
+// sr : http://en.wikipedia.org/wiki/.sr
+sr
+
+// st : http://www.nic.st/html/policyrules/
+st
+co.st
+com.st
+consulado.st
+edu.st
+embaixada.st
+gov.st
+mil.st
+net.st
+org.st
+principe.st
+saotome.st
+store.st
+
+// su : http://en.wikipedia.org/wiki/.su
+su
+
+// sv : http://www.svnet.org.sv/svpolicy.html
+*.sv
+
+// sx : http://en.wikipedia.org/wiki/.sx
+// Confirmed by registry 2012-05-31
+sx
+gov.sx
+
+// sy : http://en.wikipedia.org/wiki/.sy
+// see also: http://www.gobin.info/domainname/sy.doc
+sy
+edu.sy
+gov.sy
+net.sy
+mil.sy
+com.sy
+org.sy
+
+// sz : http://en.wikipedia.org/wiki/.sz
+// http://www.sispa.org.sz/
+sz
+co.sz
+ac.sz
+org.sz
+
+// tc : http://en.wikipedia.org/wiki/.tc
+tc
+
+// td : http://en.wikipedia.org/wiki/.td
+td
+
+// tel: http://en.wikipedia.org/wiki/.tel
+// http://www.telnic.org/
+tel
+
+// tf : http://en.wikipedia.org/wiki/.tf
+tf
+
+// tg : http://en.wikipedia.org/wiki/.tg
+// http://www.nic.tg/
+tg
+
+// th : http://en.wikipedia.org/wiki/.th
+// Submitted by registry 2008-06-17
+th
+ac.th
+co.th
+go.th
+in.th
+mi.th
+net.th
+or.th
+
+// tj : http://www.nic.tj/policy.html
+tj
+ac.tj
+biz.tj
+co.tj
+com.tj
+edu.tj
+go.tj
+gov.tj
+int.tj
+mil.tj
+name.tj
+net.tj
+nic.tj
+org.tj
+test.tj
+web.tj
+
+// tk : http://en.wikipedia.org/wiki/.tk
+tk
+
+// tl : http://en.wikipedia.org/wiki/.tl
+tl
+gov.tl
+
+// tm : http://www.nic.tm/local.html
+tm
+com.tm
+co.tm
+org.tm
+net.tm
+nom.tm
+gov.tm
+mil.tm
+edu.tm
+
+// tn : http://en.wikipedia.org/wiki/.tn
+// http://whois.ati.tn/
+tn
+com.tn
+ens.tn
+fin.tn
+gov.tn
+ind.tn
+intl.tn
+nat.tn
+net.tn
+org.tn
+info.tn
+perso.tn
+tourism.tn
+edunet.tn
+rnrt.tn
+rns.tn
+rnu.tn
+mincom.tn
+agrinet.tn
+defense.tn
+turen.tn
+
+// to : http://en.wikipedia.org/wiki/.to
+// Submitted by registry 2008-06-17
+to
+com.to
+gov.to
+net.to
+org.to
+edu.to
+mil.to
+
+// tr : http://en.wikipedia.org/wiki/.tr
+*.tr
+!nic.tr
+// Used by government in the TRNC
+// http://en.wikipedia.org/wiki/.nc.tr
+gov.nc.tr
+
+// travel : http://en.wikipedia.org/wiki/.travel
+travel
+
+// tt : http://www.nic.tt/
+tt
+co.tt
+com.tt
+org.tt
+net.tt
+biz.tt
+info.tt
+pro.tt
+int.tt
+coop.tt
+jobs.tt
+mobi.tt
+travel.tt
+museum.tt
+aero.tt
+name.tt
+gov.tt
+edu.tt
+
+// tv : http://en.wikipedia.org/wiki/.tv
+// Not listing any 2LDs as reserved since none seem to exist in practice,
+// Wikipedia notwithstanding.
+tv
+
+// tw : http://en.wikipedia.org/wiki/.tw
+tw
+edu.tw
+gov.tw
+mil.tw
+com.tw
+net.tw
+org.tw
+idv.tw
+game.tw
+ebiz.tw
+club.tw
+網路.tw
+組織.tw
+商業.tw
+
+// tz : http://www.tznic.or.tz/index.php/domains
+// Confirmed by registry 2013-01-22
+ac.tz
+co.tz
+go.tz
+hotel.tz
+info.tz
+me.tz
+mil.tz
+mobi.tz
+ne.tz
+or.tz
+sc.tz
+tv.tz
+
+// ua : https://hostmaster.ua/policy/?ua
+// Submitted by registry 2012-04-27
+ua
+// ua 2LD
+com.ua
+edu.ua
+gov.ua
+in.ua
+net.ua
+org.ua
+// ua geographic names
+// https://hostmaster.ua/2ld/
+cherkassy.ua
+cherkasy.ua
+chernigov.ua
+chernihiv.ua
+chernivtsi.ua
+chernovtsy.ua
+ck.ua
+cn.ua
+cr.ua
+crimea.ua
+cv.ua
+dn.ua
+dnepropetrovsk.ua
+dnipropetrovsk.ua
+dominic.ua
+donetsk.ua
+dp.ua
+if.ua
+ivano-frankivsk.ua
+kh.ua
+kharkiv.ua
+kharkov.ua
+kherson.ua
+khmelnitskiy.ua
+khmelnytskyi.ua
+kiev.ua
+kirovograd.ua
+km.ua
+kr.ua
+krym.ua
+ks.ua
+kv.ua
+kyiv.ua
+lg.ua
+lt.ua
+lugansk.ua
+lutsk.ua
+lv.ua
+lviv.ua
+mk.ua
+mykolaiv.ua
+nikolaev.ua
+od.ua
+odesa.ua
+odessa.ua
+pl.ua
+poltava.ua
+rivne.ua
+rovno.ua
+rv.ua
+sb.ua
+sebastopol.ua
+sevastopol.ua
+sm.ua
+sumy.ua
+te.ua
+ternopil.ua
+uz.ua
+uzhgorod.ua
+vinnica.ua
+vinnytsia.ua
+vn.ua
+volyn.ua
+yalta.ua
+zaporizhzhe.ua
+zaporizhzhia.ua
+zhitomir.ua
+zhytomyr.ua
+zp.ua
+zt.ua
+
+// Private registries in .ua
+co.ua
+pp.ua
+
+// ug : https://www.registry.co.ug/
+ug
+co.ug
+or.ug
+ac.ug
+sc.ug
+go.ug
+ne.ug
+com.ug
+org.ug
+
+// uk : http://en.wikipedia.org/wiki/.uk
+// Submitted by registry 2012-10-02
+// and tweaked by us pending further consultation.
+*.uk
+*.sch.uk
+!bl.uk
+!british-library.uk
+!jet.uk
+!mod.uk
+!national-library-scotland.uk
+!nel.uk
+!nic.uk
+!nls.uk
+!parliament.uk
+
+// us : http://en.wikipedia.org/wiki/.us
+us
+dni.us
+fed.us
+isa.us
+kids.us
+nsn.us
+// us geographic names
+ak.us
+al.us
+ar.us
+as.us
+az.us
+ca.us
+co.us
+ct.us
+dc.us
+de.us
+fl.us
+ga.us
+gu.us
+hi.us
+ia.us
+id.us
+il.us
+in.us
+ks.us
+ky.us
+la.us
+ma.us
+md.us
+me.us
+mi.us
+mn.us
+mo.us
+ms.us
+mt.us
+nc.us
+nd.us
+ne.us
+nh.us
+nj.us
+nm.us
+nv.us
+ny.us
+oh.us
+ok.us
+or.us
+pa.us
+pr.us
+ri.us
+sc.us
+sd.us
+tn.us
+tx.us
+ut.us
+vi.us
+vt.us
+va.us
+wa.us
+wi.us
+wv.us
+wy.us
+// The registrar notes several more specific domains available in each state,
+// such as state.*.us, dst.*.us, etc., but resolution of these is somewhat
+// haphazard; in some states these domains resolve as addresses, while in others
+// only subdomains are available, or even nothing at all. We include the
+// most common ones where it's clear that different sites are different
+// entities.
+k12.ak.us
+k12.al.us
+k12.ar.us
+k12.as.us
+k12.az.us
+k12.ca.us
+k12.co.us
+k12.ct.us
+k12.dc.us
+k12.de.us
+k12.fl.us
+k12.ga.us
+k12.gu.us
+// k12.hi.us Hawaii has a state-wide DOE login: bug 614565
+k12.ia.us
+k12.id.us
+k12.il.us
+k12.in.us
+k12.ks.us
+k12.ky.us
+k12.la.us
+k12.ma.us
+k12.md.us
+k12.me.us
+k12.mi.us
+k12.mn.us
+k12.mo.us
+k12.ms.us
+k12.mt.us
+k12.nc.us
+k12.nd.us
+k12.ne.us
+k12.nh.us
+k12.nj.us
+k12.nm.us
+k12.nv.us
+k12.ny.us
+k12.oh.us
+k12.ok.us
+k12.or.us
+k12.pa.us
+k12.pr.us
+k12.ri.us
+k12.sc.us
+k12.sd.us
+k12.tn.us
+k12.tx.us
+k12.ut.us
+k12.vi.us
+k12.vt.us
+k12.va.us
+k12.wa.us
+k12.wi.us
+k12.wv.us
+k12.wy.us
+
+cc.ak.us
+cc.al.us
+cc.ar.us
+cc.as.us
+cc.az.us
+cc.ca.us
+cc.co.us
+cc.ct.us
+cc.dc.us
+cc.de.us
+cc.fl.us
+cc.ga.us
+cc.gu.us
+cc.hi.us
+cc.ia.us
+cc.id.us
+cc.il.us
+cc.in.us
+cc.ks.us
+cc.ky.us
+cc.la.us
+cc.ma.us
+cc.md.us
+cc.me.us
+cc.mi.us
+cc.mn.us
+cc.mo.us
+cc.ms.us
+cc.mt.us
+cc.nc.us
+cc.nd.us
+cc.ne.us
+cc.nh.us
+cc.nj.us
+cc.nm.us
+cc.nv.us
+cc.ny.us
+cc.oh.us
+cc.ok.us
+cc.or.us
+cc.pa.us
+cc.pr.us
+cc.ri.us
+cc.sc.us
+cc.sd.us
+cc.tn.us
+cc.tx.us
+cc.ut.us
+cc.vi.us
+cc.vt.us
+cc.va.us
+cc.wa.us
+cc.wi.us
+cc.wv.us
+cc.wy.us
+
+lib.ak.us
+lib.al.us
+lib.ar.us
+lib.as.us
+lib.az.us
+lib.ca.us
+lib.co.us
+lib.ct.us
+lib.dc.us
+lib.de.us
+lib.fl.us
+lib.ga.us
+lib.gu.us
+lib.hi.us
+lib.ia.us
+lib.id.us
+lib.il.us
+lib.in.us
+lib.ks.us
+lib.ky.us
+lib.la.us
+lib.ma.us
+lib.md.us
+lib.me.us
+lib.mi.us
+lib.mn.us
+lib.mo.us
+lib.ms.us
+lib.mt.us
+lib.nc.us
+lib.nd.us
+lib.ne.us
+lib.nh.us
+lib.nj.us
+lib.nm.us
+lib.nv.us
+lib.ny.us
+lib.oh.us
+lib.ok.us
+lib.or.us
+lib.pa.us
+lib.pr.us
+lib.ri.us
+lib.sc.us
+lib.sd.us
+lib.tn.us
+lib.tx.us
+lib.ut.us
+lib.vi.us
+lib.vt.us
+lib.va.us
+lib.wa.us
+lib.wi.us
+lib.wv.us
+lib.wy.us
+
+// k12.ma.us contains school districts in Massachusetts. The 4LDs are
+// managed independently except for private (PVT), charter (CHTR) and
+// parochial (PAROCH) schools. Those are delegated directly to the
+// 5LD operators.
+pvt.k12.ma.us
+chtr.k12.ma.us
+paroch.k12.ma.us
+
+// uy : http://www.nic.org.uy/
+uy
+com.uy
+edu.uy
+gub.uy
+mil.uy
+net.uy
+org.uy
+
+// uz : http://www.reg.uz/
+uz
+co.uz
+com.uz
+net.uz
+org.uz
+
+// va : http://en.wikipedia.org/wiki/.va
+va
+
+// vc : http://en.wikipedia.org/wiki/.vc
+// Submitted by registry 2008-06-13
+vc
+com.vc
+net.vc
+org.vc
+gov.vc
+mil.vc
+edu.vc
+
+// ve : https://registro.nic.ve/
+// Confirmed by registry 2012-10-04
+ve
+co.ve
+com.ve
+e12.ve
+edu.ve
+gov.ve
+info.ve
+mil.ve
+net.ve
+org.ve
+web.ve
+
+// vg : http://en.wikipedia.org/wiki/.vg
+vg
+
+// vi : http://www.nic.vi/newdomainform.htm
+// http://www.nic.vi/Domain_Rules/body_domain_rules.html indicates some other
+// TLDs are "reserved", such as edu.vi and gov.vi, but doesn't actually say they
+// are available for registration (which they do not seem to be).
+vi
+co.vi
+com.vi
+k12.vi
+net.vi
+org.vi
+
+// vn : https://www.dot.vn/vnnic/vnnic/domainregistration.jsp
+vn
+com.vn
+net.vn
+org.vn
+edu.vn
+gov.vn
+int.vn
+ac.vn
+biz.vn
+info.vn
+name.vn
+pro.vn
+health.vn
+
+// vu : http://en.wikipedia.org/wiki/.vu
+// list of 2nd level tlds ?
+vu
+
+// wf : http://www.afnic.fr/medias/documents/AFNIC-naming-policy2012.pdf
+wf
+
+// ws : http://en.wikipedia.org/wiki/.ws
+// http://samoanic.ws/index.dhtml
+ws
+com.ws
+net.ws
+org.ws
+gov.ws
+edu.ws
+
+// yt : http://www.afnic.fr/medias/documents/AFNIC-naming-policy2012.pdf
+yt
+
+// IDN ccTLDs
+// Please sort by ISO 3166 ccTLD, then punycode string
+// when submitting patches and follow this format:
+// <punycode> ("<English word>" <language>) : <ISO 3166 ccTLD>
+// [optional sponsoring org]
+// <URL>
+
+// xn--mgbaam7a8h ("Emerat" Arabic) : AE
+// http://nic.ae/english/arabicdomain/rules.jsp
+امارات
+
+// xn--54b7fta0cc ("Bangla" Bangla) : BD
+বাংলা
+
+// xn--fiqs8s ("China" Chinese-Han-Simplified <.Zhongguo>) : CN
+// CNNIC
+// http://cnnic.cn/html/Dir/2005/10/11/3218.htm
+中国
+
+// xn--fiqz9s ("China" Chinese-Han-Traditional <.Zhongguo>) : CN
+// CNNIC
+// http://cnnic.cn/html/Dir/2005/10/11/3218.htm
+中國
+
+// xn--lgbbat1ad8j ("Algeria / Al Jazair" Arabic) : DZ
+الجزائر
+
+// xn--wgbh1c ("Egypt" Arabic .masr) : EG
+// http://www.dotmasr.eg/
+مصر
+
+// xn--node ("ge" Georgian (Mkhedruli)) : GE
+გე
+
+// xn--j6w193g ("Hong Kong" Chinese-Han) : HK
+// https://www2.hkirc.hk/register/rules.jsp
+香港
+
+// xn--h2brj9c ("Bharat" Devanagari) : IN
+// India
+भारत
+
+// xn--mgbbh1a71e ("Bharat" Arabic) : IN
+// India
+بھارت
+
+// xn--fpcrj9c3d ("Bharat" Telugu) : IN
+// India
+భారత్
+
+// xn--gecrj9c ("Bharat" Gujarati) : IN
+// India
+ભારત
+
+// xn--s9brj9c ("Bharat" Gurmukhi) : IN
+// India
+ਭਾਰਤ
+
+// xn--45brj9c ("Bharat" Bengali) : IN
+// India
+ভারত
+
+// xn--xkc2dl3a5ee0h ("India" Tamil) : IN
+// India
+இந்தியா
+
+// xn--mgba3a4f16a ("Iran" Persian) : IR
+ایران
+
+// xn--mgba3a4fra ("Iran" Arabic) : IR
+ايران
+
+// xn--mgbayh7gpa ("al-Ordon" Arabic) : JO
+// National Information Technology Center (NITC)
+// Royal Scientific Society, Al-Jubeiha
+الاردن
+
+// xn--3e0b707e ("Republic of Korea" Hangul) : KR
+한국
+
+// xn--fzc2c9e2c ("Lanka" Sinhalese-Sinhala) : LK
+// http://nic.lk
+ලංකා
+
+// xn--xkc2al3hye2a ("Ilangai" Tamil) : LK
+// http://nic.lk
+இலங்கை
+
+// xn--mgbc0a9azcg ("Morocco / al-Maghrib" Arabic) : MA
+المغرب
+
+// xn--mgb9awbf ("Oman" Arabic) : OM
+عمان
+
+// xn--ygbi2ammx ("Falasteen" Arabic) : PS
+// The Palestinian National Internet Naming Authority (PNINA)
+// http://www.pnina.ps
+فلسطين
+
+// xn--90a3ac ("srb" Cyrillic) : RS
+срб
+
+// xn--p1ai ("rf" Russian-Cyrillic) : RU
+// http://www.cctld.ru/en/docs/rulesrf.php
+рф
+
+// xn--wgbl6a ("Qatar" Arabic) : QA
+// http://www.ict.gov.qa/
+قطر
+
+// xn--mgberp4a5d4ar ("AlSaudiah" Arabic) : SA
+// http://www.nic.net.sa/
+السعودية
+
+// xn--mgberp4a5d4a87g ("AlSaudiah" Arabic) variant : SA
+السعودیة
+
+// xn--mgbqly7c0a67fbc ("AlSaudiah" Arabic) variant : SA
+السعودیۃ
+
+// xn--mgbqly7cvafr ("AlSaudiah" Arabic) variant : SA
+السعوديه
+
+// xn--ogbpf8fl ("Syria" Arabic) : SY
+سورية
+
+// xn--mgbtf8fl ("Syria" Arabic) variant : SY
+سوريا
+
+// xn--yfro4i67o Singapore ("Singapore" Chinese-Han) : SG
+新加坡
+
+// xn--clchc0ea0b2g2a9gcd ("Singapore" Tamil) : SG
+சிங்கப்பூர்
+
+// xn--o3cw4h ("Thai" Thai) : TH
+// http://www.thnic.co.th
+ไทย
+
+// xn--pgbs0dh ("Tunis") : TN
+// http://nic.tn
+تونس
+
+// xn--kpry57d ("Taiwan" Chinese-Han-Traditional) : TW
+// http://www.twnic.net/english/dn/dn_07a.htm
+台灣
+
+// xn--kprw13d ("Taiwan" Chinese-Han-Simplified) : TW
+// http://www.twnic.net/english/dn/dn_07a.htm
+台湾
+
+// xn--nnx388a ("Taiwan") variant : TW
+臺灣
+
+// xn--j1amh ("ukr" Cyrillic) : UA
+укр
+
+// xn--mgb2ddes ("AlYemen" Arabic) : YE
+اليمن
+
+// xxx : http://icmregistry.com
+xxx
+
+// ye : http://www.y.net.ye/services/domain_name.htm
+*.ye
+
+// za : http://www.zadna.org.za/slds.html
+*.za
+
+// zm : http://en.wikipedia.org/wiki/.zm
+*.zm
+
+// zw : http://en.wikipedia.org/wiki/.zw
+*.zw
+
+// ===END ICANN DOMAINS===
+// ===BEGIN PRIVATE DOMAINS===
+
+// Amazon CloudFront : https://aws.amazon.com/cloudfront/
+// Requested by Donavan Miller 2013-03-22
+cloudfront.net
+
+// Amazon Elastic Compute Cloud: https://aws.amazon.com/ec2/
+// Requested by Osman Surkatty 2013-04-02
+compute.amazonaws.com
+us-east-1.amazonaws.com
+compute-1.amazonaws.com
+z-1.compute-1.amazonaws.com
+z-2.compute-1.amazonaws.com
+ap-northeast-1.compute.amazonaws.com
+ap-southeast-1.compute.amazonaws.com
+ap-southeast-2.compute.amazonaws.com
+eu-west-1.compute.amazonaws.com
+sa-east-1.compute.amazonaws.com
+us-gov-west-1.compute.amazonaws.com
+us-west-1.compute.amazonaws.com
+us-west-2.compute.amazonaws.com
+
+// Amazon Elastic Beanstalk : https://aws.amazon.com/elasticbeanstalk/
+// Requested by Adam Stein 2013-04-02
+elasticbeanstalk.com
+
+// Amazon Elastic Load Balancing : https://aws.amazon.com/elasticloadbalancing/
+// Requested by Scott Vidmar 2013-03-27
+elb.amazonaws.com
+
+// Amazon S3 : https://aws.amazon.com/s3/
+// Requested by Courtney Eckhardt 2013-03-22
+s3.amazonaws.com
+s3-us-west-2.amazonaws.com
+s3-us-west-1.amazonaws.com
+s3-eu-west-1.amazonaws.com
+s3-ap-southeast-1.amazonaws.com
+s3-ap-southeast-2.amazonaws.com
+s3-ap-northeast-1.amazonaws.com
+s3-sa-east-1.amazonaws.com
+s3-us-gov-west-1.amazonaws.com
+s3-fips-us-gov-west-1.amazonaws.com
+s3-website-us-east-1.amazonaws.com
+s3-website-us-west-2.amazonaws.com
+s3-website-us-west-1.amazonaws.com
+s3-website-eu-west-1.amazonaws.com
+s3-website-ap-southeast-1.amazonaws.com
+s3-website-ap-southeast-2.amazonaws.com
+s3-website-ap-northeast-1.amazonaws.com
+s3-website-sa-east-1.amazonaws.com
+s3-website-us-gov-west-1.amazonaws.com
+
+// BetaInABox
+// Requested by adrian@betainabox.com 2012-09-13
+betainabox.com
+
+// CentralNic : http://www.centralnic.com/names/domains
+// Requested by registry 2012-09-27
+ae.org
+ar.com
+br.com
+cn.com
+com.de
+de.com
+eu.com
+gb.com
+gb.net
+gr.com
+hu.com
+hu.net
+jp.net
+jpn.com
+kr.com
+no.com
+qc.com
+ru.com
+sa.com
+se.com
+se.net
+uk.com
+uk.net
+us.com
+us.org
+uy.com
+za.com
+
+// c.la : http://www.c.la/
+c.la
+
+// cloudControl : https://www.cloudcontrol.com/
+// Requested by Tobias Wilken 2013-07-23
+cloudcontrolled.com
+cloudcontrolapp.com
+
+// co.ca : http://registry.co.ca/
+co.ca
+
+// CoDNS B.V.
+co.nl
+co.no
+
+// DreamHost : http://www.dreamhost.com/
+// Requested by Andrew Farmer 2012-10-02
+dreamhosters.com
+
+// DynDNS.com : http://www.dyndns.com/services/dns/dyndns/
+dyndns-at-home.com
+dyndns-at-work.com
+dyndns-blog.com
+dyndns-free.com
+dyndns-home.com
+dyndns-ip.com
+dyndns-mail.com
+dyndns-office.com
+dyndns-pics.com
+dyndns-remote.com
+dyndns-server.com
+dyndns-web.com
+dyndns-wiki.com
+dyndns-work.com
+dyndns.biz
+dyndns.info
+dyndns.org
+dyndns.tv
+at-band-camp.net
+ath.cx
+barrel-of-knowledge.info
+barrell-of-knowledge.info
+better-than.tv
+blogdns.com
+blogdns.net
+blogdns.org
+blogsite.org
+boldlygoingnowhere.org
+broke-it.net
+buyshouses.net
+cechire.com
+dnsalias.com
+dnsalias.net
+dnsalias.org
+dnsdojo.com
+dnsdojo.net
+dnsdojo.org
+does-it.net
+doesntexist.com
+doesntexist.org
+dontexist.com
+dontexist.net
+dontexist.org
+doomdns.com
+doomdns.org
+dvrdns.org
+dyn-o-saur.com
+dynalias.com
+dynalias.net
+dynalias.org
+dynathome.net
+dyndns.ws
+endofinternet.net
+endofinternet.org
+endoftheinternet.org
+est-a-la-maison.com
+est-a-la-masion.com
+est-le-patron.com
+est-mon-blogueur.com
+for-better.biz
+for-more.biz
+for-our.info
+for-some.biz
+for-the.biz
+forgot.her.name
+forgot.his.name
+from-ak.com
+from-al.com
+from-ar.com
+from-az.net
+from-ca.com
+from-co.net
+from-ct.com
+from-dc.com
+from-de.com
+from-fl.com
+from-ga.com
+from-hi.com
+from-ia.com
+from-id.com
+from-il.com
+from-in.com
+from-ks.com
+from-ky.com
+from-la.net
+from-ma.com
+from-md.com
+from-me.org
+from-mi.com
+from-mn.com
+from-mo.com
+from-ms.com
+from-mt.com
+from-nc.com
+from-nd.com
+from-ne.com
+from-nh.com
+from-nj.com
+from-nm.com
+from-nv.com
+from-ny.net
+from-oh.com
+from-ok.com
+from-or.com
+from-pa.com
+from-pr.com
+from-ri.com
+from-sc.com
+from-sd.com
+from-tn.com
+from-tx.com
+from-ut.com
+from-va.com
+from-vt.com
+from-wa.com
+from-wi.com
+from-wv.com
+from-wy.com
+ftpaccess.cc
+fuettertdasnetz.de
+game-host.org
+game-server.cc
+getmyip.com
+gets-it.net
+go.dyndns.org
+gotdns.com
+gotdns.org
+groks-the.info
+groks-this.info
+ham-radio-op.net
+here-for-more.info
+hobby-site.com
+hobby-site.org
+home.dyndns.org
+homedns.org
+homeftp.net
+homeftp.org
+homeip.net
+homelinux.com
+homelinux.net
+homelinux.org
+homeunix.com
+homeunix.net
+homeunix.org
+iamallama.com
+in-the-band.net
+is-a-anarchist.com
+is-a-blogger.com
+is-a-bookkeeper.com
+is-a-bruinsfan.org
+is-a-bulls-fan.com
+is-a-candidate.org
+is-a-caterer.com
+is-a-celticsfan.org
+is-a-chef.com
+is-a-chef.net
+is-a-chef.org
+is-a-conservative.com
+is-a-cpa.com
+is-a-cubicle-slave.com
+is-a-democrat.com
+is-a-designer.com
+is-a-doctor.com
+is-a-financialadvisor.com
+is-a-geek.com
+is-a-geek.net
+is-a-geek.org
+is-a-green.com
+is-a-guru.com
+is-a-hard-worker.com
+is-a-hunter.com
+is-a-knight.org
+is-a-landscaper.com
+is-a-lawyer.com
+is-a-liberal.com
+is-a-libertarian.com
+is-a-linux-user.org
+is-a-llama.com
+is-a-musician.com
+is-a-nascarfan.com
+is-a-nurse.com
+is-a-painter.com
+is-a-patsfan.org
+is-a-personaltrainer.com
+is-a-photographer.com
+is-a-player.com
+is-a-republican.com
+is-a-rockstar.com
+is-a-socialist.com
+is-a-soxfan.org
+is-a-student.com
+is-a-teacher.com
+is-a-techie.com
+is-a-therapist.com
+is-an-accountant.com
+is-an-actor.com
+is-an-actress.com
+is-an-anarchist.com
+is-an-artist.com
+is-an-engineer.com
+is-an-entertainer.com
+is-by.us
+is-certified.com
+is-found.org
+is-gone.com
+is-into-anime.com
+is-into-cars.com
+is-into-cartoons.com
+is-into-games.com
+is-leet.com
+is-lost.org
+is-not-certified.com
+is-saved.org
+is-slick.com
+is-uberleet.com
+is-very-bad.org
+is-very-evil.org
+is-very-good.org
+is-very-nice.org
+is-very-sweet.org
+is-with-theband.com
+isa-geek.com
+isa-geek.net
+isa-geek.org
+isa-hockeynut.com
+issmarterthanyou.com
+isteingeek.de
+istmein.de
+kicks-ass.net
+kicks-ass.org
+knowsitall.info
+land-4-sale.us
+lebtimnetz.de
+leitungsen.de
+likes-pie.com
+likescandy.com
+merseine.nu
+mine.nu
+misconfused.org
+mypets.ws
+myphotos.cc
+neat-url.com
+office-on-the.net
+on-the-web.tv
+podzone.net
+podzone.org
+readmyblog.org
+saves-the-whales.com
+scrapper-site.net
+scrapping.cc
+selfip.biz
+selfip.com
+selfip.info
+selfip.net
+selfip.org
+sells-for-less.com
+sells-for-u.com
+sells-it.net
+sellsyourhome.org
+servebbs.com
+servebbs.net
+servebbs.org
+serveftp.net
+serveftp.org
+servegame.org
+shacknet.nu
+simple-url.com
+space-to-rent.com
+stuff-4-sale.org
+stuff-4-sale.us
+teaches-yoga.com
+thruhere.net
+traeumtgerade.de
+webhop.biz
+webhop.info
+webhop.net
+webhop.org
+worse-than.tv
+writesthisblog.com
+
+// Fastly Inc. http://www.fastly.com/
+// Requested by Vladimir Vuksan 2013-05-31
+a.ssl.fastly.net
+b.ssl.fastly.net
+global.ssl.fastly.net
+a.prod.fastly.net
+global.prod.fastly.net
+
+// GitHub, Inc.
+// Requested by Ben Toews 2013-04-18
+github.io
+
+// GlobeHosting, Inc.
+// Requested by Zoltan Egresi 2013-07-12
+ro.com
+
+// Google, Inc.
+// Requested by Eduardo Vela 2012-10-24
+appspot.com
+blogspot.be
+blogspot.bj
+blogspot.ca
+blogspot.cf
+blogspot.ch
+blogspot.co.at
+blogspot.co.il
+blogspot.co.nz
+blogspot.co.uk
+blogspot.com
+blogspot.com.ar
+blogspot.com.au
+blogspot.com.br
+blogspot.com.es
+blogspot.cv
+blogspot.cz
+blogspot.de
+blogspot.dk
+blogspot.fi
+blogspot.fr
+blogspot.gr
+blogspot.hk
+blogspot.hu
+blogspot.ie
+blogspot.in
+blogspot.it
+blogspot.jp
+blogspot.kr
+blogspot.mr
+blogspot.mx
+blogspot.nl
+blogspot.no
+blogspot.pt
+blogspot.re
+blogspot.ro
+blogspot.se
+blogspot.sg
+blogspot.sk
+blogspot.td
+blogspot.tw
+codespot.com
+googleapis.com
+googlecode.com
+
+// Heroku : https://www.heroku.com/
+// Requested by Tom Maher 2013-05-02
+herokuapp.com
+herokussl.com
+
+// iki.fi
+// Requested by Hannu Aronsson 2009-11-05
+iki.fi
+
+// info.at : http://www.info.at/
+biz.at
+info.at
+
+// Michau Enterprises Limited : http://www.co.pl/
+co.pl
+
+// NYC.mn : http://www.information.nyc.mn
+// Requested by Matthew Brown 2013-03-11
+nyc.mn
+
+// Opera Software, A.S.A.
+// Requested by Yngve Pettersen 2009-11-26
+operaunite.com
+
+// Red Hat, Inc. OpenShift : https://openshift.redhat.com/
+// Requested by Tim Kramer 2012-10-24
+rhcloud.com
+
+// priv.at : http://www.nic.priv.at/
+// Requested by registry 2008-06-09
+priv.at
+
+// ZaNiC : http://www.za.net/
+// Requested by registry 2009-10-03
+za.net
+za.org
+
+// ===END PRIVATE DOMAINS===
diff --git a/src/test/java/org/archive/net/PublicSuffixesTest.java b/src/test/java/org/archive/net/PublicSuffixesTest.java
new file mode 100644
index 00000000..b88acb6d
--- /dev/null
+++ b/src/test/java/org/archive/net/PublicSuffixesTest.java
@@ -0,0 +1,193 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.net;
+
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+
+import junit.framework.TestCase;
+
+import org.archive.net.PublicSuffixes.Node;
+
+/**
+ * Test cases for PublicSuffixes utility. Confirm expected matches/nonmatches
+ * from constructed regex.
+ *
+ * @author gojomo
+ */
+public class PublicSuffixesTest extends TestCase {
+ // Low-level tests: Node ordering (compareTo) and trie construction (addBranch).
+
+ public void testCompare() {
+ Node n = new Node("hoge"); // plain literal label
+ assertTrue(n.compareTo('a') > 0);
+ assertEquals(-1, n.compareTo('*'));
+ assertEquals(-1, n.compareTo('!'));
+ assertEquals(-1, n.compareTo(new Node("*,")));
+ assertEquals(-1, n.compareTo(new Node("!muga,")));
+ assertEquals(-1, n.compareTo(new Node("")));
+
+ n = new Node("*,"); // wildcard label
+ assertEquals(1, n.compareTo('a'));
+ assertEquals(0, n.compareTo('*'));
+ assertEquals(1, n.compareTo('!'));
+ assertEquals(0, n.compareTo(new Node("*,")));
+ assertEquals(1, n.compareTo(new Node("!muga,")));
+ assertEquals(-1, n.compareTo(new Node("")));
+
+ n = new Node("!hoge"); // exception label (leading '!')
+ assertEquals(1, n.compareTo('a'));
+ assertEquals(-1, n.compareTo('*'));
+ assertEquals(0, n.compareTo('!'));
+ assertEquals(-1, n.compareTo(new Node("*,")));
+ assertEquals(0, n.compareTo(new Node("!muga,"))); // expected equal even though the labels differ
+ assertEquals(-1, n.compareTo(new Node("")));
+
+ n = new Node(""); // empty label
+ assertEquals(1, n.compareTo('a'));
+ assertEquals(1, n.compareTo('*'));
+ assertEquals(1, n.compareTo('!'));
+ assertEquals(0, n.compareTo(new Node("")));
+ }
+
+ protected String dump(Node alt) { // captures the output of PublicSuffixes.dump(alt, 0, ...) as a String
+ StringWriter w = new StringWriter();
+ PublicSuffixes.dump(alt, 0, new PrintWriter(w));
+ return w.toString();
+ }
+ public void testTrie1() {
+ Node alt = new Node(null, new ArrayList());
+ alt.addBranch("ac,");
+ // A lone branch must not yield an empty-string ("") child in the dump.
+ assertEquals("(null)\n" +
+ " \"ac,\"\n", dump(alt));
+ alt.addBranch("ac,com,");
+ assertEquals("(null)\n" +
+ " \"ac,\"\n" +
+ " \"com,\"\n" +
+ " \"\"\n", dump(alt));
+ alt.addBranch("ac,edu,");
+ assertEquals("(null)\n" +
+ " \"ac,\"\n" +
+ " \"com,\"\n" +
+ " \"edu,\"\n" +
+ " \"\"\n", dump(alt));
+ }
+ public void testTrie2() {
+ Node alt = new Node(null, new ArrayList());
+ alt.addBranch("ac,");
+ alt.addBranch("*,");
+ assertEquals("(null)\n" +
+ " \"ac,\"\n" +
+ " \"*,\"\n", dump(alt));
+ }
+
+ public void testTrie3() {
+ Node alt = new Node(null, new ArrayList());
+ alt.addBranch("ac,");
+ alt.addBranch("ac,!hoge,");
+ alt.addBranch("ac,*,");
+ // The exception ("!hoge,") is expected to be listed before the wildcard ("*,").
+ assertEquals("(null)\n" +
+ " \"ac,\"\n" +
+ " \"!hoge,\"\n" +
+ " \"*,\"\n" +
+ " \"\"\n", dump(alt));
+ }
+
+ // Higher-level tests: matching SURT-form hostnames against the generated pattern.
+
+ Matcher m = PublicSuffixes.getTopmostAssignedSurtPrefixPattern()
+ .matcher(""); // shared matcher; each test calls m.reset(...) with its own input
+
+ public void testBasics() {
+ matchPrefix("com,example,www,", "com,example,");
+ matchPrefix("com,example,", "com,example,");
+ matchPrefix("org,archive,www,", "org,archive,");
+ matchPrefix("org,archive,", "org,archive,");
+ matchPrefix("fr,yahoo,www,", "fr,yahoo,");
+ matchPrefix("fr,yahoo,", "fr,yahoo,");
+ matchPrefix("au,com,foobar,www,", "au,com,foobar,");
+ matchPrefix("au,com,foobar,", "au,com,foobar,");
+ matchPrefix("uk,co,virgin,www,", "uk,co,virgin,");
+ matchPrefix("uk,co,virgin,", "uk,co,virgin,");
+ matchPrefix("au,com,example,www,", "au,com,example,");
+ matchPrefix("au,com,example,", "au,com,example,");
+ matchPrefix("jp,yokohama,public,assigned,www,",
+ "jp,yokohama,public,assigned,");
+ matchPrefix("jp,yokohama,public,assigned,", "jp,yokohama,public,assigned,");
+ }
+
+ public void testDomainWithDash() {
+ matchPrefix("de,bad-site,www", "de,bad-site,");
+ }
+
+ public void testDomainWithNumbers() {
+ matchPrefix("de,archive4u,www", "de,archive4u,");
+ }
+
+ public void testIPV4() {
+ assertEquals("unexpected reduction",
+ "1.2.3.4",
+ PublicSuffixes.reduceSurtToAssignmentLevel("1.2.3.4"));
+ }
+
+ public void testIPV6() {
+ assertEquals("unexpected reduction",
+ "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]",
+ PublicSuffixes.reduceSurtToAssignmentLevel(
+ "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]"));
+ }
+
+ public void testExceptions() {
+ matchPrefix("uk,bl,www,", "uk,bl,");
+ matchPrefix("uk,bl,", "uk,bl,");
+ matchPrefix("jp,tokyo,city,subdomain,", "jp,tokyo,city,");
+ matchPrefix("jp,tokyo,city,", "jp,tokyo,city,");
+ }
+
+ public void testFakeTLD() {
+ // We assume any new/unknown TLD should be treated as 2-level;
+ // this is preferable for our grouping purpose but might not be
+ // for a cookie-assigning browser (original purpose of publicsuffixlist)
+ matchPrefix("zzz,example,www,", "zzz,example,");
+ }
+
+ public void testUnsegmentedHostname() {
+ m.reset("example");
+ assertFalse("unexpected match found in 'example'", m.find());
+ }
+
+ public void testTopmostAssignedCaching() {
+ assertSame("topmostAssignedSurtPrefixPattern not cached",PublicSuffixes.getTopmostAssignedSurtPrefixPattern(),PublicSuffixes.getTopmostAssignedSurtPrefixPattern());
+ assertSame("topmostAssignedSurtPrefixRegex not cached",PublicSuffixes.getTopmostAssignedSurtPrefixRegex(),PublicSuffixes.getTopmostAssignedSurtPrefixRegex());
+ }
+
+ // TODO: test UTF domains?
+
+ protected void matchPrefix(String surtDomain, String expectedAssignedPrefix) { // asserts a match is found and equals the expected prefix
+ m.reset(surtDomain);
+ assertTrue("expected match not found in '" + surtDomain, m.find());
+ assertEquals("expected match not found", expectedAssignedPrefix, m
+ .group());
+ }
+}
From a54dd8eb11b13988a64fed9f0a1e94faf80dc03e Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Thu, 5 Dec 2013 17:47:26 -0800
Subject: [PATCH 25/27] moving a bunch of stuff from heritrix-commons to
ia-web-commons so that wayback doesn't have to depend on heritrix-commons
---
pom.xml | 11 +
.../org/archive/format/arc/ARCConstants.java | 211 ++++-
.../ConfigurableX509TrustManager.java | 188 ++++
.../httpclient/HttpRecorderGetMethod.java | 120 +++
.../httpclient/HttpRecorderMethod.java | 107 +++
.../httpclient/HttpRecorderPostMethod.java | 82 ++
.../SingleHttpConnectionManager.java | 70 ++
.../java/org/archive/httpclient/package.html | 24 +
.../org/archive/io/ArchiveFileConstants.java | 24 +
.../java/org/archive/io/ArchiveReader.java | 761 ++++++++++++++++
.../org/archive/io/ArchiveReaderFactory.java | 301 +++++++
.../java/org/archive/io/ArchiveRecord.java | 409 +++++++++
.../org/archive/io/ArchiveRecordHeader.java | 111 +++
.../org/archive/io/ArraySeekInputStream.java | 106 +++
.../archive/io/BufferedSeekInputStream.java | 217 +++++
.../java/org/archive/io/CharSubSequence.java | 90 ++
.../archive/io/CompositeFileInputStream.java | 97 ++
.../org/archive/io/CompositeFileReader.java | 40 +
src/main/java/org/archive/io/Endian.java | 125 +++
.../archive/io/GZIPMembersInputStream.java | 38 +
.../org/archive/io/GenerationFileHandler.java | 200 +++++
.../archive/io/GenericReplayCharSequence.java | 412 +++++++++
src/main/java/org/archive/io/GzipHeader.java | 26 +
.../org/archive/io/HeaderedArchiveRecord.java | 423 +++++++++
.../archive/io/LoudObjectOutputStream.java | 63 ++
.../org/archive/io/MiserOutputStream.java | 82 ++
.../org/archive/io/NoGzipMagicException.java | 26 +
.../io/ObjectPlusFilesInputStream.java | 143 +++
.../io/ObjectPlusFilesOutputStream.java | 134 +++
.../org/archive/io/OriginSeekInputStream.java | 121 +++
.../java/org/archive/io/Preformatter.java | 32 +
.../archive/io/RandomAccessInputStream.java | 180 ++++
.../archive/io/RandomAccessOutputStream.java | 69 ++
src/main/java/org/archive/io/ReadSource.java | 37 +
.../org/archive/io/RecorderIOException.java | 38 +
.../io/RecorderLengthExceededException.java | 39 +
.../archive/io/RecorderTimeoutException.java | 37 +
.../io/RecorderTooMuchHeaderException.java | 40 +
.../org/archive/io/RecordingInputStream.java | 355 ++++++++
.../org/archive/io/RecordingOutputStream.java | 576 ++++++++++++
.../archive/io/RecoverableIOException.java | 83 ++
.../io/RecyclingFastBufferedOutputStream.java | 37 +
.../org/archive/io/ReplayCharSequence.java | 77 ++
.../org/archive/io/ReplayInputStream.java | 325 +++++++
.../archive/io/RepositionableInputStream.java | 133 +++
.../org/archive/io/SafeSeekInputStream.java | 124 +++
.../java/org/archive/io/SeekInputStream.java | 81 ++
src/main/java/org/archive/io/SeekReader.java | 84 ++
.../archive/io/SeekReaderCharSequence.java | 56 ++
.../org/archive/io/SinkHandlerLogThread.java | 34 +
src/main/java/org/archive/io/UTF8Bytes.java | 37 +
src/main/java/org/archive/io/WriterPool.java | 343 +++++++
.../java/org/archive/io/WriterPoolMember.java | 487 ++++++++++
.../org/archive/io/WriterPoolSettings.java | 39 +
.../java/org/archive/io/arc/ARC2WCDX.java | 243 +++++
.../java/org/archive/io/arc/ARCConstants.java | 29 +
.../java/org/archive/io/arc/ARCLocation.java | 37 +
.../java/org/archive/io/arc/ARCReader.java | 553 ++++++++++++
.../org/archive/io/arc/ARCReaderFactory.java | 454 ++++++++++
.../java/org/archive/io/arc/ARCRecord.java | 835 ++++++++++++++++++
.../org/archive/io/arc/ARCRecordMetaData.java | 267 ++++++
.../java/org/archive/io/arc/ARCUtils.java | 240 +++++
.../java/org/archive/io/arc/ARCWriter.java | 459 ++++++++++
.../org/archive/io/arc/ARCWriterPool.java | 69 ++
.../io/arc/WriterPoolSettingsData.java | 80 ++
src/main/java/org/archive/io/package.html | 9 +
.../org/archive/io/warc/WARCConstants.java | 24 +
.../java/org/archive/io/warc/WARCReader.java | 287 ++++++
.../archive/io/warc/WARCReaderFactory.java | 307 +++++++
.../java/org/archive/io/warc/WARCRecord.java | 233 +++++
.../org/archive/io/warc/WARCRecordInfo.java | 139 +++
.../java/org/archive/io/warc/WARCWriter.java | 436 +++++++++
.../org/archive/io/warc/WARCWriterPool.java | 64 ++
.../io/warc/WARCWriterPoolSettings.java | 32 +
.../io/warc/WARCWriterPoolSettingsData.java | 40 +
.../java/org/archive/io/warc/package.html | 38 +
.../archive/net/DownloadURLConnection.java | 131 +++
.../java/org/archive/net/FTPException.java | 56 ++
.../java/org/archive/net/md5/Handler.java | 87 ++
.../org/archive/net/md5/Md5URLConnection.java | 34 +
.../java/org/archive/net/rsync/Handler.java | 71 ++
.../archive/net/rsync/RsyncURLConnection.java | 51 ++
.../org/archive/uid/RecordIDGenerator.java | 72 ++
.../java/org/archive/uid/UUIDGenerator.java | 72 ++
src/main/java/org/archive/uid/package.html | 28 +
src/main/java/org/archive/util/DevUtils.java | 116 +++
src/main/java/org/archive/util/FileUtils.java | 712 +++++++++++++++
.../org/archive/util/InetAddressUtil.java | 116 +++
.../archive/util/IterableLineIterator.java | 26 +
.../java/org/archive/util/LaxHttpParser.java | 242 +++++
.../java/org/archive/util/MimetypeUtils.java | 75 ++
.../java/org/archive/util/ProcessUtils.java | 151 ++++
.../util/ProgressStatisticsReporter.java | 36 +
.../java/org/archive/util/PropertyUtils.java | 114 +++
src/main/java/org/archive/util/Recorder.java | 593 +++++++++++++
src/main/java/org/archive/util/Reporter.java | 56 ++
.../org/archive/util/anvl/ANVLRecord.java | 336 +++++++
.../java/org/archive/util/anvl/Element.java | 73 ++
.../java/org/archive/util/anvl/Label.java | 41 +
.../org/archive/util/anvl/SubElement.java | 78 ++
.../java/org/archive/util/anvl/Value.java | 71 ++
.../java/org/archive/util/anvl/package.html | 42 +
102 files changed, 16459 insertions(+), 1 deletion(-)
create mode 100644 src/main/java/org/archive/httpclient/ConfigurableX509TrustManager.java
create mode 100644 src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java
create mode 100644 src/main/java/org/archive/httpclient/HttpRecorderMethod.java
create mode 100644 src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java
create mode 100644 src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java
create mode 100644 src/main/java/org/archive/httpclient/package.html
create mode 100644 src/main/java/org/archive/io/ArchiveFileConstants.java
create mode 100644 src/main/java/org/archive/io/ArchiveReader.java
create mode 100644 src/main/java/org/archive/io/ArchiveReaderFactory.java
create mode 100644 src/main/java/org/archive/io/ArchiveRecord.java
create mode 100644 src/main/java/org/archive/io/ArchiveRecordHeader.java
create mode 100644 src/main/java/org/archive/io/ArraySeekInputStream.java
create mode 100644 src/main/java/org/archive/io/BufferedSeekInputStream.java
create mode 100644 src/main/java/org/archive/io/CharSubSequence.java
create mode 100644 src/main/java/org/archive/io/CompositeFileInputStream.java
create mode 100644 src/main/java/org/archive/io/CompositeFileReader.java
create mode 100644 src/main/java/org/archive/io/Endian.java
create mode 100644 src/main/java/org/archive/io/GZIPMembersInputStream.java
create mode 100644 src/main/java/org/archive/io/GenerationFileHandler.java
create mode 100644 src/main/java/org/archive/io/GenericReplayCharSequence.java
create mode 100644 src/main/java/org/archive/io/GzipHeader.java
create mode 100644 src/main/java/org/archive/io/HeaderedArchiveRecord.java
create mode 100644 src/main/java/org/archive/io/LoudObjectOutputStream.java
create mode 100644 src/main/java/org/archive/io/MiserOutputStream.java
create mode 100644 src/main/java/org/archive/io/NoGzipMagicException.java
create mode 100644 src/main/java/org/archive/io/ObjectPlusFilesInputStream.java
create mode 100644 src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java
create mode 100644 src/main/java/org/archive/io/OriginSeekInputStream.java
create mode 100644 src/main/java/org/archive/io/Preformatter.java
create mode 100644 src/main/java/org/archive/io/RandomAccessInputStream.java
create mode 100644 src/main/java/org/archive/io/RandomAccessOutputStream.java
create mode 100644 src/main/java/org/archive/io/ReadSource.java
create mode 100644 src/main/java/org/archive/io/RecorderIOException.java
create mode 100644 src/main/java/org/archive/io/RecorderLengthExceededException.java
create mode 100644 src/main/java/org/archive/io/RecorderTimeoutException.java
create mode 100644 src/main/java/org/archive/io/RecorderTooMuchHeaderException.java
create mode 100644 src/main/java/org/archive/io/RecordingInputStream.java
create mode 100644 src/main/java/org/archive/io/RecordingOutputStream.java
create mode 100644 src/main/java/org/archive/io/RecoverableIOException.java
create mode 100644 src/main/java/org/archive/io/RecyclingFastBufferedOutputStream.java
create mode 100644 src/main/java/org/archive/io/ReplayCharSequence.java
create mode 100644 src/main/java/org/archive/io/ReplayInputStream.java
create mode 100644 src/main/java/org/archive/io/RepositionableInputStream.java
create mode 100644 src/main/java/org/archive/io/SafeSeekInputStream.java
create mode 100644 src/main/java/org/archive/io/SeekInputStream.java
create mode 100644 src/main/java/org/archive/io/SeekReader.java
create mode 100644 src/main/java/org/archive/io/SeekReaderCharSequence.java
create mode 100644 src/main/java/org/archive/io/SinkHandlerLogThread.java
create mode 100644 src/main/java/org/archive/io/UTF8Bytes.java
create mode 100644 src/main/java/org/archive/io/WriterPool.java
create mode 100644 src/main/java/org/archive/io/WriterPoolMember.java
create mode 100644 src/main/java/org/archive/io/WriterPoolSettings.java
create mode 100644 src/main/java/org/archive/io/arc/ARC2WCDX.java
create mode 100644 src/main/java/org/archive/io/arc/ARCConstants.java
create mode 100644 src/main/java/org/archive/io/arc/ARCLocation.java
create mode 100644 src/main/java/org/archive/io/arc/ARCReader.java
create mode 100644 src/main/java/org/archive/io/arc/ARCReaderFactory.java
create mode 100644 src/main/java/org/archive/io/arc/ARCRecord.java
create mode 100644 src/main/java/org/archive/io/arc/ARCRecordMetaData.java
create mode 100644 src/main/java/org/archive/io/arc/ARCUtils.java
create mode 100644 src/main/java/org/archive/io/arc/ARCWriter.java
create mode 100644 src/main/java/org/archive/io/arc/ARCWriterPool.java
create mode 100644 src/main/java/org/archive/io/arc/WriterPoolSettingsData.java
create mode 100644 src/main/java/org/archive/io/package.html
create mode 100644 src/main/java/org/archive/io/warc/WARCConstants.java
create mode 100644 src/main/java/org/archive/io/warc/WARCReader.java
create mode 100644 src/main/java/org/archive/io/warc/WARCReaderFactory.java
create mode 100644 src/main/java/org/archive/io/warc/WARCRecord.java
create mode 100644 src/main/java/org/archive/io/warc/WARCRecordInfo.java
create mode 100644 src/main/java/org/archive/io/warc/WARCWriter.java
create mode 100644 src/main/java/org/archive/io/warc/WARCWriterPool.java
create mode 100644 src/main/java/org/archive/io/warc/WARCWriterPoolSettings.java
create mode 100644 src/main/java/org/archive/io/warc/WARCWriterPoolSettingsData.java
create mode 100644 src/main/java/org/archive/io/warc/package.html
create mode 100644 src/main/java/org/archive/net/DownloadURLConnection.java
create mode 100644 src/main/java/org/archive/net/FTPException.java
create mode 100644 src/main/java/org/archive/net/md5/Handler.java
create mode 100644 src/main/java/org/archive/net/md5/Md5URLConnection.java
create mode 100644 src/main/java/org/archive/net/rsync/Handler.java
create mode 100644 src/main/java/org/archive/net/rsync/RsyncURLConnection.java
create mode 100644 src/main/java/org/archive/uid/RecordIDGenerator.java
create mode 100644 src/main/java/org/archive/uid/UUIDGenerator.java
create mode 100644 src/main/java/org/archive/uid/package.html
create mode 100644 src/main/java/org/archive/util/DevUtils.java
create mode 100644 src/main/java/org/archive/util/FileUtils.java
create mode 100644 src/main/java/org/archive/util/InetAddressUtil.java
create mode 100644 src/main/java/org/archive/util/IterableLineIterator.java
create mode 100644 src/main/java/org/archive/util/LaxHttpParser.java
create mode 100644 src/main/java/org/archive/util/MimetypeUtils.java
create mode 100644 src/main/java/org/archive/util/ProcessUtils.java
create mode 100644 src/main/java/org/archive/util/ProgressStatisticsReporter.java
create mode 100644 src/main/java/org/archive/util/PropertyUtils.java
create mode 100644 src/main/java/org/archive/util/Recorder.java
create mode 100644 src/main/java/org/archive/util/Reporter.java
create mode 100644 src/main/java/org/archive/util/anvl/ANVLRecord.java
create mode 100644 src/main/java/org/archive/util/anvl/Element.java
create mode 100644 src/main/java/org/archive/util/anvl/Label.java
create mode 100644 src/main/java/org/archive/util/anvl/SubElement.java
create mode 100644 src/main/java/org/archive/util/anvl/Value.java
create mode 100644 src/main/java/org/archive/util/anvl/package.html
diff --git a/pom.xml b/pom.xml
index 03b1240d..c714fe8c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -124,6 +124,17 @@
httpcore4.3
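+    <dependency>
+      <groupId>joda-time</groupId>
+      <artifactId>joda-time</artifactId>
+      <version>1.6</version>
+    </dependency>
+    <dependency>
+      <groupId>fastutil</groupId>
+      <artifactId>fastutil</artifactId>
+      <version>5.0.7</version>
+      <scope>compile</scope>
+    </dependency>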
diff --git a/src/main/java/org/archive/format/arc/ARCConstants.java b/src/main/java/org/archive/format/arc/ARCConstants.java
index 6bfc5a99..a336ddeb 100755
--- a/src/main/java/org/archive/format/arc/ARCConstants.java
+++ b/src/main/java/org/archive/format/arc/ARCConstants.java
@@ -1,8 +1,20 @@
package org.archive.format.arc;
import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.List;
+import java.util.zip.Deflater;
+import java.util.zip.GZIPInputStream;
-public interface ARCConstants {
+import org.archive.format.ArchiveFileConstants;
+import org.archive.util.zip.GzipHeader;
+
+/**
+ * Constants used by ARC files and in ARC file processing.
+ *
+ * @author stack
+ */
+public interface ARCConstants extends ArchiveFileConstants {
public final static int MAX_META_LENGTH = 1024 * 32;
public final static Charset ARC_META_CHARSET = Charset.forName("utf-8");
public final static int NEW_LINE_ORD = 10;
@@ -25,4 +37,201 @@ public interface ARCConstants {
public static final String FILEDESC_SCHEME = "filedesc:/";
public static final String DNS_MIME = "text/dns";
public static final String ALEXA_DAT_MIME = "alexa/dat";
+
+ /**
+ * Default maximum ARC file size.
+ */
+ public static final long DEFAULT_MAX_ARC_FILE_SIZE = 100000000;
+
+ /**
+ * Maximum length for a metadata line.
+ */
+ public static final int MAX_METADATA_LINE_LENGTH = (4 * 1024);
+
+ /**
+ * ARC file extension.
+ */
+ public static final String ARC_FILE_EXTENSION = "arc";
+
+ /**
+ * Dot ARC file extension.
+ */
+ public static final String DOT_ARC_FILE_EXTENSION =
+ "." + ARC_FILE_EXTENSION;
+
+ public static final String DOT_COMPRESSED_FILE_EXTENSION =
+ ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION;
+
+ /**
+ * Compressed arc file extension.
+ */
+ public static final String COMPRESSED_ARC_FILE_EXTENSION =
+ ARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION;
+
+ /**
+ * Compressed dot arc file extension.
+ */
+ public static final String DOT_COMPRESSED_ARC_FILE_EXTENSION =
+ DOT_ARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION;
+
+ /**
+ * Encoding to use getting bytes from strings.
+ *
+ * Specify an encoding rather than leave it to chance, i.e. whatever the
+ * JVM's encoding is. Use an encoding that gets the stream as bytes, not chars.
+ */
+ public static final String DEFAULT_ENCODING = "ISO-8859-1";
+
+ /**
+ * ARC file line separator character.
+ *
+ * This is what the alexa c-code looks for delimiting lines.
+ */
+ public static final char LINE_SEPARATOR = '\n';
+
+ /**
+ * ARC header field separator character.
+ */
+ public static final char HEADER_FIELD_SEPARATOR = ' ';
+
+ /**
+ * ARC file *MAGIC NUMBER*.
+ *
+ * Every ARC file must begin w/ this.
+ */
+ public static final String ARC_MAGIC_NUMBER = "filedesc://";
+
+ /**
+ * The FLG.FEXTRA field that is added to ARC files. (See RFC1952 to
+ * understand FLG.FEXTRA).
+ */
+ public static final byte[] ARC_GZIP_EXTRA_FIELD = { 8, 0, 'L', 'X', 4, 0,
+ 0, 0, 0, 0 };
+
+ /**
+ * Key for the ARC Header IP field.
+ *
+ * Lowercased.
+ */
+ public static final String IP_HEADER_FIELD_KEY = "ip-address";
+
+ /**
+ * Key for the ARC Header Result Code field.
+ *
+ * Lowercased.
+ */
+ public static final String CODE_HEADER_FIELD_KEY = "result-code";
+
+ /**
+ * Key for the ARC Header Checksum field.
+ *
+ * Lowercased.
+ */
+ public static final String CHECKSUM_HEADER_FIELD_KEY = "checksum";
+
+ /**
+ * Key for the ARC Header Location field.
+ *
+ * Lowercased.
+ */
+ public static final String LOCATION_HEADER_FIELD_KEY = "location";
+
+ /**
+ * Key for the ARC Header Offset field.
+ *
+ * Lowercased.
+ */
+ public static final String OFFSET_HEADER_FIELD_KEY = "offset";
+
+ /**
+ * Key for the ARC Header filename field.
+ *
+ * Lowercased.
+ */
+ public static final String FILENAME_HEADER_FIELD_KEY = "filename";
+
+ /**
+ * Key for statuscode field.
+ */
+ public static final String STATUSCODE_FIELD_KEY = "statuscode";
+
+ /**
+ * Key for offset field.
+ */
+ public static final String OFFSET_FIELD_KEY = OFFSET_HEADER_FIELD_KEY;
+
+ /**
+ * Key for filename field.
+ */
+ public static final String FILENAME_FIELD_KEY = FILENAME_HEADER_FIELD_KEY;
+
+ /**
+ * Key for checksum field.
+ */
+ public static final String CHECKSUM_FIELD_KEY = CHECKSUM_HEADER_FIELD_KEY;
+
+ /**
+ * Tokenized field prefix.
+ *
+ * Use this prefix for tokenized fields when naming fields in
+ * an index.
+ */
+ public static final String TOKENIZED_PREFIX = "tokenized_";
+
+ /**
+ * Assumed maximum size of a record meta header line.
+ *
+ * This is 100k, which seems massive, but it's the same as the LINE_LENGTH from
+ * alexa/include/a_arcio.h:
+ *
+ * #define LINE_LENGTH (100*1024)
+ *
+ */
+ public static final int MAX_HEADER_LINE_LENGTH = 1024 * 100;
+
+ /**
+ * Version 1 required metadata fields.
+ */
+ public static List REQUIRED_VERSION_1_HEADER_FIELDS = Arrays
+ .asList(new String[] { URL_FIELD_KEY, IP_HEADER_FIELD_KEY,
+ DATE_FIELD_KEY, MIMETYPE_FIELD_KEY,
+ LENGTH_FIELD_KEY, VERSION_FIELD_KEY,
+ ABSOLUTE_OFFSET_KEY });
+
+ /**
+ * Minimum possible record length.
+ *
+ * This is a rough calc. When the header is data it will occupy less space.
+ */
+ public static int MINIMUM_RECORD_LENGTH = 1 + "://".length() + 1
+ + ARC_FILE_EXTENSION.length() + " ".length() + +1 + " ".length()
+ + 1 + " ".length() + 1 + "/".length() + 1 + " ".length() + 1;
+
+ /**
+ * Start of a GZIP header that uses default deflater.
+ */
+ public static final byte[] GZIP_HEADER_BEGIN = {
+ (byte) GZIPInputStream.GZIP_MAGIC, // Magic number (short)
+ (byte) (GZIPInputStream.GZIP_MAGIC >> 8), // Magic number (short)
+ Deflater.DEFLATED // Compression method (CM)
+ };
+
+ /**
+ * Length of minimal 'default' GZIP header.
+ *
+ * See RFC1952 for explanation of value of 10.
+ */
+ public static final int DEFAULT_GZIP_HEADER_LENGTH =
+ GzipHeader.MINIMAL_GZIP_HEADER_LENGTH;
+
+ /**
+ * set of known errors encountered reading ARCs
+ */
+ public enum ArcRecordErrors {
+ HTTP_HEADER_TRUNCATED,
+ HTTP_STATUS_LINE_INVALID,
+ HTTP_STATUS_LINE_EXCEPTION,
+ }
+
+
}
diff --git a/src/main/java/org/archive/httpclient/ConfigurableX509TrustManager.java b/src/main/java/org/archive/httpclient/ConfigurableX509TrustManager.java
new file mode 100644
index 00000000..45a89ba6
--- /dev/null
+++ b/src/main/java/org/archive/httpclient/ConfigurableX509TrustManager.java
@@ -0,0 +1,188 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.httpclient;
+
+import java.security.KeyStore;
+import java.security.KeyStoreException;
+import java.security.NoSuchAlgorithmException;
+import java.security.cert.CertificateException;
+import java.security.cert.X509Certificate;
+import java.util.logging.Logger;
+
+import javax.net.ssl.TrustManager;
+import javax.net.ssl.TrustManagerFactory;
+import javax.net.ssl.X509TrustManager;
+
+/**
+ * A configurable trust manager built on X509TrustManager.
+ *
+ * If set to 'open' trust, the default, will get us into sites for whom we do
+ * not have the CA or any of intermediary CAs that go to make up the cert chain
+ * of trust. Will also get us past selfsigned and expired certs. 'loose'
+ * trust will get us into sites w/ valid certs even if they are just
+ * selfsigned. 'normal' is any valid cert not including selfsigned. 'strict'
+ * means cert must be valid and the cert DN must match server name.
+ *
+ * <p>TODO: Move to an ssl subpackage when we have other classes other than
+ * just this one.
+ *
+ * @author stack
+ * @version $Id$
+ */
+public class ConfigurableX509TrustManager implements X509TrustManager
+{
+ /**
+ * Logging instance.
+ */
+ protected static Logger logger = Logger.getLogger(
+ "org.archive.httpclient.ConfigurableX509TrustManager");
+
+ public static enum TrustLevel {
+ /**
+ * Trust anything given us.
+ *
+ * Default setting.
+ *
+ * <p>See
+ * e502. Disabling Certificate Validation in an HTTPS Connection from
+ * the java almanac for how to trust all.
+ */
+ OPEN,
+
+ /**
+ * Trust any valid cert including self-signed certificates.
+ */
+ LOOSE,
+
+ /**
+ * Normal jsse behavior.
+ *
+ * Seemingly any certificate that supplies valid chain of trust.
+ */
+ NORMAL,
+
+ /**
+ * Strict trust.
+ *
+ * Ensure server has same name as cert DN.
+ */
+ STRICT,
+ }
+
+ /**
+ * Default setting for trust level.
+ */
+ public final static TrustLevel DEFAULT = TrustLevel.OPEN;
+
+ /**
+ * Trust level.
+ */
+ private TrustLevel trustLevel = DEFAULT;
+
+
+ /**
+ * An instance of the SUNX509TrustManager that we adapt variously
+ * depending upon passed configuration.
+ *
+ * We have it do all the work we don't want to.
+ */
+ private X509TrustManager standardTrustManager = null;
+
+
+ public ConfigurableX509TrustManager()
+ throws NoSuchAlgorithmException, KeyStoreException {
+ this(DEFAULT);
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param level Level of trust to effect.
+ *
+ * @throws NoSuchAlgorithmException
+ * @throws KeyStoreException
+ */
+ public ConfigurableX509TrustManager(TrustLevel level)
+ throws NoSuchAlgorithmException, KeyStoreException {
+ super();
+ TrustManagerFactory factory = TrustManagerFactory.
+ getInstance(TrustManagerFactory.getDefaultAlgorithm());
+
+ // Pass in a null (Trust) KeyStore. Null says use the 'default'
+ // 'trust' keystore (KeyStore class is used to hold keys and to hold
+ // 'trusts' (certs)). See 'X509TrustManager Interface' in this doc:
+ // http://java.sun.com
+ // /j2se/1.4.2/docs/guide/security/jsse/JSSERefGuide.html#Introduction
+ factory.init((KeyStore)null);
+ TrustManager[] trustmanagers = factory.getTrustManagers();
+ if (trustmanagers.length == 0) {
+ throw new NoSuchAlgorithmException(TrustManagerFactory.
+ getDefaultAlgorithm() + " trust manager not supported");
+ }
+ this.standardTrustManager = (X509TrustManager)trustmanagers[0];
+
+ this.trustLevel = level;
+ }
+
+ public void checkClientTrusted(X509Certificate[] certificates, String type)
+ throws CertificateException {
+ if (this.trustLevel.equals(TrustLevel.OPEN)) {
+ return;
+ }
+
+ this.standardTrustManager.checkClientTrusted(certificates, type);
+ }
+
+ public void checkServerTrusted(X509Certificate[] certificates, String type)
+ throws CertificateException {
+ if (this.trustLevel.equals(TrustLevel.OPEN)) {
+ return;
+ }
+
+ try {
+ this.standardTrustManager.checkServerTrusted(certificates, type);
+ if (this.trustLevel.equals(TrustLevel.STRICT)) {
+ logger.severe(TrustLevel.STRICT + " not implemented.");
+ }
+ } catch (CertificateException e) {
+ if (this.trustLevel.equals(TrustLevel.LOOSE) &&
+ certificates != null && certificates.length == 1)
+ {
+ // If only one cert and it's valid and it caused a
+ // CertificateException, assume it's self-signed.
+ X509Certificate certificate = certificates[0];
+ certificate.checkValidity();
+ } else {
+ // If we got to here, then we're probably NORMAL. Rethrow.
+ throw e;
+ }
+ }
+ }
+
+ public X509Certificate[] getAcceptedIssuers() {
+ return this.standardTrustManager.getAcceptedIssuers();
+ }
+}
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java
new file mode 100644
index 00000000..105c4f7e
--- /dev/null
+++ b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java
@@ -0,0 +1,120 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.httpclient;
+
+import java.io.IOException;
+import java.util.logging.Logger;
+
+import org.apache.commons.httpclient.HttpConnection;
+import org.apache.commons.httpclient.HttpException;
+import org.apache.commons.httpclient.HttpState;
+import org.apache.commons.httpclient.methods.GetMethod;
+import org.archive.util.Recorder;
+
+
+/**
+ * Override of GetMethod that marks the passed HttpRecorder w/ the transition
+ * from HTTP head to body and that forces a close on the http connection.
+ *
+ * The actions done in this subclass used to be done by copying
+ * org.apache.commons.HttpMethodBase, overlaying our version in place of the
+ * one that came w/ httpclient. Here is the patch of the difference between
+ * shipped httpclient code and our mods:
+ * <p>We're not supposed to have access to the underlying connection object;
+ * am only violating contract because see cases where httpclient is skipping
+ * out w/o cleaning up after itself.
+ *
+ * @author stack
+ * @version $Revision$, $Date$
+ */
+public class HttpRecorderGetMethod extends GetMethod {
+
+ protected static Logger logger =
+ Logger.getLogger(HttpRecorderGetMethod.class.getName());
+
+ /**
+ * Instance of http recorder method.
+ */
+ protected HttpRecorderMethod httpRecorderMethod = null;
+
+
+ public HttpRecorderGetMethod(String uri, Recorder recorder) {
+ super(uri);
+ this.httpRecorderMethod = new HttpRecorderMethod(recorder);
+ }
+
+ protected void readResponseBody(HttpState state, HttpConnection connection)
+ throws IOException, HttpException {
+ // We're about to read the body. Mark transition in http recorder.
+ this.httpRecorderMethod.markContentBegin(connection);
+ super.readResponseBody(state, connection);
+ }
+
+ protected boolean shouldCloseConnection(HttpConnection conn) {
+ // Always close connection after each request. As best I can tell, this
+ // is superfluous -- we've set our client to be HTTP/1.0. Doing this
+ // out of paranoia.
+ return true;
+ }
+
+ public int execute(HttpState state, HttpConnection conn)
+ throws HttpException, IOException {
+ // Save off the connection so we can close it on our way out in case
+ // httpclient fails to (We're not supposed to have access to the
+ // underlying connection object; am only violating contract because
+ // see cases where httpclient is skipping out w/o cleaning up
+ // after itself).
+ this.httpRecorderMethod.setConnection(conn);
+ return super.execute(state, conn);
+ }
+
+ protected void addProxyConnectionHeader(HttpState state, HttpConnection conn)
+ throws IOException, HttpException {
+ super.addProxyConnectionHeader(state, conn);
+ this.httpRecorderMethod.handleAddProxyConnectionHeader(this);
+ }
+}
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java
new file mode 100644
index 00000000..932e7e98
--- /dev/null
+++ b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java
@@ -0,0 +1,107 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.httpclient;
+
+import java.util.logging.Logger;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpConnection;
+import org.apache.commons.httpclient.HttpMethod;
+import org.archive.util.Recorder;
+
+
+/**
+ * This class encapsulates the specializations supplied by the
+ * overrides {@link HttpRecorderGetMethod} and {@link HttpRecorderPostMethod}.
+ *
+ * It keeps instance of HttpRecorder and HttpConnection.
+ *
+ * @author stack
+ * @version $Revision$, $Date$
+ */
+public class HttpRecorderMethod {
+ protected static Logger logger =
+ Logger.getLogger(HttpRecorderMethod.class.getName());
+
+ /**
+ * Instance of http recorder we're using recording this http get.
+ */
+ private Recorder httpRecorder = null;
+
+ /**
+ * Save around so can force close.
+ *
+ * See [ 922080 ] IllegalArgumentException (size is wrong).
+ * https://sourceforge.net/tracker/?func=detail&aid=922080&group_id=73833&atid=539099
+ */
+ private HttpConnection connection = null;
+
+
+ public HttpRecorderMethod(Recorder recorder) {
+ this.httpRecorder = recorder;
+ }
+
+ public void markContentBegin(HttpConnection c) {
+ if (c != this.connection) {
+ // We're checking that we're not being asked to work on
+ // a connection that is other than the one we started
+ // this method#execute with.
+ throw new IllegalArgumentException("Connections differ: " +
+ this.connection + " " + c + " " +
+ Thread.currentThread().getName());
+ }
+ this.httpRecorder.markContentBegin();
+ }
+
+ /**
+ * @return Returns the connection.
+ */
+ public HttpConnection getConnection() {
+ return this.connection;
+ }
+
+ /**
+ * @param connection The connection to set.
+ */
+ public void setConnection(HttpConnection connection) {
+ this.connection = connection;
+ }
+ /**
+ * @return Returns the httpRecorder.
+ */
+ public Recorder getHttpRecorder() {
+ return httpRecorder;
+ }
+
+ /**
+ * If a 'Proxy-Connection' header has been added to the request,
+ * it'll be of a 'keep-alive' type. Until we support 'keep-alives',
+ * override the Proxy-Connection setting and instead pass a 'close'
+ * (Otherwise every request has to timeout before we notice
+ * end-of-document).
+ * @param method Method to find proxy-connection header in.
+ */
+ public void handleAddProxyConnectionHeader(HttpMethod method) {
+ Header h = method.getRequestHeader("Proxy-Connection");
+ if (h != null) {
+ h.setValue("close");
+ method.setRequestHeader(h);
+ }
+ }
+}
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java
new file mode 100644
index 00000000..20f1bfd1
--- /dev/null
+++ b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java
@@ -0,0 +1,82 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.httpclient;
+
+import java.io.IOException;
+
+import org.apache.commons.httpclient.HttpConnection;
+import org.apache.commons.httpclient.HttpException;
+import org.apache.commons.httpclient.HttpState;
+import org.apache.commons.httpclient.methods.PostMethod;
+import org.archive.util.Recorder;
+
+
+/**
+ * Override of PostMethod that marks the passed HttpRecorder w/ the transition
+ * from HTTP head to body and that forces a close on the responseConnection.
+ *
+ * This is a copy of {@link HttpRecorderGetMethod}. Only difference is the
+ * parent class.
+ *
+ * @author stack
+ * @version $Date$ $Revision$
+ */
+public class HttpRecorderPostMethod extends PostMethod {
+ /**
+ * Instance of http recorder method.
+ */
+ protected HttpRecorderMethod httpRecorderMethod = null;
+
+
+ public HttpRecorderPostMethod(String uri, Recorder recorder) {
+ super(uri);
+ this.httpRecorderMethod = new HttpRecorderMethod(recorder);
+ }
+
+ protected void readResponseBody(HttpState state, HttpConnection connection)
+ throws IOException, HttpException {
+ // We're about to read the body. Mark transition in http recorder.
+ this.httpRecorderMethod.markContentBegin(connection);
+ super.readResponseBody(state, connection);
+ }
+
+ protected boolean shouldCloseConnection(HttpConnection conn) {
+ // Always close connection after each request. As best I can tell, this
+ // is superfluous -- we've set our client to be HTTP/1.0. Doing this
+ // out of paranoia.
+ return true;
+ }
+
+ public int execute(HttpState state, HttpConnection conn)
+ throws HttpException, IOException {
+ // Save off the connection so we can close it on our way out in case
+ // httpclient fails to (We're not supposed to have access to the
+ // underlying connection object; am only violating contract because
+ // see cases where httpclient is skipping out w/o cleaning up
+ // after itself).
+ this.httpRecorderMethod.setConnection(conn);
+ return super.execute(state, conn);
+ }
+
+ protected void addProxyConnectionHeader(HttpState state, HttpConnection conn)
+ throws IOException, HttpException {
+ super.addProxyConnectionHeader(state, conn);
+ this.httpRecorderMethod.handleAddProxyConnectionHeader(this);
+ }
+}
diff --git a/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java
new file mode 100644
index 00000000..4ba6a837
--- /dev/null
+++ b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java
@@ -0,0 +1,70 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.httpclient;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.httpclient.HostConfiguration;
+import org.apache.commons.httpclient.HttpConnection;
+import org.apache.commons.httpclient.SimpleHttpConnectionManager;
+
+/**
+ * An HttpClient-compatible HttpConnection "manager" that actually
+ * just gives out a new connection each time -- skipping the overhead
+ * of connection management, since we already throttle our crawler
+ * with external mechanisms.
+ *
+ * @author gojomo
+ */
+public class SingleHttpConnectionManager extends SimpleHttpConnectionManager {
+
+ public SingleHttpConnectionManager() {
+ super();
+ }
+
+ public HttpConnection getConnectionWithTimeout(
+ HostConfiguration hostConfiguration, long timeout) {
+
+ HttpConnection conn = new HttpConnection(hostConfiguration);
+ conn.setHttpConnectionManager(this);
+ conn.getParams().setDefaults(this.getParams());
+ return conn;
+ }
+
+ public void releaseConnection(HttpConnection conn) {
+ // ensure connection is closed
+ conn.close();
+ finishLast(conn);
+ }
+
+ protected static void finishLast(HttpConnection conn) {
+ // copied from superclass because it wasn't made available to subclasses
+ InputStream lastResponse = conn.getLastResponseInputStream();
+ if (lastResponse != null) {
+ conn.setLastResponseInputStream(null);
+ try {
+ lastResponse.close();
+ } catch (IOException ioe) {
+ //FIXME: badness - close to force reconnect.
+ conn.close();
+ }
+ }
+ }
+}
diff --git a/src/main/java/org/archive/httpclient/package.html b/src/main/java/org/archive/httpclient/package.html
new file mode 100644
index 00000000..87ae77ed
--- /dev/null
+++ b/src/main/java/org/archive/httpclient/package.html
@@ -0,0 +1,24 @@
+
+
+
+
+
+
diff --git a/src/main/java/org/archive/io/ArchiveFileConstants.java b/src/main/java/org/archive/io/ArchiveFileConstants.java
new file mode 100644
index 00000000..b1a39194
--- /dev/null
+++ b/src/main/java/org/archive/io/ArchiveFileConstants.java
@@ -0,0 +1,24 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+@Deprecated
+public interface ArchiveFileConstants extends org.archive.format.ArchiveFileConstants {
+}
diff --git a/src/main/java/org/archive/io/ArchiveReader.java b/src/main/java/org/archive/io/ArchiveReader.java
new file mode 100644
index 00000000..66056d33
--- /dev/null
+++ b/src/main/java/org/archive/io/ArchiveReader.java
@@ -0,0 +1,761 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+
+import java.io.BufferedInputStream;
+import java.io.BufferedWriter;
+import java.io.Closeable;
+import java.io.EOFException;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.archive.util.MimetypeUtils;
+import org.archive.util.zip.GZIPMembersInputStream;
+
+import com.google.common.io.CountingInputStream;
+
+
+/**
+ * Reader for an Archive file of Archive {@link ArchiveRecord}s.
+ * @author stack
+ * @version $Date$ $Version$
+ */
+public abstract class ArchiveReader implements ArchiveFileConstants, Iterable, Closeable {
+ /**
+ * Is this Archive file compressed?
+ */
+ private boolean compressed = false;
+
+ /**
+ * Should we digest as we read?
+ */
+ private boolean digest = true;
+
+ /**
+ * Should the parse be strict?
+ */
+ private boolean strict = false;
+
+ /**
+ * Archive file input stream.
+ *
+ * Keep it around so we can close it when done.
+ *
+ * <p>Set in constructor. Should support at least 1 byte mark/reset.
+ * Make it protected so subclasses have access.
+ */
+ protected InputStream in = null;
+
+ /**
+ * Maximum amount of recoverable exceptions in a row.
+ * If more than this amount in a row, we'll let out the exception rather
+ * than go back in for yet another retry.
+ */
+ public static final int MAX_ALLOWED_RECOVERABLES = 10;
+
+
+ /**
+ * The Record currently being read.
+ *
+ * Keep this ongoing reference so we'll close the record even if the caller
+ * doesn't.
+ */
+ private ArchiveRecord currentRecord = null;
+
+ /**
+ * Descriptive string for the Archive file we're going against:
+ * full path, url, etc. -- depends on context in which file was made.
+ */
+ private String identifier = null;
+
+ /**
+ * Archive file version.
+ */
+ private String version = null;
+
+
+ protected ArchiveReader() {
+ super();
+ }
+
+    /**
+     * Helper for subclass constructors: records the reader identifier.
+     *
+     * @param i Identifier for the Archive file this reader goes against.
+     */
+    protected void initialize(final String i) {
+        this.setReaderIdentifier(i);
+    }
+
+    /**
+     * Convenience method for constructors: opens a buffered stream on a file.
+     *
+     * @param f File to read.
+     * @param offset Offset at which the caller wants to start reading.
+     * NOTE: this implementation does not use it; the returned stream is
+     * positioned at the start of the file.
+     * @return Buffered InputStream over {@code f}.
+     * @throws IOException If the file cannot be opened.
+     */
+    protected InputStream getInputStream(final File f, final long offset)
+    throws IOException {
+        return new BufferedInputStream(new FileInputStream(f));
+    }
+
+    /**
+     * @return True if this Archive file is flagged as compressed.
+     */
+    public boolean isCompressed() {
+        return compressed;
+    }
+
+    /**
+     * Get record at passed offset.
+     *
+     * <p>Any record still open is cleaned up first. The stream is then
+     * advanced from its current record position to {@code offset}; seeking
+     * backwards is not supported.
+     *
+     * @param offset Byte index into file at which a record starts.
+     * @return An Archive Record reference.
+     * @throws IOException if cleanup, skipping or record creation fails; an
+     * {@link EOFException} if the stream ends before {@code offset} is reached.
+     * @throws UnsupportedOperationException if {@code offset} lies before the
+     * current position.
+     */
+    public ArchiveRecord get(long offset) throws IOException {
+        cleanupCurrentRecord();
+        long posn = positionForRecord(in);
+        if (offset < posn) {
+            throw new UnsupportedOperationException("no reverse seeking: at "+posn+" requested "+offset);
+        }
+        // InputStream.skip() may legally skip fewer bytes than requested, so
+        // keep going until the whole gap has been consumed.
+        long remaining = offset - posn;
+        while (remaining > 0) {
+            long skipped = in.skip(remaining);
+            if (skipped <= 0) {
+                // No progress: read one byte to tell end-of-stream from a stall.
+                if (in.read() < 0) {
+                    throw new EOFException("EOF before offset " + offset
+                        + " (" + remaining + " bytes short)");
+                }
+                skipped = 1;
+            }
+            remaining -= skipped;
+        }
+        return createArchiveRecord(this.in, offset);
+    }
+
+    /**
+     * Creates a record at the stream's current record position.
+     *
+     * @return Archive Record created against current offset.
+     * @throws IOException
+     */
+    public ArchiveRecord get() throws IOException {
+        final InputStream stream = this.in;
+        final long currentOffset = positionForRecord(in);
+        return createArchiveRecord(stream, currentOffset);
+    }
+
+    /**
+     * Closes the underlying stream, if any, and drops the reference to it.
+     * Calling this again after a successful close does nothing.
+     *
+     * @throws IOException if closing the stream fails (the reference is
+     * then left in place).
+     */
+    public void close() throws IOException {
+        if (this.in == null) {
+            return;
+        }
+        this.in.close();
+        this.in = null;
+    }
+
+    /**
+     * Finishes off the current record, if there is one: closes it, moves
+     * past its end-of-record marker and forgets it.
+     *
+     * @throws IOException if closing or skipping to end-of-record fails; the
+     * current record reference is then left set.
+     */
+    protected void cleanupCurrentRecord() throws IOException {
+        if (this.currentRecord == null) {
+            return;
+        }
+        this.currentRecord.close();
+        gotoEOR(this.currentRecord);
+        this.currentRecord = null;
+    }
+
+ /**
+ * Return an Archive Record homed on {@code offset} into
+ * {@code is}.
+ * Invoked by {@link #get()} and {@link #get(long)} with this reader's
+ * stream once it has been positioned.
+ * @param is Stream to read Record from.
+ * @param offset Offset to find Record at.
+ * @return ArchiveRecord instance.
+ * @throws IOException
+ */
+ protected abstract ArchiveRecord createArchiveRecord(InputStream is,
+ long offset)
+ throws IOException;
+
+ /**
+ * Skip over any trailing new lines at end of the record so we're lined up
+ * ready to read the next.
+ * Called from {@link #cleanupCurrentRecord()} right after the record has
+ * been closed.
+ * @param record Record whose end we are to move past.
+ * @throws IOException
+ */
+ protected abstract void gotoEOR(ArchiveRecord record) throws IOException;
+
+ /**
+ * @return File extension for the kind of Archive file this reader handles
+ * (presumably without the leading dot; see {@link #getDotFileExtension()}).
+ */
+ public abstract String getFileExtension();
+ /**
+ * @return Dot-prefixed file extension, e.g. '.arc' or '.warc'; used when
+ * stripping extensions off file names.
+ */
+ public abstract String getDotFileExtension();
+
+    /**
+     * @return Version of this Archive file, or null if none has been set.
+     */
+    public String getVersion() {
+        return version;
+    }
+
+    /**
+     * Validate the Archive file without knowing how many records to expect.
+     *
+     * This method iterates over the file throwing exception if it fails
+     * to successfully parse any record.
+     *
+     * <p>Assumes the stream is at the start of the file.
+     * @return List of all read Archive Headers.
+     *
+     * @throws IOException
+     */
+    public List validate() throws IOException {
+        // -1 tells validate(int) that the record count is unknown.
+        final int unknownRecordCount = -1;
+        return validate(unknownRecordCount);
+    }
+
+    /**
+     * Validate the Archive file.
+     *
+     * This method iterates over the file throwing exception if it fails
+     * to successfully parse. Strict parsing is switched on as a side effect
+     * and is left on afterwards.
+     *
+     * <p>We start validation from wherever we are in the stream.
+     *
+     * @param numRecords Number of records expected. Pass -1 if number is
+     * unknown.
+     *
+     * @return List of all read metadatas. As we validate records, we add
+     * a reference to the read metadata.
+     *
+     * @throws IOException if a record is empty with no mimetype, or if the
+     * number of records read differs from {@code numRecords}.
+     */
+    public List validate(int numRecords)
+    throws IOException {
+        List<ArchiveRecordHeader> hdrList = new ArrayList<ArchiveRecordHeader>();
+        int recordCount = 0;
+        setStrict(true);
+        for (Iterator<ArchiveRecord> i = iterator(); i.hasNext();) {
+            recordCount++;
+            ArchiveRecord r = i.next();
+            // Constant-first equals so a header without a mimetype cannot
+            // trigger a NullPointerException here.
+            if (r.getHeader().getLength() <= 0
+                && MimetypeUtils.NO_TYPE_MIMETYPE.equals(
+                    r.getHeader().getMimetype())) {
+                throw new IOException("record content is empty.");
+            }
+            r.close();
+            hdrList.add(r.getHeader());
+        }
+
+        if (numRecords != -1 && recordCount != numRecords) {
+            throw new IOException("Count of records, "
+                + Integer.toString(recordCount)
+                + " is not equal to expected "
+                + Integer.toString(numRecords));
+        }
+
+        return hdrList;
+    }
+
+    /**
+     * Test Archive file is valid by running {@link #validate()} over it.
+     * Assumes the stream is at the start of the file. Be aware that this
+     * method makes a pass over the whole file.
+     * @return True if file can be successfully parsed, false if any
+     * exception was thrown while parsing.
+     */
+    public boolean isValid() {
+        try {
+            validate();
+            return true;
+        } catch(Exception e) {
+            // File is not valid if exception thrown parsing.
+            return false;
+        }
+    }
+
+    /**
+     * @return True if parsing is strict.
+     */
+    public boolean isStrict() {
+        return strict;
+    }
+
+    /**
+     * @param s True to make parsing strict, false to make it lenient.
+     */
+    public void setStrict(boolean s) {
+        strict = s;
+    }
+
+    /**
+     * @param d True if content should be digested as it is read.
+     */
+    public void setDigest(boolean d) {
+        digest = d;
+    }
+
+    /**
+     * @return True if content is digested as it is read.
+     */
+    public boolean isDigest() {
+        return digest;
+    }
+
+ protected Logger getLogger() {
+ return Logger.getLogger(this.getClass().getName());
+ }
+
+    /**
+     * Returns an ArchiveRecord iterator.
+     * Any record still open on this reader is cleaned up before the iterator
+     * is handed out.
+     * Of note, on IOException, especially if ZipException reading compressed
+     * ARCs, rather than fail the iteration, try moving to the next record.
+     * If {@link ArchiveReader#strict} is not set, this will usually succeed.
+     * @return An iterator over ARC records.
+     * @throws RuntimeException wrapping the IOException if cleaning up the
+     * outstanding record fails.
+     */
+    public Iterator iterator() {
+        try {
+            // Eat up any record outstanding.
+            cleanupCurrentRecord();
+            return new ArchiveRecordIterator();
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+ protected void setCompressed(boolean compressed) {
+ this.compressed = compressed;
+ }
+
+    /**
+     * @return The record currently being read, or null if none.
+     * After construction has the arcfile header record.
+     * @see #get()
+     */
+    protected ArchiveRecord getCurrentRecord() {
+        return currentRecord;
+    }
+
+    /**
+     * Remembers the passed record as the one currently being read.
+     *
+     * @param r Record to track (may be null).
+     * @return The same record, for call chaining.
+     */
+    protected ArchiveRecord currentRecord(final ArchiveRecord r) {
+        return this.currentRecord = r;
+    }
+
+    /**
+     * @return The Archive file input stream, or null if unset or closed.
+     */
+    protected InputStream getIn() {
+        return this.in;
+    }
+
+ protected void setIn(InputStream in) {
+ this.in = in;
+ }
+
+ protected void setVersion(String version) {
+ this.version = version;
+ }
+
+    /**
+     * @return Descriptive string (path, url, etc.) for the Archive file this
+     * reader goes against, or null if none has been set.
+     */
+    public String getReaderIdentifier() {
+        return identifier;
+    }
+
+    /**
+     * @param i Descriptive string for the Archive file this reader goes against.
+     */
+    protected void setReaderIdentifier(final String i) {
+        identifier = i;
+    }
+
+    /**
+     * Writes a message straight to stderr, prefixed with its level.
+     * Logging should normally go via the logging system; this method
+     * bypasses it and is meant for the rare ERRORs and WARNINGs that come
+     * of cmdline usage of ARCReader.
+     * Override if using ARCReader in a context where there is no stderr or
+     * where you'd like to redirect stderr to other than System.err.
+     * @param level Level to log message at.
+     * @param message Message to log.
+     */
+    public void logStdErr(Level level, String message) {
+        final String line = level.toString() + " " + message;
+        System.err.println(line);
+    }
+
+// /**
+// * Add buffering to RandomAccessInputStream.
+// */
+// protected class RandomAccessBufferedInputStream
+// extends BufferedInputStream implements RepositionableStream {
+//
+// public RandomAccessBufferedInputStream(RandomAccessInputStream is)
+// throws IOException {
+// super(is);
+// }
+//
+// public RandomAccessBufferedInputStream(RandomAccessInputStream is, int size)
+// throws IOException {
+// super(is, size);
+// }
+//
+// public long position() throws IOException {
+// // Current position is the underlying files position
+// // minus the amount thats in the buffer yet to be read.
+// return ((RandomAccessInputStream)this.in).position() -
+// (this.count - this.pos);
+// }
+//
+// public void position(long position) throws IOException {
+// // Force refill of buffer whenever there's been a seek.
+// this.pos = 0;
+// this.count = 0;
+// ((RandomAccessInputStream)this.in).position(position);
+// }
+//
+// public int available() throws IOException {
+// // Avoid overflow on large datastreams
+// long amount = (long)in.available() + (long)(count - pos);
+// return (amount >= Integer.MAX_VALUE)? Integer.MAX_VALUE: (int)amount;
+// }
+// }
+
+ /**
+ * Inner ArchiveRecord Iterator class.
+ * Throws RuntimeExceptions in {@link #hasNext()} and {@link #next()} if
+ * trouble pulling record from underlying stream.
+ * @author stack
+ */
+ protected class ArchiveRecordIterator implements Iterator {
+ private final Logger logger =
+ Logger.getLogger(this.getClass().getName());
+ /**
+ * Cleans up any record still open, then probes the stream for more data.
+ * If cleanup fails and the reader is not strict, a premature EOF is
+ * logged and reported as "no more records"; any other IOException is
+ * logged and the stream is probed anyway.
+ * @return True if we have more records to read.
+ * @exception RuntimeException In strict mode, wraps the IOException
+ * raised while cleaning up the current record (Corrupted
+ * gzip, etc.).
+ */
+ public boolean hasNext() {
+ // Call close on any extant record. This will scoot us past
+ // any content not yet read.
+ try {
+ cleanupCurrentRecord();
+ } catch (IOException e) {
+ if (isStrict()) {
+ throw new RuntimeException(e);
+ }
+ if (e instanceof EOFException) {
+ logger.warning("Premature EOF cleaning up " +
+ currentRecord.getHeader().toString() + ": " +
+ e.getMessage());
+ return false;
+ }
+ // If not strict, try going again. We might be able to skip
+ // over the bad record.
+ logger.log(Level.WARNING,"Trying skip of failed record cleanup of " +
+ currentRecord.getHeader().toString() + ": " +
+ e.getMessage(), e);
+ }
+ return innerHasNext();
+ }
+
+ protected boolean innerHasNext(){
+ try {
+ getIn().mark(1);
+ int c = getIn().read();
+ getIn().reset();
+ return c > -1;
+ } catch (IOException e) {
+ logger.log(Level.WARNING,"problem probing for more content",e);
+ return false;
+ }
+ }
+
+ /**
+ * Tries to move to next record if we get
+ * {@link RecoverableIOException}. If not strict
+ * tries to move to next record if we get an
+ * {@link IOException}: one further attempt is made, and only if
+ * {@link #hasNext()} reports more content.
+ * @return Next object.
+ * @exception RuntimeException Throws a runtime exception,
+ * usually a wrapping of an IOException, if trouble getting
+ * a record (Throws exception rather than return null).
+ */
+ public ArchiveRecord next() {
+ // Remembered up front so error messages can name where the record began.
+ long offset = -1;
+ try {
+ offset = positionForRecord(getIn());
+ return exceptionNext();
+ } catch (IOException e) {
+ if (!isStrict()) {
+ // Retry though an IOE. Maybe we will succeed reading
+ // subsequent record.
+ try {
+ if (hasNext()) {
+ getLogger().warning("Bad Record. Trying skip " +
+ "(Record start " + offset + "): " +
+ e.getMessage());
+ return exceptionNext();
+ }
+ // Else we are at last record. Iterator#next is
+ // expecting value. We do not have one. Throw exception.
+ throw new RuntimeException("Retried but no next " +
+ "record (Record start " + offset + ")", e);
+ } catch (IOException e1) {
+ throw new RuntimeException("After retry (Offset " +
+ offset + ")", e1);
+ }
+ }
+ // Strict mode: no retry, surface the failure immediately.
+ throw new RuntimeException("(Record start " + offset + ")", e);
+ }
+ }
+
+ /**
+ * A next that throws exceptions and has handling of
+ * recoverable exceptions moving us to next record. Can call
+ * hasNext which itself may throw exceptions.
+ * Makes at most {@link ArchiveReader#MAX_ALLOWED_RECOVERABLES} attempts.
+ * @return Next record.
+ * @throws IOException Any non-recoverable IOException from reading.
+ * @throws RuntimeException Thrown when we've reached maximum
+ * retries, or when a recoverable failure is followed by no further
+ * records.
+ */
+ protected ArchiveRecord exceptionNext()
+ throws IOException, RuntimeException {
+ ArchiveRecord result = null;
+ IOException ioe = null;
+ for (int i = MAX_ALLOWED_RECOVERABLES; i > 0 &&
+ result == null; i--) {
+ // Forget the failure from the previous attempt.
+ ioe = null;
+ try {
+ result = innerNext();
+ } catch (RecoverableIOException e) {
+ ioe = e;
+ getLogger().warning(e.getMessage());
+ if (hasNext()) {
+ continue;
+ }
+ // No records left. Throw exception rather than
+ // return null. The caller is expecting to get
+ // back a record since they've just called
+ // hasNext.
+ break;
+ }
+ }
+ if (ioe != null) {
+ // Either we used up all MAX_ALLOWED_RECOVERABLES attempts or
+ // no records were left after a recoverable failure. Throw
+ // the recoverable ioe wrapped in a RuntimeException so
+ // it goes out pass checks for IOE.
+ throw new RuntimeException("Retried " +
+ MAX_ALLOWED_RECOVERABLES + " times in a row", ioe);
+ }
+ return result;
+ }
+
+ protected ArchiveRecord innerNext() throws IOException {
+ return get(positionForRecord(getIn()));
+ }
+
+ /**
+ * Not supported.
+ * @throws UnsupportedOperationException always.
+ */
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
+
+    /**
+     * Works out the offset to attribute to a record read from the stream.
+     *
+     * @param in Either a GZIPMembersInputStream (the current gzip member
+     * start is used) or a CountingInputStream (the byte count is used).
+     * @return Record position for the passed stream.
+     * @throws ClassCastException if {@code in} is neither of the two types.
+     */
+    protected static long positionForRecord(InputStream in) {
+        if (in instanceof GZIPMembersInputStream) {
+            return ((GZIPMembersInputStream)in).getCurrentMemberStart();
+        }
+        return ((CountingInputStream)in).getCount();
+    }
+
+ protected static String stripExtension(final String name,
+ final String ext) {
+ return (!name.endsWith(ext))? name:
+ name.substring(0, name.length() - ext.length());
+ }
+
+    /**
+     * @return short name of Archive file: the last path element of the
+     * reader identifier.
+     */
+    public String getFileName() {
+        final File identified = new File(getReaderIdentifier());
+        return identified.getName();
+    }
+
+    /**
+     * @return short name of Archive file with the compression extension and
+     * this reader's dot file extension removed.
+     */
+    public String getStrippedFileName() {
+        final String shortName = getFileName();
+        final String dotExtension = getDotFileExtension();
+        return getStrippedFileName(shortName, dotExtension);
+    }
+
+    /**
+     * Strips first the compressed-file extension, then the passed archive
+     * extension, from a file name.
+     *
+     * @param name Name of ARCFile.
+     * @param dotFileExtension '.arc' or '.warc', etc.
+     * @return short name of Archive file.
+     */
+    public static String getStrippedFileName(String name,
+            final String dotFileExtension) {
+        final String uncompressedName = stripExtension(name,
+            ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION);
+        return stripExtension(uncompressedName, dotFileExtension);
+    }
+
+ /**
+ * @param value Value to test.
+ * @return True if value is 'true', else false.
+ */
+ protected static boolean getTrueOrFalse(final String value) {
+ if (value == null || value.length() <= 0) {
+ return false;
+ }
+ return Boolean.TRUE.toString().equals(value.toLowerCase());
+ }
+
+    /**
+     * Outputs the whole file in the requested format.
+     * Dump formats switch digesting off first since there is no point
+     * digesting when dumping; the CDX formats write a pseudo-CDX listing to
+     * stdout or to a file. See
+     * http://www.archive.org/web/researcher/cdx_legend.php
+     * and http://www.archive.org/web/researcher/example_cdx.php.
+     *
+     * @param format Format to use outputting.
+     * @throws IOException
+     * @throws java.text.ParseException
+     * @return True if handled, false if the format is not recognised.
+     */
+    protected boolean output(final String format)
+    throws IOException, java.text.ParseException {
+        if (format.equals(DUMP)) {
+            // No point digesting dumping.
+            setDigest(false);
+            dump(false);
+            return true;
+        }
+        if (format.equals(GZIP_DUMP)) {
+            // No point digesting dumping.
+            setDigest(false);
+            dump(true);
+            return true;
+        }
+        if (format.equals(CDX)) {
+            cdxOutput(false);
+            return true;
+        }
+        if (format.equals(CDX_FILE)) {
+            cdxOutput(true);
+            return true;
+        }
+        return false;
+    }
+
+    /**
+     * Writes a pseudo-CDX listing of every record in this file: a header
+     * line followed by one line per record.
+     *
+     * @param toFile True to write to a file named after the reader
+     * identifier (compression and archive extensions replaced by the CDX
+     * extension); false to write to stdout.
+     * @throws IOException if the CDX file cannot be created or written.
+     */
+    protected void cdxOutput(boolean toFile)
+    throws IOException {
+        BufferedWriter cdxWriter = null;
+        if (toFile) {
+            String cdxFilename = stripExtension(getReaderIdentifier(),
+                DOT_COMPRESSED_FILE_EXTENSION);
+            cdxFilename = stripExtension(cdxFilename, getDotFileExtension());
+            cdxFilename += ('.' + CDX);
+            cdxWriter = new BufferedWriter(new FileWriter(cdxFilename));
+        }
+
+        // Everything after the writer is opened runs inside the try so the
+        // file is closed even if writing the header line fails.
+        try {
+            String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v")
+                + " n g";
+            if (toFile) {
+                cdxWriter.write(header);
+                cdxWriter.newLine();
+            } else {
+                System.out.println(header);
+            }
+
+            String strippedFileName = getStrippedFileName();
+            for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
+                ArchiveRecord r = ii.next();
+                if (toFile) {
+                    cdxWriter.write(r.outputCdx(strippedFileName));
+                    cdxWriter.newLine();
+                } else {
+                    System.out.println(r.outputCdx(strippedFileName));
+                }
+            }
+        } finally {
+            if (toFile) {
+                cdxWriter.close();
+            }
+        }
+    }
+
+    /**
+     * Outputs the record at the current position using the passed format
+     * specifier. Only the CDX and dump formats are handled for a single
+     * record.
+     * @param format What format to use outputting.
+     * @throws IOException
+     * @return True if handled.
+     */
+    public boolean outputRecord(final String format)
+    throws IOException {
+        if (format.equals(CDX)) {
+            System.out.println(get().outputCdx(getStrippedFileName()));
+            return true;
+        }
+        if (format.equals(ArchiveFileConstants.DUMP)) {
+            // No point digesting if dumping content.
+            setDigest(false);
+            get().dump();
+            return true;
+        }
+        return false;
+    }
+
+ /**
+ * Dump this file on STDOUT
+ * @param compress True if dumped output is compressed.
+ * @throws IOException
+ * @throws java.text.ParseException
+ */
+ public abstract void dump(final boolean compress)
+ throws IOException, java.text.ParseException;
+
+ /**
+ * @param f Local file the returned reader is to delete when closed
+ * (presumably the file this reader was opened on; confirm in subclasses).
+ * @return an ArchiveReader that will delete a local file on close. Used
+ * when we bring Archive files local and need to clean up afterward.
+ */
+ public abstract ArchiveReader getDeleteFileOnCloseReader(final File f);
+
+ /**
+ * Output passed record using passed format specifier.
+ * @param r ARCReader instance to output.
+ * @param format What format to use outputting.
+ * @throws IOException
+ */
+ protected static void outputRecord(final ArchiveReader r,
+ final String format)
+ throws IOException {
+ if (!r.outputRecord(format)) {
+ throw new IOException("Unsupported format" +
+ " (or unsupported on a single record): " + format);
+ }
+ }
+
+    /**
+     * Builds the command-line options shared by Archive readers: help,
+     * offset, digest, strict and format.
+     * @return Base Options object filled out with help, digest, strict, etc.
+     * options.
+     */
+    protected static Options getOptions() {
+        Options options = new Options();
+        options.addOption(new Option("h","help", false,
+            "Prints this message and exits."));
+        options.addOption(new Option("o","offset", true,
+            "Outputs record at this offset into file."));
+        options.addOption(new Option("d","digest", true,
+            "Pass true|false. Expensive. Default: true (SHA-1)."));
+        options.addOption(new Option("s","strict", false,
+            "Strict mode. Fails parse if incorrectly formatted file."));
+        // Help text previously had unbalanced quotes around the format names.
+        options.addOption(new Option("f","format", true,
+            "Output options: 'cdx', 'cdxfile', 'dump', 'gzipdump', " +
+            "or 'nohead'. Default: 'cdx'."));
+        return options;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/ArchiveReaderFactory.java b/src/main/java/org/archive/io/ArchiveReaderFactory.java
new file mode 100644
index 00000000..17f14d3a
--- /dev/null
+++ b/src/main/java/org/archive/io/ArchiveReaderFactory.java
@@ -0,0 +1,301 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLConnection;
+
+import org.archive.io.arc.ARCReaderFactory;
+import org.archive.io.warc.WARCReaderFactory;
+import org.archive.net.md5.Md5URLConnection;
+import org.archive.net.rsync.RsyncURLConnection;
+import org.archive.url.UsableURI;
+import org.archive.util.FileUtils;
+
+
+/**
+ * Factory that returns an Archive file Reader.
+ * Returns Readers for ARCs or WARCs.
+ * @author stack
+ * @version $Date$ $Revision$
+ */
+public class ArchiveReaderFactory implements ArchiveFileConstants {
+ // Static block to enable S3 URLs
+ static {
+ if (System.getProperty("java.protocol.handler.pkgs") != null) {
+ System.setProperty("java.protocol.handler.pkgs",
+ System.getProperty("java.protocol.handler.pkgs")
+ + "|" + "org.archive.net");
+ } else {
+ System.setProperty("java.protocol.handler.pkgs", "org.archive.net");
+ }
+ }
+
+ private static final ArchiveReaderFactory factory =
+ new ArchiveReaderFactory();
+
+ /**
+ * Shutdown any public access to default constructor.
+ */
+ protected ArchiveReaderFactory() {
+ super();
+ }
+
+    /**
+     * Get an Archive file Reader on passed path or url.
+     * Does primitive heuristic figuring if path or URL.
+     * @param arcFileOrUrl File path or URL pointing at an Archive file.
+     * @return An Archive file Reader.
+     * @throws MalformedURLException
+     * @throws IOException
+     */
+    public static ArchiveReader get(final String arcFileOrUrl)
+    throws MalformedURLException, IOException {
+        return factory.getArchiveReader(arcFileOrUrl);
+    }
+
+ protected ArchiveReader getArchiveReader(final String arcFileOrUrl)
+ throws MalformedURLException, IOException {
+ return getArchiveReader(arcFileOrUrl, 0);
+ }
+
+ protected ArchiveReader getArchiveReader(final String arcFileOrUrl,
+ final long offset)
+ throws MalformedURLException, IOException {
+ return UsableURI.hasScheme(arcFileOrUrl) && arcFileOrUrl.indexOf(":")>1?
+ get(new URL(arcFileOrUrl), offset):
+ get(new File(arcFileOrUrl), offset);
+ }
+
+    /**
+     * Opens a reader on a local Archive file.
+     * @param f An Archive file to read.
+     * @return An ArchiveReader
+     * @throws IOException
+     */
+    public static ArchiveReader get(final File f) throws IOException {
+        return factory.getArchiveReader(f);
+    }
+
+ protected ArchiveReader getArchiveReader(final File f)
+ throws IOException {
+ return getArchiveReader(f, 0);
+ }
+
+    /**
+     * Opens a reader on a local Archive file at the given offset.
+     * @param f An Archive file to read.
+     * @param offset Have returned Reader set to start reading at this offset.
+     * @return An ArchiveReader
+     * @throws IOException
+     */
+    public static ArchiveReader get(final File f, final long offset)
+    throws IOException {
+        return factory.getArchiveReader(f, offset);
+    }
+
+ protected ArchiveReader getArchiveReader(final File f,
+ final long offset)
+ throws IOException {
+ if (ARCReaderFactory.isARCSuffix(f.getName())) {
+ return ARCReaderFactory.get(f, true, offset);
+ } else if (WARCReaderFactory.isWARCSuffix(f.getName())) {
+ return WARCReaderFactory.get(f, offset);
+ }
+ throw new IOException("Unknown file extension (Not ARC nor WARC): "
+ + f.getName());
+ }
+
+    /**
+     * Wrap a Reader around passed Stream.
+     * @param s Identifying String for this Stream used in error messages.
+     * Must be a string that ends with the name of the file we're to put
+     * an ArchiveReader on. This code looks at file endings to figure
+     * whether to return an ARC or WARC reader.
+     * @param is Stream to read from; it is handed on to the ARC or WARC
+     * reader factory.
+     * @param atFirstRecord Are we at first Record?
+     * @return ArchiveReader.
+     * @throws IOException
+     */
+    public static ArchiveReader get(final String s, final InputStream is,
+            final boolean atFirstRecord)
+    throws IOException {
+        return factory.getArchiveReader(s, is, atFirstRecord);
+    }
+
+    /**
+     * Picks the ARC or WARC reader factory by looking at the identifier's
+     * ending and wraps the stream with the matching reader.
+     * @param id Identifying String ending in the Archive file's name.
+     * @param is Stream to read from.
+     * @param atFirstRecord Are we at first Record?
+     * @return ArchiveReader.
+     * @throws IOException if the identifier has neither an ARC nor a WARC
+     * suffix, or if the reader cannot be created.
+     */
+    protected ArchiveReader getArchiveReader(final String id,
+            final InputStream is, final boolean atFirstRecord)
+    throws IOException {
+        if (ARCReaderFactory.isARCSuffix(id)) {
+            return ARCReaderFactory.get(id, is, atFirstRecord);
+        }
+        if (WARCReaderFactory.isWARCSuffix(id)) {
+            return WARCReaderFactory.get(id, is, atFirstRecord);
+        }
+        throw new IOException("Unknown extension (Not ARC nor WARC): " + id);
+    }
+
+    /**
+     * Get an Archive Reader aligned at offset.
+     * This version of get will not bring the file local but will try to
+     * stream across the net making an HTTP 1.1 Range request on remote
+     * http server (RFC 2616 Section 14.35).
+     * @param u HTTP URL for an Archive file.
+     * @param offset Offset into file at which to start fetching.
+     * @return An ArchiveReader aligned at offset.
+     * @throws IOException
+     */
+    public static ArchiveReader get(final URL u, final long offset)
+    throws IOException {
+        return factory.getArchiveReader(u, offset);
+    }
+
+ protected ArchiveReader getArchiveReader(final URL f, final long offset)
+ throws IOException {
+ // Get URL connection.
+ URLConnection connection = f.openConnection();
+ if (connection instanceof HttpURLConnection) {
+ addUserAgent((HttpURLConnection)connection);
+ }
+ if (offset != 0) {
+ // Use a Range request (Assumes HTTP 1.1 on other end). If
+ // length >= 0, add open-ended range header to the request. Else,
+ // because end-byte is inclusive, subtract 1.
+ connection.addRequestProperty("Range", "bytes=" + offset + "-");
+ // TODO: should actually verify that server respected 'Range' request
+ // (spec allows them to ignore; 206 response or Content-Range header
+ // should be present if Range satisfied; multipart/byteranges could be
+ // a problem).
+ }
+
+ return getArchiveReader(f.toString(), connection.getInputStream(), (offset == 0));
+ }
+
+    /**
+     * Get an ARCReader for a URL.
+     * If the URL's path names an existing local file, that file is read
+     * directly. http and s3 URLs are streamed. Anything else is pulled local
+     * first and handed back as a reader that removes the local copy on close.
+     * @param u An URL that points at an ARC.
+     * @return An ARCReader.
+     * @throws IOException
+     */
+    public static ArchiveReader get(final URL u)
+    throws IOException {
+        return factory.getArchiveReader(u);
+    }
+
+ protected ArchiveReader getArchiveReader(final URL u)
+ throws IOException {
+ // If url represents a local file then return file it points to.
+ if (u.getPath() != null) {
+ // TODO: Add scheme check and host check.
+ File f = new File(u.getPath());
+ if (f.exists()) {
+ return get(f, 0);
+ }
+ }
+
+ String scheme = u.getProtocol();
+ if (scheme.startsWith("http") || scheme.equals("s3")) {
+ // Try streaming if http or s3 URLs rather than copying local
+ // and then reading (Passing an offset will get us an Reader
+ // that wraps a Stream).
+ return get(u, 0);
+ }
+
+ return makeARCLocal(u.openConnection());
+ }
+
+    /**
+     * Makes the resource behind the passed connection available as a local
+     * file and opens a reader on it.
+     * HTTP resources are downloaded into a temporary file; rsync and md5
+     * connections produce their own local file on connect.
+     * @param connection Connection to the remote Archive file.
+     * @return A reader that deletes the local file when closed.
+     * @throws IOException if the download or opening the reader fails; the
+     * local file is deleted in that case.
+     * @throws UnsupportedOperationException for any other connection type.
+     */
+    protected ArchiveReader makeARCLocal(final URLConnection connection)
+    throws IOException {
+        File localFile = null;
+        if (connection instanceof HttpURLConnection) {
+            // If http url connection, bring down the resource local.
+            String p = connection.getURL().getPath();
+            int index = p.lastIndexOf('/');
+            if (index >= 0) {
+                // Name file for the file we're making local: the remote name
+                // is kept as the suffix so the ARC/WARC ending survives.
+                // File.createTempFile rejects prefixes shorter than three
+                // characters, so an empty prefix cannot be used here.
+                localFile = File.createTempFile(ArchiveReader.class.getName(),
+                    p.substring(index + 1));
+                if (localFile.exists()) {
+                    // createTempFile leaves an empty file behind; remove it
+                    // so the download starts from a clean slate.
+                    localFile.delete();
+                }
+            } else {
+                localFile = File.createTempFile(ArchiveReader.class.getName(),
+                    ".tmp");
+            }
+            addUserAgent((HttpURLConnection)connection);
+            connection.connect();
+            try {
+                FileUtils.readFullyToFile(connection.getInputStream(), localFile);
+            } catch (IOException ioe) {
+                localFile.delete();
+                throw ioe;
+            }
+        } else if (connection instanceof RsyncURLConnection) {
+            // Then, connect and this will create a local file.
+            // See implementation of the rsync handler.
+            connection.connect();
+            localFile = ((RsyncURLConnection)connection).getFile();
+        } else if (connection instanceof Md5URLConnection) {
+            // Then, connect and this will create a local file.
+            // See implementation of the md5 handler.
+            connection.connect();
+            localFile = ((Md5URLConnection)connection).getFile();
+        } else {
+            throw new UnsupportedOperationException("No support for " +
+                connection);
+        }
+
+        ArchiveReader reader = null;
+        try {
+            reader = get(localFile, 0);
+        } catch (IOException e) {
+            localFile.delete();
+            throw e;
+        }
+
+        // Return a delegate that does cleanup of downloaded file on close.
+        return reader.getDeleteFileOnCloseReader(localFile);
+    }
+
+ protected void addUserAgent(final HttpURLConnection connection) {
+ connection.addRequestProperty("User-Agent", this.getClass().getName());
+ }
+
+ /**
+ * @param f File to test.
+ * @return True if f is compressed.
+ * @throws IOException
+ */
+ protected boolean isCompressed(final File f) throws IOException {
+ return f.getName().toLowerCase().
+ endsWith(DOT_COMPRESSED_FILE_EXTENSION);
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/ArchiveRecord.java b/src/main/java/org/archive/io/ArchiveRecord.java
new file mode 100644
index 00000000..63bfe628
--- /dev/null
+++ b/src/main/java/org/archive/io/ArchiveRecord.java
@@ -0,0 +1,409 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.logging.Level;
+
+import org.archive.util.Base32;
+
+/**
+ * Archive file Record.
+ * @author stack
+ * @version $Date$ $Version$
+ */
+public abstract class ArchiveRecord extends InputStream {
+
+ /**
+ * Minimal http response or request header length.
+ *
+ * I've seen in arcs content length of 1 with no header.
+ */
+ protected static final long MIN_HTTP_HEADER_LENGTH =
+ Math.min("HTTP/1.1 200 OK\r\n".length(), "GET / HTTP/1.0\n\r".length());
+
+ protected ArchiveRecordHeader header = null;
+
+ /**
+ * Stream to read this record from.
+ *
+ * Stream can only be read sequentially. Will only return this records'
+ * content returning a -1 if you try to read beyond the end of the current
+ * record.
+ *
+ *
+ * <p>Streams can be markable or not. If they are, we'll be able to roll
+ * back when we've read too far. If not markable, the assumption is that
+ * the underlying stream keeps us from reading too much. (This pertains
+ * to skipping over the end of the ARCRecord. See {@link #skip()}.)
+ */
+ protected InputStream in = null;
+
+ /**
+ * Position w/i the Record content, within in.
+ * This position is relative within this Record. Its not same as the
+ * Archive file position.
+ */
+ protected long position = 0;
+
+ /**
+ * Set flag when we've reached the end-of-record.
+ */
+ protected boolean eor = false;
+
+ /**
+ * Compute digest on what we read and add to metadata when done.
+ *
+ * Currently hardcoded as sha-1. TODO: Remove when archive records
+ * digest or else, add a facility that allows the arc reader to
+ * compare the calculated digest to that which is recorded in
+ * the arc.
+ *
+ *
+ * <p>Protected instead of private so subclasses can update and complete
+ * the digest.
+ */
+ protected MessageDigest digest = null;
+ private String digestStr = null;
+
+ protected boolean strict = false;
+
+
+ /**
+ * Constructor.
+ *
+ * Delegates to the five-argument constructor with a null header, a body
+ * offset of 0, digesting switched on and strict parsing switched off.
+ *
+ * @param in Stream cue'd up to be at the start of the record this instance
+ * is to represent.
+ * @throws IOException If the delegate constructor fails.
+ */
+ public ArchiveRecord(InputStream in)
+ throws IOException {
+ this(in, null, 0, true, false);
+ }
+
+ /**
+ * Constructor.
+ *
+ * Delegates to the five-argument constructor with a body offset of 0,
+ * digesting switched on and strict parsing switched off.
+ *
+ * @param in Stream cue'd up to be at the start of the record this instance
+ * is to represent.
+ * @param header Header data.
+ * @throws IOException If the delegate constructor fails.
+ */
+ public ArchiveRecord(InputStream in, ArchiveRecordHeader header)
+ throws IOException {
+ this(in, header, 0, true, false);
+ }
+
+ /**
+  * Constructor.
+  *
+  * @param in Stream cue'd up to be at the start of the record this instance
+  * is to represent.
+  * @param header Header data.
+  * @param bodyOffset Offset into the body. Usually 0. Becomes the initial
+  * position within the record.
+  * @param digest True if we're to calculate digest for this record. Not
+  * digesting saves about ~15% of cpu during an ARC parse.
+  * @param strict Be strict parsing (Parsing stops if ARC improperly
+  * formatted).
+  * @throws IOException If no SHA1 MessageDigest implementation is
+  * available; the NoSuchAlgorithmException is attached as the cause.
+  */
+ public ArchiveRecord(InputStream in, ArchiveRecordHeader header,
+         int bodyOffset, boolean digest, boolean strict)
+ throws IOException {
+     this.in = in;
+     this.header = header;
+     this.position = bodyOffset;
+     if (digest) {
+         try {
+             this.digest = MessageDigest.getInstance("SHA1");
+         } catch (NoSuchAlgorithmException e) {
+             // Convert to IOE because thats more amenable to callers
+             // -- they are dealing with it anyways. Keep the original
+             // exception as the cause so its stack trace is not lost.
+             throw new IOException(e.getMessage(), e);
+         }
+     }
+     this.strict = strict;
+ }
+
+ /**
+  * @return Always false: this stream does not advertise mark/reset.
+  */
+ @Override
+ public boolean markSupported() {
+     return false;
+ }
+
+ /**
+  * @return The header currently attached to this record (may be null if
+  * none was supplied).
+  */
+ public ArchiveRecordHeader getHeader() {
+     return header;
+ }
+
+ /**
+  * Replaces the header attached to this record.
+  *
+  * @param header New header data.
+  */
+ protected void setHeader(final ArchiveRecordHeader header) {
+     this.header = header;
+ }
+
+ /**
+  * Closing a record reads past whatever is left of it, so the stream ends
+  * up cue'd at the next record.
+  *
+  * The underlying stream is not closed (it is probably being used by the
+  * next record); only this record's reference to it is dropped. When a
+  * digest is being computed it is finalized here and stored Base32 encoded.
+  * Calling close a second time does nothing.
+  *
+  * @throws IOException If skipping over the rest of the record fails.
+  */
+ public void close() throws IOException {
+     if (this.in == null) {
+         // Already closed.
+         return;
+     }
+     skip();
+     this.in = null;
+     if (this.digest == null) {
+         return;
+     }
+     this.digestStr = Base32.encode(this.digest.digest());
+ }
+
+ /**
+  * Reads a single byte of record content, feeding it to the digest (if
+  * any) and advancing the position.
+  *
+  * @return Next character in this Record content else -1 if at EOR.
+  * @throws IOException If the underlying stream ends while the record
+  * still claims to have content left.
+  */
+ public int read() throws IOException {
+     if (available() <= 0) {
+         return -1;
+     }
+     final int next = this.in.read();
+     if (next == -1) {
+         throw new IOException("Premature EOF before end-of-record.");
+     }
+     if (this.digest != null) {
+         this.digest.update((byte) next);
+     }
+     incrementPosition();
+     return next;
+ }
+
+ /**
+  * Reads up to length bytes of record content into b, never reading beyond
+  * the end of this record as reported by {@link #available()}.
+  *
+  * Bytes read are fed to the digest (if any) and advance the position.
+  *
+  * @param b Buffer to read into.
+  * @param offset Index in b at which to start storing bytes.
+  * @param length Maximum number of bytes to read.
+  * @return Count of bytes read, or -1 if nothing was requested, the record
+  * is exhausted, or (non-strict mode only) the underlying stream ended
+  * early.
+  * @throws IOException If in strict mode and the underlying stream ends
+  * before the end of the record.
+  */
+ public int read(byte[] b, int offset, int length) throws IOException {
+     final int toRead = Math.min(length, available());
+     if (toRead <= 0) {
+         // Covers an exhausted record, a zero-length request and a negative
+         // available() (position past the stated length), which must not be
+         // handed to the underlying stream as a negative length.
+         return -1;
+     }
+     final int read = this.in.read(b, offset, toRead);
+     if (read == -1) {
+         String msg = "Premature EOF before end-of-record: "
+             + getHeader().getHeaderFields();
+         if (isStrict()) {
+             throw new IOException(msg);
+         }
+         setEor(true);
+         System.err.println(Level.WARNING.toString() + " " + msg);
+         // Nothing was consumed: leave position untouched rather than
+         // moving it backwards by adding the -1 EOF marker to it.
+         return -1;
+     }
+     if (this.digest != null) {
+         this.digest.update(b, offset, read);
+     }
+     incrementPosition(read);
+     return read;
+ }
+
+ /**
+  * This available is not the stream's available. Its an available based on
+  * what the stated Archive record length is minus what we've read to date.
+  *
+  * @return Count of bytes remaining in the record content, clamped to the
+  * range 0 to Integer.MAX_VALUE.
+  */
+ public int available() {
+     final long remaining = getHeader().getLength() - getPosition();
+     if (remaining <= 0) {
+         // Position at or past the stated length: report nothing left
+         // instead of a negative (or wrapped-around) count.
+         return 0;
+     }
+     return remaining > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) remaining;
+ }
+
+ /**
+  * Skip over whatever remains of this record's content.
+  *
+  * Does nothing once end-of-record has been flagged.
+  *
+  * @throws IOException If reading the remainder fails.
+  */
+ protected void skip() throws IOException {
+     if (this.eor) {
+         return;
+     }
+
+     // Read to the end of the body of the record. Exhaust the stream.
+     // Can't skip direct to end because underlying stream may be compressed
+     // and we're calculating the digest for the record.
+     for (int left = available(); left > 0 && !this.eor; left = available()) {
+         skip(left);
+     }
+ }
+
+ /**
+  * Skips up to n bytes of record content by reading them through
+  * {@link #read(byte[], int, int)}, so position and digest stay in step.
+  *
+  * @param n Number of bytes to skip; values of zero or less skip nothing.
+  * @return Number of bytes actually skipped, which is less than n when the
+  * record ends first.
+  * @throws IOException If a read fails.
+  */
+ public long skip(long n) throws IOException {
+     final int SKIP_BUFFERSIZE = 1024 * 4;
+     byte[] b = new byte[SKIP_BUFFERSIZE];
+     long total = 0;
+     while (total < n) {
+         // Take the minimum as a long BEFORE narrowing to int. Narrowing
+         // first lets a large n wrap to zero or a negative chunk size,
+         // which ends the skip early.
+         final int chunk = (int) Math.min((long) SKIP_BUFFERSIZE, n - total);
+         // TODO: Interesting is that reading from compressed stream, we only
+         // read about 500 characters at a time though we ask for 4k.
+         // Look at this sometime.
+         final int read = read(b, 0, chunk);
+         if (read <= 0) {
+             break;
+         }
+         total += read;
+     }
+     return total;
+ }
+
+ /**
+  * @return True if this record is in strict parsing mode.
+  */
+ public boolean isStrict() {
+     return strict;
+ }
+
+ /**
+  * Switches strict parsing on or off.
+  *
+  * @param strict New value of the strict flag.
+  */
+ public void setStrict(final boolean strict) {
+     this.strict = strict;
+ }
+
+ /**
+  * @return The stream this record reads from; null once the record has
+  * been closed.
+  */
+ protected InputStream getIn() {
+     return in;
+ }
+
+ /**
+  * @return Base32 encoding of the record digest; null until close() has
+  * finalized a digest.
+  */
+ public String getDigestStr() {
+     return digestStr;
+ }
+
+ /**
+  * Moves the position within the record forward by one byte.
+  */
+ protected void incrementPosition() {
+     this.position += 1;
+ }
+
+ /**
+  * Moves the position within the record by the given amount.
+  *
+  * @param incr Number of bytes to add to the position.
+  */
+ protected void incrementPosition(final long incr) {
+     this.position = this.position + incr;
+ }
+
+ /**
+  * @return Current position relative to this record.
+  */
+ public long getPosition() {
+     return position;
+ }
+
+ /**
+  * @return True once end-of-record has been flagged.
+  */
+ protected boolean isEor() {
+     return this.eor;
+ }
+
+ /**
+  * Sets or clears the end-of-record flag.
+  *
+  * @param eor New value of the flag.
+  */
+ protected void setEor(final boolean eor) {
+     this.eor = eor;
+ }
+
+ /**
+ * Status code column for a CDX line.
+ *
+ * @param h Header of the record being described (unused here).
+ * @return Always "-" in this base implementation.
+ */
+ protected String getStatusCode4Cdx(final ArchiveRecordHeader h) {
+ return "-";
+ }
+
+ /**
+ * IP column for a CDX line.
+ *
+ * @param h Header of the record being described (unused here).
+ * @return Always "-" in this base implementation.
+ */
+ protected String getIp4Cdx(final ArchiveRecordHeader h) {
+ return "-";
+ }
+
+ /**
+  * Digest column for a CDX line.
+  *
+  * @param h Header of the record being described (unused here).
+  * @return The digest string, or "-" when no digest is available.
+  */
+ protected String getDigest4Cdx(final ArchiveRecordHeader h) {
+     final String digestString = getDigestStr();
+     if (digestString == null) {
+         return "-";
+     }
+     return digestString;
+ }
+
+ /**
+  * Mimetype column for a CDX line.
+  *
+  * @param h Header to take the mimetype from.
+  * @return Whatever h reports as its mimetype.
+  */
+ protected String getMimetype4Cdx(final ArchiveRecordHeader h) {
+     final String mimetype = h.getMimetype();
+     return mimetype;
+ }
+
+ /**
+  * Builds one CDX line for this record: date, ip, url, mimetype, status
+  * code, digest, offset, length and file name, separated by
+  * ArchiveFileConstants.SINGLE_SPACE.
+  *
+  * The record is closed first so that the digest is complete.
+  *
+  * @param strippedFileName File name for the last column; '-' is written
+  * when null.
+  * @return The assembled CDX line.
+  * @throws IOException If closing the record fails.
+  */
+ protected String outputCdx(final String strippedFileName)
+ throws IOException {
+     // Read the whole record so we get out a hash. Should be safe calling
+     // close on already closed Record.
+     close();
+     final ArchiveRecordHeader h = getHeader();
+     final StringBuilder line =
+         new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE);
+     line.append(h.getDate())
+         .append(ArchiveFileConstants.SINGLE_SPACE)
+         .append(getIp4Cdx(h))
+         .append(ArchiveFileConstants.SINGLE_SPACE)
+         .append(h.getUrl())
+         .append(ArchiveFileConstants.SINGLE_SPACE)
+         .append(getMimetype4Cdx(h))
+         .append(ArchiveFileConstants.SINGLE_SPACE)
+         .append(getStatusCode4Cdx(h))
+         .append(ArchiveFileConstants.SINGLE_SPACE)
+         .append(getDigest4Cdx(h))
+         .append(ArchiveFileConstants.SINGLE_SPACE)
+         .append(h.getOffset())
+         .append(ArchiveFileConstants.SINGLE_SPACE)
+         .append(h.getLength())
+         .append(ArchiveFileConstants.SINGLE_SPACE);
+     if (strippedFileName != null) {
+         line.append(strippedFileName);
+     } else {
+         line.append('-');
+     }
+     return line.toString();
+ }
+
+ /**
+  * Writes the remaining record content to STDOUT.
+  *
+  * @throws IOException If reading the record or writing to STDOUT fails.
+  */
+ public void dump()
+ throws IOException {
+     final OutputStream stdout = System.out;
+     dump(stdout);
+ }
+
+ /**
+  * Copies the remaining record content to the passed stream in 16k chunks
+  * and flushes it. The passed stream is not closed.
+  *
+  * @param os Stream to write the record content to.
+  * @throws IOException If reading the record or writing to os fails.
+  */
+ public void dump(final OutputStream os)
+ throws IOException {
+     final byte[] chunk = new byte[16 * 1024];
+     for (int n = read(chunk, 0, chunk.length); n != -1;
+             n = read(chunk, 0, chunk.length)) {
+         os.write(chunk, 0, n);
+     }
+     os.flush();
+ }
+
+ /**
+  * Is it likely that this record contains headers?
+  *
+  * The answer is a heuristic: the record must have a url, that url must
+  * start with "http" (case-insensitively), and the record must be longer
+  * than MIN_HTTP_HEADER_LENGTH. Be aware that headers in content are
+  * distinct from {@link ArchiveRecordHeader} 'headers'.
+  *
+  * @return True if this Record's content probably has headers.
+  */
+ public boolean hasContentHeaders() {
+     final String url = getHeader().getUrl();
+     return url != null
+         && url.toLowerCase().startsWith("http")
+         && getHeader().getLength() > MIN_HTTP_HEADER_LENGTH;
+ }
+
+ /**
+  * Resets the position within the record to the given body offset.
+  *
+  * @param bodyOffset New position within the record.
+  */
+ protected void setBodyOffset(final int bodyOffset) {
+     this.position = bodyOffset;
+ }
+}
diff --git a/src/main/java/org/archive/io/ArchiveRecordHeader.java b/src/main/java/org/archive/io/ArchiveRecordHeader.java
new file mode 100644
index 00000000..953537b1
--- /dev/null
+++ b/src/main/java/org/archive/io/ArchiveRecordHeader.java
@@ -0,0 +1,111 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Archive Record Header.
+ *
+ * Accessors for the metadata that accompanies a single archive record.
+ * @author stack
+ * @version $Date$ $Version$
+ */
+public interface ArchiveRecordHeader {
+ /**
+ * Get the time when the record was created.
+ * @return Date in 14 digit time format (UTC).
+ * @see org.archive.util.ArchiveUtils#parse14DigitDate(String)
+ */
+ public abstract String getDate();
+
+ /**
+ * @return Length of the record.
+ */
+ public abstract long getLength();
+
+ /**
+ * @return Content-Length of the contents of the record.
+ */
+ public abstract long getContentLength();
+
+
+ /**
+ * @return Record subject-url.
+ */
+ public abstract String getUrl();
+
+ /**
+ * @return Record mimetype.
+ */
+ public abstract String getMimetype();
+
+ /**
+ * @return Record version.
+ */
+ public abstract String getVersion();
+
+ /**
+ * @return Offset into Archive file at which this record begins.
+ */
+ public abstract long getOffset();
+
+ /**
+ * @param key Key to use looking up field value.
+ * @return Value for passed key, or null if no such entry.
+ */
+ public abstract Object getHeaderValue(final String key);
+
+ /**
+ * @return Header field name keys.
+ */
+ public abstract Set getHeaderFieldKeys();
+
+ /**
+ * @return Map of header fields.
+ */
+ public abstract Map getHeaderFields();
+
+ /**
+ * @return Returns identifier for current Archive file. Be aware this
+ * may not be a file name or file path. It may just be an URL. Depends
+ * on how Archive file was made.
+ */
+ public abstract String getReaderIdentifier();
+
+ /**
+ * @return Identifier for the record. If ARC, the URL + date. If WARC,
+ * the GUID assigned.
+ */
+ public abstract String getRecordIdentifier();
+
+ /**
+ * @return Returns digest as String for this record. Only available after
+ * the record has been read in totality.
+ */
+ public abstract String getDigest();
+
+ /**
+ * Offset at which the content begins.
+ * For ARCs, it is used to delimit where http headers end and content begins.
+ * For WARCs, it is the end of the Named Fields, before the payload starts.
+ * @return Offset at which the content begins.
+ */
+ public int getContentBegin();
+
+ /**
+ * @return String form of this header.
+ */
+ public abstract String toString();
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/ArraySeekInputStream.java b/src/main/java/org/archive/io/ArraySeekInputStream.java
new file mode 100644
index 00000000..5b30747e
--- /dev/null
+++ b/src/main/java/org/archive/io/ArraySeekInputStream.java
@@ -0,0 +1,106 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.IOException;
+
+
+/**
+ * A seekable input stream that serves its bytes straight out of a byte
+ * array.
+ *
+ * @author pjack
+ */
+public class ArraySeekInputStream extends SeekInputStream {
+
+    /** Backing array. It is not copied, so later changes show through. */
+    private byte[] array;
+
+    /** Index in the array of the next byte to hand out. */
+    private int offset;
+
+    /**
+     * Constructor. Note that changes to the given array will be reflected
+     * in the stream.
+     *
+     * @param array The array to read bytes from.
+     */
+    public ArraySeekInputStream(byte[] array) {
+        this.offset = 0;
+        this.array = array;
+    }
+
+    /**
+     * @return The next byte as a value in 0..255, or -1 once the end of the
+     * array has been reached.
+     */
+    @Override
+    public int read() {
+        if (offset < array.length) {
+            return array[offset++] & 0xFF;
+        }
+        return -1;
+    }
+
+    /**
+     * Copies up to len bytes from the current position into buf.
+     *
+     * @param buf destination buffer
+     * @param ofs index in buf at which to start storing bytes
+     * @param len maximum number of bytes to copy
+     * @return the number of bytes copied; note this is 0 (not -1) when the
+     * end of the array has been reached
+     */
+    @Override
+    public int read(byte[] buf, int ofs, int len) {
+        final int left = array.length - offset;
+        if (left <= 0) {
+            return 0;
+        }
+        final int count = Math.min(len, left);
+        System.arraycopy(array, offset, buf, ofs, count);
+        offset += count;
+        return count;
+    }
+
+    /**
+     * Same as {@code read(buf, 0, buf.length)}.
+     */
+    @Override
+    public int read(byte[] buf) {
+        final int capacity = buf.length;
+        return read(buf, 0, capacity);
+    }
+
+    /**
+     * Returns the position of the stream.
+     */
+    public long position() {
+        return this.offset;
+    }
+
+    /**
+     * Repositions the stream. Any position from 0 up to and including the
+     * array length is accepted.
+     *
+     * @param p the new position for the stream
+     * @throws IOException if the given position is out of bounds
+     */
+    public void position(long p) throws IOException {
+        final boolean inBounds = (p >= 0) && (p <= array.length);
+        if (!inBounds) {
+            throw new IOException("Invalid position: " + p);
+        }
+        offset = (int) p;
+    }
+
+}
diff --git a/src/main/java/org/archive/io/BufferedSeekInputStream.java b/src/main/java/org/archive/io/BufferedSeekInputStream.java
new file mode 100644
index 00000000..2fdc72b7
--- /dev/null
+++ b/src/main/java/org/archive/io/BufferedSeekInputStream.java
@@ -0,0 +1,217 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+import java.io.IOException;
+
+
+/**
+ * Buffers data from some other SeekInputStream.
+ *
+ * @author pjack
+ */
+public class BufferedSeekInputStream extends SeekInputStream {
+
+
+ /**
+ * The underlying input stream.
+ */
+ final private SeekInputStream input;
+
+
+ /**
+ * The buffered data.
+ */
+ final private byte[] buffer;
+
+
+ /**
+ * The maximum offset of valid data in the buffer. Usually the same
+ * as buffer.length, but may be shorter if we're in the last region
+ * of the stream.
+ */
+ private int maxOffset;
+
+
+ /**
+ * The offset of within the buffer of the next byte to read.
+ */
+ private int offset;
+
+
+ /**
+  * Constructor. Allocates the buffer and fills it once from the given
+  * stream's current position.
+  *
+  * @param input the underlying input stream
+  * @param capacity the size of the buffer
+  * @throws IOException if an IO error occurs filling the first buffer
+  */
+ public BufferedSeekInputStream(SeekInputStream input, int capacity)
+ throws IOException {
+     this.buffer = new byte[capacity];
+     this.input = input;
+     buffer();
+ }
+
+ /**
+  * Fills the buffer from the underlying stream, reading until the buffer is
+  * full or the stream stops returning data. Afterwards offset is 0 and
+  * maxOffset is the number of bytes loaded.
+  *
+  * @throws IOException if an IO error occurs
+  */
+ private void buffer() throws IOException {
+     int filled = 0;
+     while (filled < buffer.length) {
+         final int r = input.read(buffer, filled, buffer.length - filled);
+         if (r <= 0) {
+             // Not enough information to fill the buffer
+             break;
+         }
+         filled += r;
+     }
+     offset = 0;
+     maxOffset = filled;
+ }
+
+
+ /**
+  * Refills the buffer when every buffered byte has been consumed.
+  *
+  * @throws IOException if an IO error occurs
+  */
+ private void ensureBuffer() throws IOException {
+     if (offset < maxOffset) {
+         return;
+     }
+     buffer();
+ }
+
+
+ /**
+  * Returns how many buffered bytes have not been read yet.
+  *
+  * @return the remaining bytes
+  */
+ private int remaining() {
+     final int unread = maxOffset - offset;
+     return unread;
+ }
+
+
+ /**
+  * @return the next byte as a value in 0..255, or -1 when a refill
+  * produced no data
+  */
+ @Override
+ public int read() throws IOException {
+     ensureBuffer();
+     return (maxOffset == 0) ? -1 : (buffer[offset++] & 0xFF);
+ }
+
+
+ /**
+  * Copies up to len bytes out of the current buffer; a single call never
+  * spans a refill.
+  *
+  * @return the number of bytes copied; note this is 0 (not -1) when a
+  * refill produced no data
+  */
+ @Override
+ public int read(byte[] buf, int ofs, int len) throws IOException {
+     ensureBuffer();
+     if (maxOffset == 0) {
+         return 0;
+     }
+     final int count = Math.min(len, maxOffset - offset);
+     System.arraycopy(buffer, offset, buf, ofs, count);
+     offset += count;
+     return count;
+ }
+
+
+ /**
+  * Same as {@code read(buf, 0, buf.length)}.
+  */
+ @Override
+ public int read(byte[] buf) throws IOException {
+     final int capacity = buf.length;
+     return read(buf, 0, capacity);
+ }
+
+
+ /**
+  * Skips bytes within the current buffer only; a single call never spans a
+  * refill, so fewer than c bytes may be skipped.
+  *
+  * @return the number of bytes skipped
+  */
+ @Override
+ public long skip(long c) throws IOException {
+     ensureBuffer();
+     if (maxOffset == 0) {
+         return 0;
+     }
+     final int wanted = (int) Math.min(c, (long) Integer.MAX_VALUE);
+     final int skipped = Math.min(wanted, maxOffset - offset);
+     offset += skipped;
+     return skipped;
+ }
+
+
+ /**
+  * Returns the stream's current position.
+  *
+  * The underlying stream sits just past the last byte loaded into the
+  * buffer, so the buffer starts maxOffset bytes before it. Using maxOffset
+  * rather than the buffer capacity keeps the result right when the buffer
+  * is only partly filled (the last region of the stream, or a stream
+  * shorter than the buffer).
+  *
+  * @return the current position
+  * @throws IOException if the underlying stream cannot report its position
+  */
+ public long position() throws IOException {
+     return input.position() - maxOffset + offset;
+ }
+
+
+ /**
+  * Seeks to the given position. When the position falls inside the data
+  * already buffered only the buffer offset is moved; otherwise the
+  * underlying stream is repositioned and the buffer refilled.
+  *
+  * @param p the position to set
+  * @throws IOException if an IO error occurs
+  */
+ public void position(long p) throws IOException {
+     final long blockStart = (input.position() - maxOffset)
+         / buffer.length * buffer.length;
+     final long blockEnd = blockStart + maxOffset;
+     if ((p < blockStart) || (p >= blockEnd)) {
+         positionDirect(p);
+         return;
+     }
+     // Desired position is somewhere inside current buffer
+     offset = (int) (p - blockStart);
+ }
+
+
+ /**
+  * Positions the underlying stream at the start of the buffer-sized block
+  * containing the given position, refills the buffer, then points the
+  * buffer offset at the requested byte.
+  *
+  * @param p the position to set
+  * @throws IOException if an IO error occurs
+  */
+ private void positionDirect(long p) throws IOException {
+     final long withinBlock = p % buffer.length;
+     input.position(p - withinBlock);
+     buffer();
+     offset = (int) withinBlock;
+ }
+
+ /**
+  * Close the stream, including the wrapped input stream.
+  *
+  * @throws IOException if closing either stream fails
+  */
+ public void close() throws IOException {
+     super.close();
+     if (this.input == null) {
+         return;
+     }
+     this.input.close();
+ }
+
+
+}
diff --git a/src/main/java/org/archive/io/CharSubSequence.java b/src/main/java/org/archive/io/CharSubSequence.java
new file mode 100644
index 00000000..1e89da56
--- /dev/null
+++ b/src/main/java/org/archive/io/CharSubSequence.java
@@ -0,0 +1,90 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+/**
+ * Provides a subsequence view onto a CharSequence.
+ *
+ * @author gojomo
+ * @version $Revision$, $Date$
+ */
+public class CharSubSequence implements CharSequence {
+
+ protected CharSequence inner;
+ protected int start;
+ protected int end;
+
+ /**
+  * Creates a view of inner covering the characters from start (inclusive)
+  * to end (exclusive). The characters are not copied.
+  *
+  * @param inner sequence to provide a view onto
+  * @param start index in inner of the first character of the view
+  * @param end index in inner just past the last character of the view
+  * @throws IllegalArgumentException if end is before start, or either
+  * index is negative
+  * @throws NullPointerException if inner is null
+  */
+ public CharSubSequence(CharSequence inner, int start, int end) {
+     if (end < start) {
+         // Message previously read "is >  than end", which was garbled.
+         throw new IllegalArgumentException("Start " + start +
+             " is greater than end " + end);
+     }
+
+     if (end < 0 || start < 0) {
+         throw new IllegalArgumentException("Start " + start + " or end " +
+             end + " is < 0.");
+     }
+
+     if (inner == null) {
+         throw new NullPointerException("Passed charsequence is null.");
+     }
+
+     this.inner = inner;
+     this.start = start;
+     this.end = end;
+ }
+
+ /**
+  * @return number of characters in this view (end minus start)
+  * @see java.lang.CharSequence#length()
+  */
+ public int length() {
+     final int span = end - start;
+     return span;
+ }
+
+ /**
+  * @param index position within this view
+  * @return the character of the inner sequence at start plus index
+  * @see java.lang.CharSequence#charAt(int)
+  */
+ public char charAt(int index) {
+     final int innerIndex = start + index;
+     return inner.charAt(innerIndex);
+ }
+
+ /**
+  * Returns a further view layered on top of this one; nothing is copied.
+  *
+  * @param begin start index, relative to this view
+  * @param finish end index, relative to this view
+  * @return a new CharSubSequence wrapping this instance
+  * @see java.lang.CharSequence#subSequence(int, int)
+  */
+ public CharSequence subSequence(int begin, int finish) {
+     final CharSequence view = new CharSubSequence(this, begin, finish);
+     return view;
+ }
+
+ /*
+ * (non-Javadoc)
+ * @see java.lang.CharSequence#toString()
+ */
+ public String toString() {
+ StringBuffer sb = new StringBuffer(length());
+ // could use StringBuffer.append(CharSequence) if willing to do 1.5 & up
+ for (int i = 0;i filenames;
+
+ /**
+  * Reads one byte, moving on to the next file each time the current one
+  * is exhausted and more files remain.
+  *
+  * @see java.io.InputStream#read()
+  */
+ public int read() throws IOException {
+     int c = super.read();
+     while (c == -1 && filenames.hasNext()) {
+         cueStream();
+         c = super.read();
+     }
+     return c;
+ }
+ /**
+  * Reads into part of an array, moving on to the next file each time the
+  * current one is exhausted and more files remain.
+  *
+  * @see java.io.InputStream#read(byte[], int, int)
+  */
+ public int read(byte[] b, int off, int len) throws IOException {
+     int c = super.read(b, off, len);
+     while (c == -1 && filenames.hasNext()) {
+         cueStream();
+         c = super.read(b, off, len);
+     }
+     return c;
+ }
+ /**
+  * Reads into an array, moving on to the next file each time the current
+  * one is exhausted and more files remain.
+  *
+  * @see java.io.InputStream#read(byte[])
+  */
+ public int read(byte[] b) throws IOException {
+     int c = super.read(b);
+     while (c == -1 && filenames.hasNext()) {
+         cueStream();
+         c = super.read(b);
+     }
+     return c;
+ }
+
+ /* (non-Javadoc)
+ * @see java.io.InputStream#skip(long)
+ */
+ public long skip(long n) throws IOException {
+ long s = super.skip(n);
+ if( s files) throws IOException {
+ super(null);
+ filenames = files.iterator();
+ cueStream();
+ }
+
+ /**
+  * Makes the next file, if there is one, the stream being read from, and
+  * closes the stream it replaces so its file handle is released.
+  *
+  * @throws IOException if the next file cannot be opened or the previous
+  * stream cannot be closed
+  */
+ private void cueStream() throws IOException {
+     if (!filenames.hasNext()) {
+         return;
+     }
+     final java.io.InputStream previous = this.in;
+     // Open the next file first: if that fails the current stream is left
+     // in place, exactly as before.
+     this.in = new FileInputStream(filenames.next());
+     if (previous != null) {
+         // previous is null on the very first call, made by the constructor.
+         previous.close();
+     }
+ }
+
+}
diff --git a/src/main/java/org/archive/io/CompositeFileReader.java b/src/main/java/org/archive/io/CompositeFileReader.java
new file mode 100644
index 00000000..14b56219
--- /dev/null
+++ b/src/main/java/org/archive/io/CompositeFileReader.java
@@ -0,0 +1,40 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.List;
+
+
+/**
+ * Reader over the characters of a list of files, read back to back through
+ * a {@link CompositeFileInputStream}.
+ *
+ * NOTE(review): the single-argument InputStreamReader constructor is used,
+ * so bytes are decoded with the platform default charset -- confirm that is
+ * intended.
+ *
+ * @author gojomo
+ */
+public class CompositeFileReader extends InputStreamReader {
+
+ /**
+ * @param filenames Files to read, in order.
+ * @throws IOException If the underlying CompositeFileInputStream cannot be
+ * created.
+ */
+ public CompositeFileReader(List filenames) throws IOException {
+ super(new CompositeFileInputStream(filenames));
+ }
+
+}
diff --git a/src/main/java/org/archive/io/Endian.java b/src/main/java/org/archive/io/Endian.java
new file mode 100644
index 00000000..f6d89aaa
--- /dev/null
+++ b/src/main/java/org/archive/io/Endian.java
@@ -0,0 +1,125 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+
+
+/**
+ * Static helpers for decoding 16- and 32-bit integers from a byte stream in
+ * either little-endian or big-endian order.
+ *
+ * @author pjack
+ */
+public class Endian {
+
+    /**
+     * Static utility class.
+     */
+    private Endian() {
+    }
+
+    /**
+     * Reads one byte, turning end-of-stream into an exception.
+     *
+     * @param input the input stream to read from
+     * @return the byte read, as a value in 0..255
+     * @throws EOFException if the stream is exhausted
+     * @throws IOException if an IO error occurs
+     */
+    private static int requireByte(InputStream input) throws IOException {
+        final int b = input.read();
+        if (b < 0) {
+            throw new EOFException();
+        }
+        return b;
+    }
+
+    /**
+     * Reads the next little-endian unsigned 16 bit integer from the
+     * given stream.
+     *
+     * @param input the input stream to read from
+     * @return the next 16-bit little-endian integer
+     * @throws IOException if an IO error occurs
+     */
+    public static char littleChar(InputStream input) throws IOException {
+        final int low = requireByte(input);
+        final int high = requireByte(input);
+        return (char) (low | (high << 8));
+    }
+
+    /**
+     * Reads the next little-endian signed 16-bit integer from the
+     * given stream.
+     *
+     * @param input the input stream to read from
+     * @return the next 16-bit little-endian integer
+     * @throws IOException if an IO error occurs
+     */
+    public static short littleShort(InputStream input) throws IOException {
+        final char unsigned = littleChar(input);
+        return (short) unsigned;
+    }
+
+    /**
+     * Reads the next little-endian signed 32-bit integer from the
+     * given stream.
+     *
+     * @param input the input stream to read from
+     * @return the next 32-bit little-endian integer
+     * @throws IOException if an IO error occurs
+     */
+    public static int littleInt(InputStream input) throws IOException {
+        final int lowWord = littleChar(input);
+        final int highWord = littleChar(input);
+        return lowWord | (highWord << 16);
+    }
+
+    /**
+     * Reads the next big-endian unsigned 16 bit integer from the
+     * given stream.
+     *
+     * @param input the input stream to read from
+     * @return the next 16-bit big-endian integer
+     * @throws IOException if an IO error occurs
+     */
+    public static char bigChar(InputStream input) throws IOException {
+        final int high = requireByte(input);
+        final int low = requireByte(input);
+        return (char) (low | (high << 8));
+    }
+
+    /**
+     * Reads the next big-endian signed 32-bit integer from the
+     * given stream.
+     *
+     * @param input the input stream to read from
+     * @return the next 32-bit big-endian integer
+     * @throws IOException if an IO error occurs
+     */
+    public static int bigInt(InputStream input) throws IOException {
+        final int highWord = bigChar(input);
+        final int lowWord = bigChar(input);
+        return lowWord | (highWord << 16);
+    }
+}
diff --git a/src/main/java/org/archive/io/GZIPMembersInputStream.java b/src/main/java/org/archive/io/GZIPMembersInputStream.java
new file mode 100644
index 00000000..35fb9e90
--- /dev/null
+++ b/src/main/java/org/archive/io/GZIPMembersInputStream.java
@@ -0,0 +1,38 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Compatibility shim kept under the old package name; it adds nothing to
+ * its superclass beyond forwarding the constructors.
+ *
+ * @deprecated use {@link org.archive.util.zip.GZIPMembersInputStream}
+ */
+@Deprecated
+public class GZIPMembersInputStream extends org.archive.util.zip.GZIPMembersInputStream {
+
+ /**
+ * @param in Stream handed straight to the superclass constructor.
+ * @throws IOException If the superclass constructor fails.
+ */
+ public GZIPMembersInputStream(InputStream in) throws IOException {
+ super(in);
+ }
+
+ /**
+ * @param in Stream handed straight to the superclass constructor.
+ * @param size Size argument handed straight to the superclass constructor.
+ * @throws IOException If the superclass constructor fails.
+ */
+ public GZIPMembersInputStream(InputStream in, int size) throws IOException {
+ super(in, size);
+ }
+
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/GenerationFileHandler.java b/src/main/java/org/archive/io/GenerationFileHandler.java
new file mode 100644
index 00000000..c1ce8d79
--- /dev/null
+++ b/src/main/java/org/archive/io/GenerationFileHandler.java
@@ -0,0 +1,200 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.logging.FileHandler;
+import java.util.logging.Formatter;
+import java.util.logging.LogRecord;
+
+import org.archive.util.FileUtils;
+
+
+/**
+ * FileHandler with support for rotating the current file to
+ * an archival name with a specified integer suffix, and
+ * provision of a new replacement FileHandler with the current
+ * filename.
+ *
+ * @author gojomo
+ */
+public class GenerationFileHandler extends FileHandler {
+ private LinkedList filenameSeries = new LinkedList();
+ private boolean shouldManifest = false;
+
+ /**
+ * @return Returns the filenameSeries.
+ */
+ public List getFilenameSeries() {
+ return filenameSeries;
+ }
+
+ /**
+ * Opens a handler on {@code pattern} and records it as the first entry of the filename series.
+ * @param pattern log file name, handed to {@link FileHandler#FileHandler(String, boolean)}
+ * @param append handed to the same {@code FileHandler} constructor
+ * @param shouldManifest flag stored unchanged and reported by {@link #shouldManifest()}
+ * @throws IOException propagated from the {@code FileHandler} constructor
+ * @throws SecurityException propagated from the {@code FileHandler} constructor
+ */
+ public GenerationFileHandler(String pattern, boolean append,
+ boolean shouldManifest)
+ throws IOException, SecurityException {
+ super(pattern, append);
+ filenameSeries.addFirst(pattern);
+ this.shouldManifest = shouldManifest;
+ }
+
+ /**
+ * @param filenameSeries
+ * @param shouldManifest
+ * @throws IOException
+ */
+ public GenerationFileHandler(LinkedList filenameSeries,
+ boolean shouldManifest)
+ throws IOException {
+ super((String)filenameSeries.getFirst(), false); // Never append in this case
+ this.filenameSeries = filenameSeries;
+ this.shouldManifest = shouldManifest;
+ }
+
+ /**
+ * Move the current file to a new filename with the storeSuffix in place
+ * of the activeSuffix; continuing logging to a new file under the
+ * original filename.
+ *
+ * @param storeSuffix Suffix to put in place of activeSuffix
+ * @param activeSuffix Suffix to replace with storeSuffix.
+ * @return GenerationFileHandler instance.
+ * @throws IOException
+ */
+ public GenerationFileHandler rotate(String storeSuffix,
+ String activeSuffix)
+ throws IOException {
+ return rotate(storeSuffix, activeSuffix, false);
+ }
+
+    /**
+     * Closes this handler, renames the active file to the store name and returns a new
+     * handler on the original filename that reuses this handler's formatter. With
+     * {@code mergeOld}, the series is first merged into its last entry via FileUtils.appendTo.
+     * @throws IOException if the active filename lacks {@code activeSuffix} or a rename fails
+     */
+    public GenerationFileHandler rotate(String storeSuffix,
+            String activeSuffix, boolean mergeOld) throws IOException {
+        close();
+        String filename = (String) filenameSeries.getFirst();
+        if (!filename.endsWith(activeSuffix)) {
+            throw new FileNotFoundException("Active file does not have" + " expected suffix");
+        }
+        String storeFilename = filename.substring(0, filename.length() - activeSuffix.length())
+                + storeSuffix;
+        File activeFile = new File(filename);
+        File storeFile = new File(storeFilename);
+        FileUtils.moveAsideIfExists(storeFile);
+
+        if (mergeOld) {
+            // explicit casts, as for getFirst() above: File has no constructor taking Object
+            File fileToAppendTo = new File((String) filenameSeries.getLast());
+            for (int i = filenameSeries.size() - 2; i >= 0; i--) {
+                File f = new File((String) filenameSeries.get(i));
+                FileUtils.appendTo(fileToAppendTo, f);
+                f.delete();
+            }
+            filenameSeries.clear();
+            filenameSeries.add(filename);
+            if (!fileToAppendTo.renameTo(storeFile)) {
+                throw new IOException("Unable to move " + fileToAppendTo + " to " + storeFilename);
+            }
+        } else if (!activeFile.renameTo(storeFile)) {
+            throw new IOException("Unable to move " + filename + " to " + storeFilename);
+        }
+        filenameSeries.add(1, storeFilename);
+        GenerationFileHandler newGfh = new GenerationFileHandler(filenameSeries, shouldManifest);
+        newGfh.setFormatter(this.getFormatter());
+        return newGfh;
+    }
+
+ /**
+ * @return True if should manifest.
+ */
+ public boolean shouldManifest() {
+ return this.shouldManifest;
+ }
+
+ /**
+ * Constructor-helper that, rather than clobbering any existing file, first hands it to
+ * {@code FileUtils.moveAsideIfExists} (moved aside with a timestamp suffix, per the original note).
+ *
+ * @param filename path of the log file; also used as the pattern of the new handler
+ * @param append forwarded to {@link #GenerationFileHandler(String, boolean, boolean)}
+ * @param shouldManifest forwarded to the same constructor
+ * @return a newly constructed handler for {@code filename}
+ * @throws SecurityException propagated from the constructor
+ * @throws IOException if moving the old file aside or constructing the handler fails
+ */
+ public static GenerationFileHandler makeNew(String filename, boolean append, boolean shouldManifest) throws SecurityException, IOException {
+ FileUtils.moveAsideIfExists(new File(filename));
+ return new GenerationFileHandler(filename, append, shouldManifest);
+ }
+
+    @Override
+    public void publish(LogRecord record) {
+        Formatter formatter = getFormatter();
+        if (formatter instanceof Preformatter) {
+            // preformat before entering the synchronized superclass publish(), when possible
+            Preformatter preformatter = (Preformatter) formatter;
+            try {
+                preformatter.preformat(record);
+                super.publish(record);
+            } finally {
+                preformatter.clear(); // always paired with preformat(), even on failure
+            }
+        } else {
+            super.publish(record);
+        }
+    }
+//
+// TODO: determine if there's another way to have this optimization without
+// negative impact on log-following (esp. in web UI)
+// /**
+// * Flush only 1/100th of the usual once-per-record, to reduce the time
+// * spent holding the synchronization lock. (Flush is primarily called in
+// * a superclass's synchronized publish()).
+// *
+// * The eventual close calls a direct flush on the target writer, so all
+// * rotates/ends will ultimately be fully flushed.
+// *
+// * @see java.util.logging.StreamHandler#flush()
+// */
+// @Override
+// public synchronized void flush() {
+// flushCount++;
+// if(flushCount==100) {
+// super.flush();
+// flushCount=0;
+// }
+// }
+// int flushCount;
+
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/GenericReplayCharSequence.java b/src/main/java/org/archive/io/GenericReplayCharSequence.java
new file mode 100644
index 00000000..1af3922b
--- /dev/null
+++ b/src/main/java/org/archive/io/GenericReplayCharSequence.java
@@ -0,0 +1,412 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.CharBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.text.NumberFormat;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.commons.io.IOUtils;
+import org.archive.util.DevUtils;
+
+import com.google.common.base.Charsets;
+import com.google.common.primitives.Ints;
+
+/**
+ * (Replay)CharSequence view on recorded streams.
+ *
+ * For small streams, use {@link InMemoryReplayCharSequence}.
+ *
+ * <p>Call {@link #close()} on this class when done to clean up resources.
+ *
+ * @contributor stack
+ * @contributor nlevitt
+ * @version $Revision$, $Date$
+ */
+public class GenericReplayCharSequence implements ReplayCharSequence {
+
+ protected static Logger logger = Logger
+ .getLogger(GenericReplayCharSequence.class.getName());
+
+ /**
+ * Name of the encoding we use writing out concatenated decoded prefix
+ * buffer and decoded backing file.
+ *
+ * <p>This define is also used as suffix for the file that holds the
+ * decodings. The name of the file that holds the decoding is the name
+ * of the backing file w/ this encoding for a suffix.
+ *
+ * <p>See the {@link Charset} documentation for details on encodings.
+ */
+ public static final Charset WRITE_ENCODING = Charsets.UTF_16BE;
+
+ private static final long MAP_MAX_BYTES = 64 * 1024 * 1024; // 64M
+
+ /**
+ * When the memory map moves away from the beginning of the file
+ * (to the "right") in order to reach a certain index, it will
+ * map up to this many bytes preceding (to the left of) the target character.
+ * Consequently it will map up to
+ * MAP_MAX_BYTES - MAP_TARGET_LEFT_PADDING
+ * bytes to the right of the target.
+ */
+ private static final long MAP_TARGET_LEFT_PADDING_BYTES = (long) (MAP_MAX_BYTES * 0.01);
+
+ /**
+ * Total length of character stream to replay minus the HTTP headers
+ * if present.
+ *
+ * If the backing file is larger than Integer.MAX_VALUE (i.e. 2gb),
+ * only the first Integer.MAX_VALUE characters are available through this API.
+ * We're overriding java.lang.CharSequence so that we can use
+ * java.util.regex directly on the data, and the CharSequence
+ * API uses int for the length and index.
+ */
+ protected int length;
+
+ /** counter of decoding exceptions for report at end */
+ protected long decodingExceptions = 0;
+ protected CharacterCodingException codingException = null;
+
+ /**
+ * Byte offset into the file where the memory mapped portion begins.
+ */
+ private long mapByteOffset;
+
+ // XXX do we need to keep the input stream around?
+ private FileInputStream backingFileIn = null;
+
+ private FileChannel backingFileChannel = null;
+
+ private long bytesPerChar;
+
+ private CharBuffer mappedBuffer = null;
+
+ /**
+ * File that has decoded content.
+ *
+ * Keep it around so we can remove on close.
+ */
+ private File decodedFile = null;
+
+ /*
+ * This portion of the CharSequence precedes what's in the backing file. In
+ * cases where we decodeToFile(), this is always empty, because we decode
+ * the entire input stream.
+ */
+ private CharBuffer prefixBuffer = null;
+
+ private boolean isOpen = true;
+
+ protected Charset charset = null;
+
+ /**
+ * Decodes the stream and, if it overflows the prefix buffer, memory-maps the decoded file.
+ *
+ * @param contentReplayInputStream inputStream of content
+ * @param prefixMax capacity, in characters, of the in-memory prefix buffer
+ * @param backingFilename base path from which the decoded overflow file is named
+ * @param charset Encoding to use reading the input stream. If null,
+ * {@code ReplayCharSequence.FALLBACK_CHARSET} is used instead.
+ *
+ * @throws IOException if decoding the stream or opening the decoded file fails
+ */
+ public GenericReplayCharSequence(InputStream contentReplayInputStream,
+ int prefixMax,
+ String backingFilename,
+ Charset charset) throws IOException {
+ super();
+ logger.fine("characterEncoding=" + charset + " backingFilename="
+ + backingFilename);
+
+ if(charset==null) {
+ charset = ReplayCharSequence.FALLBACK_CHARSET;
+ }
+ // decodes only up to Integer.MAX_VALUE characters
+ decode(contentReplayInputStream, prefixMax, backingFilename, charset);
+
+ this.bytesPerChar = 2; // the decoded file is written as WRITE_ENCODING (UTF-16BE)
+ // map a file only when decode() produced more characters than the prefix buffer received
+ if(length>prefixBuffer.position()) {
+ this.backingFileIn = new FileInputStream(decodedFile);
+ this.backingFileChannel = backingFileIn.getChannel();
+ this.mapByteOffset = 0;
+ updateMemoryMappedBuffer();
+ }
+ }
+
+ private void updateMemoryMappedBuffer() {
+ long charLength = (long) this.length() - (long) prefixBuffer.limit(); // in characters
+ long mapSize = Math.min((charLength * bytesPerChar) - mapByteOffset, MAP_MAX_BYTES);
+ logger.fine("updateMemoryMappedBuffer: mapOffset="
+ + NumberFormat.getInstance().format(mapByteOffset)
+ + " mapSize=" + NumberFormat.getInstance().format(mapSize));
+ try {
+ // TODO: stress-test without these possibly-costly requests!
+// System.gc();
+// System.runFinalization();
+ // TODO: Confirm the READ_ONLY works. I recall it not working.
+ // The buffers seem to always say that the buffer is writable.
+ mappedBuffer = backingFileChannel.map(
+ FileChannel.MapMode.READ_ONLY, mapByteOffset, mapSize)
+ .asReadOnlyBuffer().asCharBuffer();
+ } catch (IOException e) {
+ // TODO convert this to a runtime error?
+ DevUtils.logger.log(Level.SEVERE,
+ " backingFileChannel.map() mapByteOffset=" + mapByteOffset
+ + " mapSize=" + mapSize + "\n" + "decodedFile="
+ + decodedFile + " length=" + length + "\n"
+ + DevUtils.extraInfo(), e);
+ throw new RuntimeException(e);
+ }
+ }
+
+    /**
+     * Decodes {@code inStream} using {@code charset}: up to {@code prefixMax}
+     * characters go into the in-memory prefix buffer, and any remainder is
+     * re-encoded as {@code WRITE_ENCODING} and saved as {@code this.decodedFile},
+     * which is named {@code backingFilename + "." + WRITE_ENCODING}.
+     * Sets {@code length} to the character count, capped at Integer.MAX_VALUE.
+     *
+     * @throws IOException if reading the stream or writing the decoded file fails
+     */
+    protected void decode(InputStream inStream, int prefixMax,
+            String backingFilename, Charset charset) throws IOException {
+        this.charset = charset;
+
+        // TODO: consider if BufferedReader is helping any
+        // TODO: consider adding TBW 'LimitReader' to stop reading at
+        // Integer.MAX_VALUE characters because of charAt(int) limit
+        BufferedReader reader = new BufferedReader(new InputStreamReader(inStream, charset));
+
+        logger.fine("backingFilename=" + backingFilename + " encoding="
+                + charset + " decodedFile=" + decodedFile);
+
+        this.prefixBuffer = CharBuffer.allocate(prefixMax);
+
+        long count = 0;
+        while(count < prefixMax) {
+            int read = reader.read(prefixBuffer);
+            if(read<0) {
+                break;
+            }
+            count += read;
+        }
+
+        int ch = reader.read();
+        if(ch >= 0) {
+            count++;
+            // more to decode to file overflow
+            this.decodedFile = new File(backingFilename + "." + WRITE_ENCODING);
+            FileOutputStream fos;
+            try {
+                fos = new FileOutputStream(this.decodedFile);
+            } catch (FileNotFoundException e) {
+                // Windows workaround attempt
+                System.gc();
+                System.runFinalization();
+                this.decodedFile = new File(decodedFile.getAbsolutePath()+".win");
+                logger.info("Windows 'file with a user-mapped section open' "
+                        + "workaround gc/finalization/name-extension performed.");
+                // try again
+                fos = new FileOutputStream(this.decodedFile);
+            }
+
+            Writer writer = new OutputStreamWriter(fos,WRITE_ENCODING);
+            try {
+                writer.write(ch);
+                count += IOUtils.copyLarge(reader, writer);
+                writer.close();
+            } finally {
+                // no-op after the close above; on failure it still releases the file handle
+                IOUtils.closeQuietly(writer);
+            }
+            reader.close();
+        }
+
+        this.length = Ints.saturatedCast(count);
+        if(count>Integer.MAX_VALUE) {
+            logger.warning("input stream is longer than Integer.MAX_VALUE="
+                    + NumberFormat.getInstance().format(Integer.MAX_VALUE)
+                    + " characters -- only first "
+                    + NumberFormat.getInstance().format(Integer.MAX_VALUE)
+                    + " are accessible through this GenericReplayCharSequence");
+        }
+        logger.fine("decode: decoded " + count + " characters" +
+                ((decodedFile==null) ? ""
+                        : " ("+(count-prefixBuffer.position())+" to "+decodedFile+")"));
+    }
+
+ /**
+ * Get character at passed absolute position.
+ * @param index Index into content
+ * @return Character at offset index.
+ */
+ public char charAt(int index) {
+ if (index < 0 || index >= this.length()) {
+ throw new IndexOutOfBoundsException("index=" + index
+ + " - should be between 0 and length()=" + this.length());
+ }
+
+ // is it in the buffer
+ if (index < prefixBuffer.limit()) {
+ return prefixBuffer.get(index);
+ }
+
+ // otherwise we gotta get it from disk via memory map
+ long charFileIndex = (long) index - (long) prefixBuffer.limit();
+ long charFileLength = (long) this.length() - (long) prefixBuffer.limit(); // in characters
+ if (charFileIndex * bytesPerChar < mapByteOffset) {
+ logger.log(Level.WARNING,"left-fault; probably don't want to use CharSequence that far backward");
+ }
+ if (charFileIndex * bytesPerChar < mapByteOffset
+ || charFileIndex - (mapByteOffset / bytesPerChar) >= mappedBuffer.limit()) {
+ // fault
+ /*
+ * mapByteOffset is bounded by 0 and file size +/- size of the map,
+ * and starts as close to fileIndex -
+ * MAP_TARGET_LEFT_PADDING_BYTES as it can while also not
+ * being smaller than it needs to be.
+ */
+ mapByteOffset = Math.min(charFileIndex * bytesPerChar - MAP_TARGET_LEFT_PADDING_BYTES,
+ charFileLength * bytesPerChar - MAP_MAX_BYTES);
+ mapByteOffset = Math.max(0, mapByteOffset);
+ updateMemoryMappedBuffer();
+ }
+
+ return mappedBuffer.get((int)(charFileIndex-(mapByteOffset/bytesPerChar)));
+ }
+
+ public CharSequence subSequence(int start, int end) {
+ return new CharSubSequence(this, start, end);
+ }
+
+ private void deleteFile(File fileToDelete) {
+ deleteFile(fileToDelete, null);
+ }
+
+ private void deleteFile(File fileToDelete, final Exception e) {
+ if (e != null) {
+ // Log why the delete to help with debug of
+ // java.io.FileNotFoundException:
+ // ....tt53http.ris.UTF-16BE.
+ logger.severe("Deleting " + fileToDelete + " because of "
+ + e.toString());
+ }
+ if (fileToDelete != null && fileToDelete.exists()) {
+ logger.fine("deleting file: " + fileToDelete);
+ fileToDelete.delete();
+ }
+ }
+
+
+ @Override
+ public boolean isOpen() {
+ return this.isOpen;
+ }
+
+    public void close() throws IOException {
+        this.isOpen = false;
+        logger.fine("closing");
+        try {
+            if (this.backingFileChannel != null && this.backingFileChannel.isOpen()) {
+                this.backingFileChannel.close();
+            }
+            if (backingFileIn != null) {
+                backingFileIn.close();
+            }
+        } finally {
+            // runs even when a close() above throws, so the decoded file is not left behind
+            deleteFile(this.decodedFile);
+            // clear decodedFile -- so that double-close (as in finalize()) won't
+            // delete a later instance with same name see bug [ 1218961 ]
+            // "failed get of replay" in ExtractorHTML... usu: UTF-16BE
+            this.decodedFile = null;
+        }
+    }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Object#finalize()
+ */
+ protected void finalize() throws Throwable {
+ super.finalize();
+ logger.fine("finalizing");
+ close();
+ }
+
+ /**
+ * Convenience method for getting a substring.
+ *
+ * @deprecated please use subSequence() and then toString() directly
+ */
+ public String substring(int offset, int len) {
+ return subSequence(offset, offset + len).toString();
+ }
+
+ public String toString() {
+ StringBuilder sb = new StringBuilder(this.length());
+ sb.append(this);
+ return sb.toString();
+ }
+
+ public int length() {
+ return length;
+ }
+
+ /* (non-Javadoc)
+ * @see org.archive.io.ReplayCharSequence#getDecodeExceptionCount()
+ */
+ @Override
+ public long getDecodeExceptionCount() {
+ return decodingExceptions;
+ }
+
+
+ /* (non-Javadoc)
+ * @see org.archive.io.ReplayCharSequence#getCodingException()
+ */
+ @Override
+ public CharacterCodingException getCodingException() {
+ return codingException;
+ }
+
+ /* (non-Javadoc)
+ * @see org.archive.io.ReplayCharSequence#getCharset()
+ */
+ public Charset getCharset() {
+ return charset;
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/GzipHeader.java b/src/main/java/org/archive/io/GzipHeader.java
new file mode 100644
index 00000000..6b8263bc
--- /dev/null
+++ b/src/main/java/org/archive/io/GzipHeader.java
@@ -0,0 +1,26 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+/**
+ * @deprecated use {@link org.archive.util.zip.GzipHeader}
+ */
+@Deprecated
+public class GzipHeader extends org.archive.util.zip.GzipHeader {
+}
diff --git a/src/main/java/org/archive/io/HeaderedArchiveRecord.java b/src/main/java/org/archive/io/HeaderedArchiveRecord.java
new file mode 100644
index 00000000..3cce595b
--- /dev/null
+++ b/src/main/java/org/archive/io/HeaderedArchiveRecord.java
@@ -0,0 +1,423 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.PrintStream;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpParser;
+import org.apache.commons.httpclient.StatusLine;
+import org.apache.commons.httpclient.util.EncodingUtil;
+import org.archive.io.arc.ARCConstants;
+import org.archive.util.LaxHttpParser;
+
+/**
+ * An ArchiveRecord whose content has a preamble of RFC822-like headers: e.g.
+ * The ArchiveRecord is a http response that leads off with http response
+ * headers. Use this ArchiveRecord Decorator to get at the content headers and
+ * the header/content demarcation.
+ *
+ * @author stack
+ * @author Olaf Freyer
+ */
+public class HeaderedArchiveRecord extends ArchiveRecord {
+ private int contentHeadersLength = -1;
+ private int statusCode = -1;
+
+ /**
+ * Http header bytes.
+ *
+ * If non-null and bytes available, give out its contents before we
+ * go back to the underlying stream.
+ */
+ private InputStream contentHeaderStream = null;
+
+ /**
+ * Content headers.
+ *
+ * Only available after the reading of headers.
+ */
+ private Header [] contentHeaders = null;
+
+
+ public HeaderedArchiveRecord(final ArchiveRecord ar) throws IOException {
+ super(ar);
+ }
+
+ public HeaderedArchiveRecord(final ArchiveRecord ar,
+ final boolean readContentHeader) throws IOException {
+ super(ar);
+ if (readContentHeader) {
+ this.contentHeaderStream = readContentHeaders();
+ }
+ }
+
+ /**
+ * Skip over the content headers if present.
+ *
+ * Subsequent reads will get the body.
+ *
+ * <p>Calling this method in the midst of reading the header
+ * will make for strange results. Otherwise, safe to call
+ * at any time though before reading any of the record
+ * content is only time that it makes sense.
+ *
+ * <p>After calling this method, you can call
+ * {@link #getContentHeaders()} to get the read http header.
+ *
+ * @throws IOException
+ */
+    public void skipHttpHeader() throws IOException {
+        if (this.contentHeaderStream == null) {
+            return;
+        }
+        // Drain what is left of the buffered header bytes. One pass is normally
+        // enough, because the scratch buffer is sized to everything available.
+        int remaining = this.contentHeaderStream.available();
+        // read() sets contentHeaderStream to null once it is exhausted, hence the
+        // null test ahead of every available() call in the loop condition.
+        while (this.contentHeaderStream != null
+                && (remaining = this.contentHeaderStream.available()) > 0) {
+            // the bytes are discarded; read() is called only to advance past them
+            byte[] scratch = new byte[remaining];
+            read(scratch, 0, remaining);
+        }
+    }
+
+ public void dumpHttpHeader() throws IOException {
+ dumpHttpHeader(System.out);
+ }
+
+ public void dumpHttpHeader(final PrintStream stream) throws IOException {
+ if (this.contentHeaderStream == null) {
+ return;
+ }
+ // Dump the httpHeaderStream to STDOUT
+ for (int available = this.contentHeaderStream.available();
+ this.contentHeaderStream != null
+ && (available = this.contentHeaderStream.available()) > 0;) {
+ // We should be in this loop only once and should do this
+ // buffer allocation once.
+ byte[] buffer = new byte[available];
+ // The read nulls out httpHeaderStream when done with it so
+ // need check for null in the loop control line.
+ int read = read(buffer, 0, available);
+ stream.write(buffer, 0, read);
+ }
+ }
+
+ /**
+ * Read header if present. Technique borrowed from HttpClient HttpParse
+ * class. Using http parser code for now. Later move to more generic header
+ * parsing code if there proves a need.
+ *
+ * @return ByteArrayInputStream with the http header in it or null if no
+ * http header.
+ * @throws IOException
+ */
+    private InputStream readContentHeaders() throws IOException {
+        // If judged a record that doesn't have an http header, return
+        // immediately.
+        if (!hasContentHeaders()) {
+            return null;
+        }
+        byte [] statusBytes = LaxHttpParser.readRawLine(getIn());
+        int eolCharCount = getEolCharsCount(statusBytes);
+        if (eolCharCount <= 0) {
+            throw new IOException("Failed to read raw line where one was expected: " +
+                ((statusBytes != null)? new String(statusBytes): null));
+        }
+        String statusLine = EncodingUtil.getString(statusBytes, 0,
+            statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
+        if (statusLine == null) {
+            throw new NullPointerException("Expected status line is null");
+        }
+        // TODO: Tighten up this test; only GET and POST request lines are recognised.
+        boolean isHttpResponse = StatusLine.startsWithHTTP(statusLine);
+        boolean isHttpRequest = false;
+        if (!isHttpResponse) {
+            String upperLine = statusLine.toUpperCase();
+            isHttpRequest = upperLine.startsWith("GET") || upperLine.startsWith("POST");
+        }
+        if (!isHttpResponse && !isHttpRequest) {
+            throw new UnexpectedStartLineIOException("Failed parse of " +
+                "status line: " + statusLine);
+        }
+        this.statusCode = isHttpResponse?
+            (new StatusLine(statusLine)).getStatusCode(): -1;
+
+        // Save off all bytes read. Keep them as bytes rather than
+        // convert to strings so we don't have to worry about encodings
+        // though this should never be a problem doing http headers since
+        // its all supposed to be ascii.
+        ByteArrayOutputStream baos =
+            new ByteArrayOutputStream(statusBytes.length + 4 * 1024);
+        baos.write(statusBytes);
+
+        // Now read rest of the header lines looking for the separation
+        // between header and body.
+        for (byte [] lineBytes = null; true;) {
+            lineBytes = LaxHttpParser.readRawLine(getIn());
+            eolCharCount = getEolCharsCount(lineBytes);
+            if (eolCharCount <= 0) {
+                throw new IOException("Failed reading headers: " +
+                    ((lineBytes != null)? new String(lineBytes): null));
+            }
+            // Save the bytes read.
+            baos.write(lineBytes);
+            if ((lineBytes.length - eolCharCount) <= 0) {
+                // A line holding nothing but its EOL ends the http header.
+                break;
+            }
+        }
+
+        byte [] headerBytes = baos.toByteArray();
+        // Save off where content body, post content headers, starts.
+        this.contentHeadersLength = headerBytes.length;
+        ByteArrayInputStream bais =
+            new ByteArrayInputStream(headerBytes);
+        if (!bais.markSupported()) {
+            throw new IOException("ByteArrayInputStream does not support mark");
+        }
+        bais.mark(headerBytes.length);
+        // Read the status line. Don't let it into the parseHeaders function.
+        // It doesn't know what to do with it.
+        bais.read(statusBytes, 0, statusBytes.length);
+        this.contentHeaders = LaxHttpParser.parseHeaders(bais,
+            ARCConstants.DEFAULT_ENCODING);
+        bais.reset();
+        return bais;
+    }
+
+ public static class UnexpectedStartLineIOException
+ extends RecoverableIOException {
+ private static final long serialVersionUID = 1L;
+
+ public UnexpectedStartLineIOException(final String reason) {
+ super(reason);
+ }
+ }
+
+ /**
+ * @param bytes Array of bytes to examine for an EOL.
+ * @return Count of end-of-line characters or zero if none.
+ */
+    private int getEolCharsCount(byte [] bytes) {
+        // nothing to inspect, or the last byte is not LF: no line terminator
+        if (bytes == null || bytes.length < 1 || bytes[bytes.length - 1] != '\n') {
+            return 0;
+        }
+        // LF is present; a CR directly before it makes the terminator CRLF
+        if (bytes.length >= 2 && bytes[bytes.length - 2] == '\r') {
+            return 2;
+        }
+        return 1;
+    }
+
+ /**
+ * @return If headers are for a http response AND the headers have been
+ * read, return status code. Else return -1.
+ */
+ public int getStatusCode() {
+ return this.statusCode;
+ }
+
+ /**
+ * @return Returns length of content headers or -1 if headers have
+ * not yet been read.
+ */
+ public int getContentHeadersLength() {
+ return this.contentHeadersLength;
+ }
+
+ public Header[] getContentHeaders() {
+ return contentHeaders;
+ }
+
+ /**
+ * @return Next character in this ARCRecord's content else -1 if at end of
+ * this record.
+ * @throws IOException
+ */
+ public int read() throws IOException {
+ int c = -1;
+ if (this.contentHeaderStream != null &&
+ (this.contentHeaderStream.available() > 0)) {
+ // If http header, return bytes from it before we go to underlying
+ // stream.
+ c = this.contentHeaderStream.read();
+ // If done with the header stream, null it out.
+ if (this.contentHeaderStream.available() <= 0) {
+ this.contentHeaderStream = null;
+ }
+ // do not increment position -
+ // the underlying ArchiveRecord stream allready did this
+ // incrementPosition();
+ } else {
+ c = super.read();
+ }
+ return c;
+ }
+
+ public int read(byte [] b, int offset, int length) throws IOException {
+ int read = -1;
+ if (this.contentHeaderStream != null &&
+ (this.contentHeaderStream.available() > 0)) {
+ // If http header, return bytes from it before we go to underlying
+ // stream.
+ read = Math.min(length, this.contentHeaderStream.available());
+ if (read == 0) {
+ read = -1;
+ } else {
+ read = this.contentHeaderStream.read(b, offset, read);
+ }
+ // If done with the header stream, null it out.
+ if (this.contentHeaderStream.available() <= 0) {
+ this.contentHeaderStream = null;
+ }
+ // do not increment position -
+ // the underlying ArchiveRecord stream allready did this
+ //incrementPosition();
+ } else {
+ read = super.read(b, offset, length);
+ }
+ return read;
+ }
+
+ @Override
+ public int available() {
+ return ((ArchiveRecord)this.in).available();
+ }
+
+ @Override
+ public void close() throws IOException {
+ ((ArchiveRecord)this.in).close();
+ }
+
+ @Override
+ public void dump() throws IOException {
+ ((ArchiveRecord)this.in).dump();
+ }
+
+ @Override
+ public void dump(OutputStream os) throws IOException {
+ ((ArchiveRecord)this.in).dump(os);
+ }
+
+ @Override
+ protected String getDigest4Cdx(ArchiveRecordHeader h) {
+ return ((ArchiveRecord)this.in).getDigest4Cdx(h);
+ }
+
+ @Override
+ public String getDigestStr() {
+ return ((ArchiveRecord)this.in).getDigestStr();
+ }
+
+ @Override
+ public ArchiveRecordHeader getHeader() {
+ return ((ArchiveRecord)this.in).getHeader();
+ }
+
+ @Override
+ protected String getIp4Cdx(ArchiveRecordHeader h) {
+ return ((ArchiveRecord)this.in).getIp4Cdx(h);
+ }
+
+ @Override
+ protected String getMimetype4Cdx(ArchiveRecordHeader h) {
+ return ((ArchiveRecord)this.in).getMimetype4Cdx(h);
+ }
+
+ @Override
+ public long getPosition() {
+ return ((ArchiveRecord)this.in).getPosition();
+ }
+
+ @Override
+ protected String getStatusCode4Cdx(ArchiveRecordHeader h) {
+ return ((ArchiveRecord)this.in).getStatusCode4Cdx(h);
+ }
+
+ @Override
+ public boolean hasContentHeaders() {
+ return ((ArchiveRecord)this.in).hasContentHeaders();
+ }
+
+ @Override
+ protected void incrementPosition() {
+ ((ArchiveRecord)this.in).incrementPosition();
+ }
+
+ @Override
+ protected void incrementPosition(long incr) {
+ ((ArchiveRecord)this.in).incrementPosition(incr);
+ }
+
+ @Override
+ protected boolean isEor() {
+ return ((ArchiveRecord)this.in).isEor();
+ }
+
+ @Override
+ public boolean isStrict() {
+ return ((ArchiveRecord)this.in).isStrict();
+ }
+
+ @Override
+ public boolean markSupported() {
+ return ((ArchiveRecord)this.in).markSupported();
+ }
+
+ @Override
+ protected String outputCdx(String strippedFileName) throws IOException {
+ return ((ArchiveRecord)this.in).outputCdx(strippedFileName);
+ }
+
+ @Override
+ protected void setEor(boolean eor) {
+ ((ArchiveRecord)this.in).setEor(eor);
+ }
+
+ @Override
+ protected void setHeader(ArchiveRecordHeader header) {
+ ((ArchiveRecord)this.in).setHeader(header);
+ }
+
+ @Override
+ public void setStrict(boolean strict) {
+ ((ArchiveRecord)this.in).setStrict(strict);
+ }
+
+ @Override
+ protected void skip() throws IOException {
+ ((ArchiveRecord)this.in).skip();
+ }
+
+ @Override
+ public long skip(long n) throws IOException {
+ return ((ArchiveRecord)this.in).skip(n);
+ }
+}
diff --git a/src/main/java/org/archive/io/LoudObjectOutputStream.java b/src/main/java/org/archive/io/LoudObjectOutputStream.java
new file mode 100644
index 00000000..959c2620
--- /dev/null
+++ b/src/main/java/org/archive/io/LoudObjectOutputStream.java
@@ -0,0 +1,63 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.IOException;
+import java.io.ObjectOutputStream;
+import java.io.OutputStream;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.logging.Logger;
+
+/**
+ * ObjectOutputStream that logs class name of each object that is written
+ * to the stream. Useful for tracking down sources of NotSerializableException.
+ *
+ * @author pjack
+ *
+ */
+public class LoudObjectOutputStream extends ObjectOutputStream {
+
+    /** Receives one "WROTE: ..." message per distinct class name. */
+    private static final Logger LOGGER = Logger.getLogger(
+            LoudObjectOutputStream.class.getName());
+
+    /** Class names already reported, so that each one is logged only once. */
+    private final Set<String> alreadyLogged = new HashSet<String>();
+
+    /** Wraps {@code out} and enables object replacement so replaceObject() gets called. */
+    public LoudObjectOutputStream(OutputStream out) throws IOException {
+        super(out);
+        this.enableReplaceObject(true);
+    }
+
+    /** Logs the class name of {@code obj} the first time it is seen; always returns {@code obj} itself. */
+    @Override
+    protected Object replaceObject(Object obj) throws IOException {
+        if (obj != null) {
+            String name = obj.getClass().getName();
+            if (alreadyLogged.add(name)) {
+                LOGGER.info("WROTE: " + name);
+            }
+        }
+        return obj;
+    }
+
+}
diff --git a/src/main/java/org/archive/io/MiserOutputStream.java b/src/main/java/org/archive/io/MiserOutputStream.java
new file mode 100644
index 00000000..f10ac9ca
--- /dev/null
+++ b/src/main/java/org/archive/io/MiserOutputStream.java
@@ -0,0 +1,82 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.FilterOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+
+/**
+ * A filter stream that both counts bytes written, and optionally swallows
+ * flush() requests.
+ *
+ * @contributor gojomo
+ */
+public class MiserOutputStream extends FilterOutputStream {
+    protected long count;
+    protected boolean passFlushes;
+
+    /**
+     * Creates a byte-counting wrapper that forwards every flush() request.
+     *
+     * @param out the stream that receives the bytes
+     */
+    public MiserOutputStream(OutputStream out) {
+        this(out, true);
+    }
+
+    /**
+     * Creates a byte-counting wrapper around {@code out}.
+     * @param out the stream that receives the bytes
+     * @param passFlushes {@code false} to swallow flush() requests until close()
+     */
+    public MiserOutputStream(OutputStream out, boolean passFlushes) {
+        super(out);
+        this.passFlushes = passFlushes;
+    }
+
+    /** Total number of bytes accepted by the write methods so far. */
+    public long getCount() {
+        return this.count;
+    }
+
+    @Override public void write(byte[] b, int off, int len) throws IOException {
+        this.out.write(b, off, len);
+        this.count = this.count + len;
+    }
+
+    @Override public void write(int b) throws IOException {
+        this.out.write(b);
+        this.count += 1;
+    }
+
+    /** Re-enables flushing before closing, so any flush issued by super.close() is forwarded. */
+    @Override public void close() throws IOException {
+        this.passFlushes = true;
+        super.close();
+    }
+
+    @Override public void flush() throws IOException {
+        if (!this.passFlushes) {
+            return; // flush request swallowed
+        }
+        super.flush();
+    }
+}
diff --git a/src/main/java/org/archive/io/NoGzipMagicException.java b/src/main/java/org/archive/io/NoGzipMagicException.java
new file mode 100644
index 00000000..27d1058a
--- /dev/null
+++ b/src/main/java/org/archive/io/NoGzipMagicException.java
@@ -0,0 +1,26 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+/** Empty subclass that adds nothing to its parent; it only exists under the {@code org.archive.io} name.
+ * @deprecated use {@link org.archive.util.zip.NoGzipMagicException}
+ */
+@Deprecated
+public class NoGzipMagicException extends org.archive.util.zip.NoGzipMagicException {
+}
diff --git a/src/main/java/org/archive/io/ObjectPlusFilesInputStream.java b/src/main/java/org/archive/io/ObjectPlusFilesInputStream.java
new file mode 100644
index 00000000..892860ed
--- /dev/null
+++ b/src/main/java/org/archive/io/ObjectPlusFilesInputStream.java
@@ -0,0 +1,143 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.util.Iterator;
+import java.util.LinkedList;
+
+import org.archive.util.FileUtils;
+
+
+/**
+ * Enhanced ObjectInputStream with support for restoring
+ * files that had been saved, in parallel with object
+ * serialization.
+ *
+ * @author gojomo
+ *
+ */
+public class ObjectPlusFilesInputStream extends ObjectInputStream {
+    protected LinkedList<File> auxiliaryDirectoryStack = new LinkedList<File>();
+    protected LinkedList<Runnable> postRestoreTasks = new LinkedList<Runnable>();
+
+    /**
+     * Instantiate over the given stream and using the supplied
+     * auxiliary storage directory.
+     *
+     * @param in stream holding the serialized objects
+     * @param storeDir first auxiliary directory placed on the stack
+     * @throws IOException if the ObjectInputStream cannot be initialized on {@code in}
+     */
+    public ObjectPlusFilesInputStream(InputStream in, File storeDir)
+            throws IOException {
+        super(in);
+        auxiliaryDirectoryStack.addFirst(storeDir);
+    }
+
+    /**
+     * Push another default storage directory for use
+     * until popped.
+     *
+     * @param dir name resolved against the current auxiliary directory
+     */
+    public void pushAuxiliaryDirectory(String dir) {
+        File parent = getAuxiliaryDirectory();
+        auxiliaryDirectoryStack.addFirst(new File(parent, dir));
+    }
+
+    /**
+     * Discard the top auxiliary directory.
+     */
+    public void popAuxiliaryDirectory() {
+        auxiliaryDirectoryStack.removeFirst();
+    }
+
+    /**
+     * Return the top auxiliary directory, from
+     * which saved files are restored.
+     *
+     * @return Auxiliary directory.
+     */
+    public File getAuxiliaryDirectory() {
+        return auxiliaryDirectoryStack.getFirst();
+    }
+
+    /**
+     * Restore a file from storage, using the name and length
+     * info on the serialization stream and the file from the
+     * current auxiliary directory, to the given File.
+     *
+     * @param destination file that receives the restored content
+     * @throws IOException if the name/length cannot be read or the copy fails
+     */
+    public void restoreFile(File destination) throws IOException {
+        String nameAsStored = readUTF();
+        long lengthAtStoreTime = readLong();
+        File storedFile = new File(getAuxiliaryDirectory(), nameAsStored);
+        FileUtils.copyFile(storedFile, destination, lengthAtStoreTime);
+    }
+
+    /**
+     * Restore a file from storage, using the name and length
+     * info on the serialization stream and the file from the
+     * current auxiliary directory, into the given directory under its stored name.
+     *
+     * @param directory directory in which the restored file is created
+     * @throws IOException if the name/length cannot be read or the copy fails
+     */
+    public void restoreFileTo(File directory) throws IOException {
+        String nameAsStored = readUTF();
+        long lengthAtStoreTime = readLong();
+        File storedFile = new File(getAuxiliaryDirectory(), nameAsStored);
+        File destination = new File(directory, nameAsStored);
+        FileUtils.copyFile(storedFile, destination, lengthAtStoreTime);
+    }
+
+    /**
+     * Register a task to be done when the ObjectPlusFilesInputStream
+     * is closed.
+     *
+     * @param task work to run from close()
+     */
+    public void registerFinishTask(Runnable task) {
+        postRestoreTasks.addFirst(task);
+    }
+
+    private void doFinishTasks() {
+        // registerFinishTask() prepends, so the most recently registered task runs first
+        for (Runnable task : postRestoreTasks) {
+            task.run();
+        }
+    }
+
+    /**
+     * In addition to default, do any registered cleanup tasks.
+     *
+     * @see java.io.InputStream#close()
+     */
+    @Override public void close() throws IOException {
+        super.close();
+        doFinishTasks();
+    }
+}
diff --git a/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java b/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java
new file mode 100644
index 00000000..224f24e7
--- /dev/null
+++ b/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java
@@ -0,0 +1,134 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.ObjectOutputStream;
+import java.io.OutputStream;
+import java.util.LinkedList;
+
+import org.archive.util.FileUtils;
+
+
+/**
+ * Enhanced ObjectOutputStream which maintains (a stack of) auxiliary
+ * directories and offers convenience methods for serialized objects
+ * to save their related disk files alongside their serialized version.
+ *
+ * @author gojomo
+ */
+public class ObjectPlusFilesOutputStream extends ObjectOutputStream {
+    protected LinkedList<File> auxiliaryDirectoryStack = new LinkedList<File>();
+
+    /**
+     * Constructor
+     *
+     * @param out stream that receives the serialized objects
+     * @param topDirectory first auxiliary directory placed on the stack
+     * @throws java.io.IOException if the ObjectOutputStream cannot be initialized on {@code out}
+     */
+    public ObjectPlusFilesOutputStream(OutputStream out, File topDirectory) throws IOException {
+        super(out);
+        auxiliaryDirectoryStack.addFirst(topDirectory);
+    }
+
+    /**
+     * Add another subdirectory for any file-capture needs during the
+     * current serialization.
+     *
+     * @param dir name resolved against the current auxiliary directory
+     */
+    public void pushAuxiliaryDirectory(String dir) {
+        auxiliaryDirectoryStack.addFirst(new File(getAuxiliaryDirectory(), dir));
+    }
+
+    /**
+     * Remove the top subdirectory.
+     *
+     */
+    public void popAuxiliaryDirectory() {
+        auxiliaryDirectoryStack.removeFirst();
+    }
+
+    /**
+     * Return the current auxiliary directory for storing
+     * files associated with serialized objects.
+     *
+     * @return Auxiliary directory.
+     */
+    public File getAuxiliaryDirectory() {
+        return auxiliaryDirectoryStack.getFirst();
+    }
+
+    /**
+     * Store a snapshot of an object's supporting file to the
+     * current auxiliary directory. Should only be used for
+     * files which are strictly appended-to, because it tries
+     * to use a "hard link" where possible (meaning that
+     * future edits to the original file's contents will
+     * also affect the snapshot).
+     *
+     * Remembers current file extent to allow a future restore
+     * to ignore subsequent appended data.
+     *
+     * @param file file to snapshot; its name and current length are written to this stream
+     * @throws IOException if writing to the stream or creating the snapshot fails
+     */
+    public void snapshotAppendOnlyFile(File file) throws IOException {
+        // write filename
+        String name = file.getName();
+        writeUTF(name);
+        // write current file length
+        writeLong(file.length());
+        File auxDir = getAuxiliaryDirectory();
+        if (!auxDir.exists()) {
+            FileUtils.ensureWriteableDirectory(auxDir);
+        }
+        File destination = new File(auxDir, name);
+        hardlinkOrCopy(file, destination);
+    }
+
+    /**
+     * Back up {@code file} at {@code destination}: first try a hard link via
+     * the external {@code ln} command, then fall back to a copy when linking
+     * is unavailable (command missing, unsupported, or different volumes).
+     *
+     * @param file file to back up
+     * @param destination where the link or copy is created
+     * @throws IOException if the copy fails or the wait for {@code ln} is interrupted
+     */
+    private void hardlinkOrCopy(File file, File destination) throws IOException {
+        // Try a hard link first (Linux/UNIX). TODO NTFS also supports hard links; add appropriate try
+        // The argument array, unlike a single command string, keeps paths containing spaces intact.
+        String[] cmd = {"ln", file.getAbsolutePath(), destination.getAbsolutePath()};
+        boolean linked = false;
+        try {
+            linked = Runtime.getRuntime().exec(cmd).waitFor() == 0;
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt(); // outcome of 'ln' unknown: report instead of copying
+            throw new java.io.InterruptedIOException("interrupted while hard-linking " + file);
+        } catch (IOException ignored) {
+            // 'ln' could not be started (e.g. not installed): fall through to the copy
+        }
+        if (!linked) {
+            FileUtils.copyFile(file, destination);
+        }
+    }
+
+}
diff --git a/src/main/java/org/archive/io/OriginSeekInputStream.java b/src/main/java/org/archive/io/OriginSeekInputStream.java
new file mode 100644
index 00000000..00605d82
--- /dev/null
+++ b/src/main/java/org/archive/io/OriginSeekInputStream.java
@@ -0,0 +1,121 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+import java.io.IOException;
+
+
+/**
+ * Alters the origin of some other SeekInputStream. This class allows you
+ * to completely ignore everything in the underlying stream before a specified
+ * position, the origin position.
+ *
+ * <p>With the exception of {@link #position()} and {@link #position(long)},
+ * all of the methods in this class simply delegate to the underlying input
+ * stream. The position methods adjust the position of the
+ * underlying stream relative to the origin specified at construction time.
+ *
+ * @author pjack
+ */
+public class OriginSeekInputStream extends SeekInputStream {
+
+
+    /**
+     * Stream that every read, skip and seek is forwarded to.
+     */
+    final private SeekInputStream delegate;
+
+
+    /**
+     * Offset in the delegate that this stream presents as position 0:
+     * this.position(0) resolves to delegate.position(base).
+     */
+    final private long base;
+
+
+    /**
+     * Wraps {@code input} and immediately moves it to {@code origin}.
+     *
+     * @param input the underlying stream
+     * @param origin the origin position
+     * @throws IOException if an IO error occurs
+     */
+    public OriginSeekInputStream(SeekInputStream input, long origin)
+            throws IOException {
+        this.delegate = input;
+        this.base = origin;
+        this.delegate.position(this.base);
+    }
+
+
+    @Override
+    public int available() throws IOException {
+        return this.delegate.available();
+    }
+
+
+    @Override
+    public int read() throws IOException {
+        return this.delegate.read();
+    }
+
+
+    @Override
+    public int read(byte[] buf, int ofs, int len) throws IOException {
+        return this.delegate.read(buf, ofs, len);
+    }
+
+
+    @Override
+    public int read(byte[] buf) throws IOException {
+        return this.delegate.read(buf);
+    }
+
+
+    @Override
+    public long skip(long count) throws IOException {
+        return this.delegate.skip(count);
+    }
+
+
+    /**
+     * Reports how far the underlying stream currently is past the origin.
+     * @return the underlying position minus the origin
+     * @throws IOException if an IO error occurs
+     */
+    public long position() throws IOException {
+        long absolute = this.delegate.position();
+        return absolute - this.base;
+    }
+
+
+    /**
+     * Moves the underlying stream to a position given relative to the origin.
+     * That is, position(0) on this stream puts the underlying stream at the
+     * origin supplied to the constructor, position(1) at origin + 1,
+     * and so on.
+     *
+     * @param p the new position for this stream
+     * @throws IOException if an IO error occurs
+     */
+    public void position(long p) throws IOException {
+        this.delegate.position(this.base + p);
+    }
+}
diff --git a/src/main/java/org/archive/io/Preformatter.java b/src/main/java/org/archive/io/Preformatter.java
new file mode 100644
index 00000000..dcd31bb6
--- /dev/null
+++ b/src/main/java/org/archive/io/Preformatter.java
@@ -0,0 +1,32 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.util.logging.LogRecord;
+
+/**
+ * Interface indicating a logging Formatter can preformat a record (outside
+ * the standard-implementation synchronized block) and cache it, returning it
+ * for the next request for formatting from the same thread.
+ * @contributor gojomo
+ */
+public interface Preformatter {
+    void preformat(LogRecord record); // format the record ahead of time and cache the result
+    void clear(); // presumably discards the cached result -- confirm with implementations
+}
diff --git a/src/main/java/org/archive/io/RandomAccessInputStream.java b/src/main/java/org/archive/io/RandomAccessInputStream.java
new file mode 100644
index 00000000..d8dd260b
--- /dev/null
+++ b/src/main/java/org/archive/io/RandomAccessInputStream.java
@@ -0,0 +1,180 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+
+
+/**
+ * Wraps a RandomAccessFile with an InputStream interface.
+ *
+ * @author gojomo
+ */
+public class RandomAccessInputStream extends SeekInputStream {
+
+    /**
+     * Reference to the random access file this stream is reading from.
+     */
+    private final RandomAccessFile raf;
+
+    /**
+     * When mark is called, save here the current position so we can go back
+     * on reset. -1 means there is no usable mark.
+     */
+    private long markpos = -1;
+
+    /**
+     * True if we are to close the underlying random access file when this
+     * stream is closed.
+     */
+    private final boolean sympathyClose;
+
+    /**
+     * Constructor.
+     *
+     * If using this constructor, the caller created the RAF and is therefore
+     * assumed to want control over closing it: closing this stream does
+     * not call RAF.close.
+     *
+     * @param raf RandomAccessFile to wrap.
+     * @throws IOException if construction fails
+     */
+    public RandomAccessInputStream(RandomAccessFile raf)
+            throws IOException {
+        this(raf, false, 0);
+    }
+
+    /**
+     * Constructor.
+     *
+     * @param file File to get RAFIS on. Creates an RAF from passed file.
+     * Closes the created RAF when this stream is closed.
+     * @throws IOException if the file cannot be opened for reading
+     */
+    public RandomAccessInputStream(final File file)
+            throws IOException {
+        this(new RandomAccessFile(file, "r"), true, 0);
+    }
+
+    /**
+     * Constructor.
+     *
+     * @param file File to get RAFIS on. Creates an RAF from passed file.
+     * Closes the created RAF when this stream is closed.
+     * @param offset position to seek to; no seek is done unless it is greater than zero
+     * @throws IOException if the file cannot be opened or the seek fails
+     */
+    public RandomAccessInputStream(final File file, final long offset)
+            throws IOException {
+        this(new RandomAccessFile(file, "r"), true, offset);
+    }
+
+    /**
+     * @param raf RandomAccessFile to wrap.
+     * @param sympathyClose Set to true if we are to close the RAF
+     * file when this stream is closed.
+     * @param offset position to seek to; no seek is done unless it is greater than zero
+     * @throws IOException if the seek fails
+     */
+    public RandomAccessInputStream(final RandomAccessFile raf,
+            final boolean sympathyClose, final long offset)
+            throws IOException {
+        super();
+        this.sympathyClose = sympathyClose;
+        this.raf = raf;
+        if (offset > 0) {
+            this.raf.seek(offset);
+        }
+    }
+
+    /* (non-Javadoc)
+     * @see java.io.InputStream#read()
+     */
+    public int read() throws IOException {
+        return this.raf.read();
+    }
+
+    /* (non-Javadoc)
+     * @see java.io.InputStream#read(byte[], int, int)
+     */
+    public int read(byte[] b, int off, int len) throws IOException {
+        return this.raf.read(b, off, len);
+    }
+
+    /* (non-Javadoc)
+     * @see java.io.InputStream#read(byte[])
+     */
+    public int read(byte[] b) throws IOException {
+        return this.raf.read(b);
+    }
+
+    /* NOTE: seeks n past the current file pointer and always reports n; the
+     * target is not checked against the file length. @see java.io.InputStream#skip(long)
+     */
+    public long skip(long n) throws IOException {
+        this.raf.seek(this.raf.getFilePointer() + n);
+        return n;
+    }
+
+    public long position() throws IOException {
+        return this.raf.getFilePointer();
+    }
+
+    public void position(long position) throws IOException {
+        this.raf.seek(position);
+    }
+
+    public int available() throws IOException {
+        long amount = Math.max(0L, this.raf.length() - this.position()); // never negative past end-of-file
+        return (amount >= Integer.MAX_VALUE) ? Integer.MAX_VALUE : (int) amount;
+    }
+
+    public boolean markSupported() {
+        return true;
+    }
+
+    public synchronized void mark(int readlimit) {
+        try {
+            this.markpos = position();
+        } catch (IOException e) {
+            // Set markpos to -1 so that a later reset() fails with an exception.
+            this.markpos = -1;
+        }
+    }
+
+    public synchronized void reset() throws IOException {
+        if (this.markpos == -1) {
+            throw new IOException("Mark has not been set.");
+        }
+        position(this.markpos);
+    }
+
+    public void close() throws IOException {
+        try {
+            super.close();
+        } finally {
+            if (this.sympathyClose) {
+                this.raf.close();
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/RandomAccessOutputStream.java b/src/main/java/org/archive/io/RandomAccessOutputStream.java
new file mode 100644
index 00000000..225f995f
--- /dev/null
+++ b/src/main/java/org/archive/io/RandomAccessOutputStream.java
@@ -0,0 +1,69 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.RandomAccessFile;
+
+
+/**
+ * Wraps a RandomAccessFile with OutputStream interface.
+ *
+ * @author gojomo
+ */
+public class RandomAccessOutputStream extends OutputStream {
+    protected RandomAccessFile raf;
+
+    /**
+     * Creates a stream whose output goes to {@code raf}.
+     */
+    public RandomAccessOutputStream(RandomAccessFile raf) {
+        super();
+        this.raf = raf;
+    }
+
+    /**
+     * Writes one byte to the file.
+     */
+    @Override public void write(int b) throws IOException {
+        this.raf.write(b);
+    }
+
+    /**
+     * Writes the whole array to the file.
+     */
+    @Override public void write(byte[] b) throws IOException {
+        this.raf.write(b);
+    }
+
+    /**
+     * Writes {@code len} bytes of {@code b}, starting at index {@code off}.
+     */
+    @Override public void write(byte[] b, int off, int len) throws IOException {
+        this.raf.write(b, off, len);
+    }
+
+    /**
+     * Closes the wrapped file.
+     */
+    @Override public void close() throws IOException {
+        this.raf.close();
+    }
+}
diff --git a/src/main/java/org/archive/io/ReadSource.java b/src/main/java/org/archive/io/ReadSource.java
new file mode 100644
index 00000000..a3c29967
--- /dev/null
+++ b/src/main/java/org/archive/io/ReadSource.java
@@ -0,0 +1,37 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.Reader;
+
+/**
+ * Interface for objects that can provide a Reader view of their
+ * contents.
+ *
+ */
+public interface ReadSource {
+ /**
+ * Returns a Reader over this object's contents. Deliberately not named
+ * 'getReader', so that bean-convention introspection tools do not treat
+ * it as a simple costless read-only property.
+ * @return a Reader on this object
+ */
+ Reader obtainReader();
+}
diff --git a/src/main/java/org/archive/io/RecorderIOException.java b/src/main/java/org/archive/io/RecorderIOException.java
new file mode 100644
index 00000000..07b30061
--- /dev/null
+++ b/src/main/java/org/archive/io/RecorderIOException.java
@@ -0,0 +1,38 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.IOException;
+
+/**
+ * IOException subtype that the other Recorder*Exception classes in this package extend.
+ * @author Gordon Mohr
+ */
+public class RecorderIOException extends IOException {
+
+ private static final long serialVersionUID = 5907470275350314277L;
+
+ public RecorderIOException() {
+ super();
+ }
+
+ public RecorderIOException(String msg) {
+ super(msg);
+ }
+}
diff --git a/src/main/java/org/archive/io/RecorderLengthExceededException.java b/src/main/java/org/archive/io/RecorderLengthExceededException.java
new file mode 100644
index 00000000..8c3e067d
--- /dev/null
+++ b/src/main/java/org/archive/io/RecorderLengthExceededException.java
@@ -0,0 +1,39 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+/**
+ * Indicates a length exception thrown by the Recorder.
+ * Being a {@link RecorderIOException}, it can also be caught as a plain IOException.
+ * @author Gordon Mohr
+ */
+public class RecorderLengthExceededException
+extends RecorderIOException {
+
+ private static final long serialVersionUID = 6655419033414648444L;
+
+ public RecorderLengthExceededException() {
+ super();
+ }
+
+ public RecorderLengthExceededException(String msg) {
+ super(msg);
+ }
+}
diff --git a/src/main/java/org/archive/io/RecorderTimeoutException.java b/src/main/java/org/archive/io/RecorderTimeoutException.java
new file mode 100644
index 00000000..32be5b5d
--- /dev/null
+++ b/src/main/java/org/archive/io/RecorderTimeoutException.java
@@ -0,0 +1,37 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+/**
+ * Indicates a timeout thrown by the RecordingInputStream.
+ * Being a {@link RecorderIOException}, it can also be caught as a plain IOException.
+ * @author Gordon Mohr
+ */
+public class RecorderTimeoutException extends RecorderIOException {
+
+ private static final long serialVersionUID = 7433214063765078269L;
+
+ public RecorderTimeoutException() {
+ super();
+ }
+
+ public RecorderTimeoutException(String msg) {
+ super(msg);
+ }
+}
diff --git a/src/main/java/org/archive/io/RecorderTooMuchHeaderException.java b/src/main/java/org/archive/io/RecorderTooMuchHeaderException.java
new file mode 100644
index 00000000..23f5d264
--- /dev/null
+++ b/src/main/java/org/archive/io/RecorderTooMuchHeaderException.java
@@ -0,0 +1,40 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+/**
+ * Indicates that too much header material was seen by the Recorder
+ * (specifically the RecordingOutputStream).
+ *
+ * @author Gordon Mohr
+ */
+public class RecorderTooMuchHeaderException extends RecorderIOException {
+
+    /** Serialization version identifier. */
+    private static final long serialVersionUID = 3528516034898129150L;
+
+    /** Creates the exception without a detail message. */
+    public RecorderTooMuchHeaderException() {
+        // the no-arg superclass constructor is invoked implicitly
+    }
+
+    /**
+     * Creates the exception with a detail message.
+     *
+     * @param msg text handed to the superclass constructor
+     */
+    public RecorderTooMuchHeaderException(String msg) {
+        super(msg);
+    }
+}
diff --git a/src/main/java/org/archive/io/RecordingInputStream.java b/src/main/java/org/archive/io/RecordingInputStream.java
new file mode 100644
index 00000000..b46905ed
--- /dev/null
+++ b/src/main/java/org/archive/io/RecordingInputStream.java
@@ -0,0 +1,355 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.SocketException;
+import java.net.SocketTimeoutException;
+import java.security.MessageDigest;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.commons.io.IOUtils;
+
+
+/**
+ * Stream which records all data read from it, which it acquires from a wrapped
+ * input stream.
+ *
+ * Makes use of a RecordingOutputStream for recording because of its being
+ * file backed so we can write massive amounts of data w/o worrying about
+ * overflowing memory.
+ *
+ * @author gojomo
+ *
+ */
+public class RecordingInputStream
+ extends InputStream {
+
+ protected static Logger logger =
+ Logger.getLogger("org.archive.io.RecordingInputStream");
+
+ /**
+ * Where we are recording to.
+ */
+ private RecordingOutputStream recordingOutputStream;
+
+ /**
+ * Stream to record.
+ */
+ private InputStream in = null;
+
+ /**
+ * Reusable buffer to avoid reallocation on each readFullyUntil
+ */
+ protected byte[] drainBuffer = new byte[16*1024];
+
+    /**
+     * Create a new RecordingInputStream.
+     *
+     * Only the internal RecordingOutputStream is built here; no stream is
+     * wrapped until {@link #open(InputStream)} is called.
+     *
+     * @param bufferSize Size of buffer to use.
+     * @param backingFilename Name of backing file.
+     */
+    public RecordingInputStream(int bufferSize, String backingFilename) {
+        RecordingOutputStream recorder =
+            new RecordingOutputStream(bufferSize, backingFilename);
+        this.recordingOutputStream = recorder;
+    }
+
+    /**
+     * Begin recording reads from the given stream.
+     *
+     * @param wrappedStream stream whose bytes will be read and recorded
+     * @throws IOException if this stream is already open, or if opening the
+     * internal recording stream fails (this stream is then closed again
+     * before the exception is rethrown)
+     */
+    public void open(InputStream wrappedStream) throws IOException {
+        // Only build the message (which calls wrappedStream.toString()) when
+        // FINE logging is enabled, matching what close() does.
+        if (logger.isLoggable(Level.FINE)) {
+            logger.fine(Thread.currentThread().getName() + " opening " +
+                wrappedStream + ", " + Thread.currentThread().getName());
+        }
+        if (isOpen()) {
+            // error; should not be opening/wrapping while an earlier
+            // stream remains open
+            throw new IOException("RIS already open for "
+                + Thread.currentThread().getName());
+        }
+        try {
+            this.in = wrappedStream;
+            this.recordingOutputStream.open();
+        } catch (IOException ioe) {
+            close(); // ...and rethrow...
+            throw ioe;
+        }
+    }
+
+    /**
+     * Read one byte from the wrapped stream, recording it unless it is the
+     * end-of-stream marker.
+     *
+     * @return the byte read, or -1 at end of stream
+     * @throws IOException if this stream is not open or the read fails
+     */
+    public int read() throws IOException {
+        if (!isOpen()) {
+            throw new IOException("Stream closed " +
+                Thread.currentThread().getName());
+        }
+        final int value = this.in.read();
+        if (value == -1) {
+            // end of stream: nothing to record
+            return value;
+        }
+        assert this.recordingOutputStream != null: "ROS is null " +
+            Thread.currentThread().getName();
+        this.recordingOutputStream.write(value);
+        return value;
+    }
+
+    /**
+     * Read up to {@code len} bytes into {@code b} starting at {@code off},
+     * recording whatever was actually read.
+     *
+     * @param b destination array
+     * @param off offset in {@code b} at which to store bytes
+     * @param len maximum number of bytes to read
+     * @return the count reported by the wrapped stream
+     * @throws IOException if this stream is not open or the read fails
+     */
+    public int read(byte[] b, int off, int len) throws IOException {
+        if (!isOpen()) {
+            throw new IOException("Stream closed " +
+                Thread.currentThread().getName());
+        }
+        final int got = this.in.read(b, off, len);
+        if (got <= 0) {
+            // EOF or nothing read: nothing to record
+            return got;
+        }
+        assert this.recordingOutputStream != null: "ROS is null " +
+            Thread.currentThread().getName();
+        this.recordingOutputStream.write(b, off, got);
+        return got;
+    }
+
+    /**
+     * Read into {@code b} from the wrapped stream, recording whatever was
+     * actually read.
+     *
+     * @param b destination array
+     * @return the count reported by the wrapped stream
+     * @throws IOException if this stream is not open or the read fails
+     */
+    public int read(byte[] b) throws IOException {
+        if (!isOpen()) {
+            throw new IOException("Stream closed " +
+                Thread.currentThread().getName());
+        }
+        final int got = this.in.read(b);
+        if (got <= 0) {
+            // EOF or nothing read: nothing to record
+            return got;
+        }
+        assert this.recordingOutputStream != null: "ROS is null " +
+            Thread.currentThread().getName();
+        this.recordingOutputStream.write(b, 0, got);
+        return got;
+    }
+
+    /**
+     * Quietly close the wrapped stream and the internal recording stream,
+     * and forget the wrapped stream so that {@link #isOpen()} reports false.
+     */
+    public void close() throws IOException {
+        if (logger.isLoggable(Level.FINE)) {
+            final String threadName = Thread.currentThread().getName();
+            logger.fine(threadName + " closing " + this.in + ", " + threadName);
+        }
+        IOUtils.closeQuietly(this.in);
+        this.in = null;
+        IOUtils.closeQuietly(this.recordingOutputStream);
+    }
+
+    /**
+     * @return the replay stream produced by the internal
+     * RecordingOutputStream's getReplayInputStream()
+     * @throws IOException if the recording stream fails to create it
+     */
+    public ReplayInputStream getReplayInputStream() throws IOException {
+        return this.recordingOutputStream.getReplayInputStream();
+    }
+
+    /**
+     * @return the replay stream produced by the internal
+     * RecordingOutputStream's getMessageBodyReplayInputStream()
+     * @throws IOException if the recording stream fails to create it
+     */
+    public ReplayInputStream getMessageBodyReplayInputStream() throws IOException {
+        return this.recordingOutputStream.getMessageBodyReplayInputStream();
+    }
+
+    /**
+     * Drain the wrapped stream to its end, recording everything read.
+     *
+     * @return the value of the recording stream's getSize() after draining
+     * @throws IOException if this stream is not open or a read fails
+     */
+    public long readFully() throws IOException {
+        int got;
+        do {
+            // Empty out stream; the data itself is captured by read().
+            got = read(drainBuffer);
+        } while (got != -1);
+        return this.recordingOutputStream.getSize();
+    }
+
+    /**
+     * Read all of a stream (or read until the soft maximum is reached or a
+     * limit enforced by the recording stream is hit).
+     *
+     * The hard length, overall timeout and rate limits are not parameters of
+     * this method; they are whatever was configured through
+     * {@link #setLimits(long, long, long)} and are checked by the internal
+     * RecordingOutputStream. Returns immediately if this stream is not open.
+     *
+     * @param softMaxLength Maximum length to read; if zero or negative, then
+     * no limit. If met, return normally.
+     * @throws IOException failed read.
+     * @throws RecorderLengthExceededException if the configured hard maximum
+     * length is exceeded
+     * @throws RecorderTimeoutException if the configured overall timeout is
+     * exceeded
+     * @throws InterruptedException if the thread is found to be interrupted
+     * after a read or after a socket timeout
+     */
+    public void readFullyOrUntil(long softMaxLength)
+        throws IOException, RecorderLengthExceededException,
+            RecorderTimeoutException, InterruptedException {
+        // Check we're open before proceeding.
+        if (!isOpen()) {
+            // TODO: should this be a noisier exception-raising error?
+            return;
+        }
+
+        long totalBytes = 0L;
+        while (true) {
+            try {
+                // read no more than soft max
+                long maxToRead = (softMaxLength <= 0)
+                    ? drainBuffer.length
+                    : Math.min(drainBuffer.length, softMaxLength - totalBytes);
+                // nor more than hard max
+                maxToRead = Math.min(maxToRead, recordingOutputStream.getRemainingLength());
+                // but always at least 1 (to trigger hard max exception)
+                maxToRead = Math.max(maxToRead, 1);
+
+                int bytesRead = read(drainBuffer, 0, (int) maxToRead);
+                if (bytesRead == -1) {
+                    break;
+                }
+                totalBytes += bytesRead;
+
+                if (Thread.interrupted()) {
+                    throw new InterruptedException("Interrupted during IO");
+                }
+            } catch (SocketTimeoutException e) {
+                // A socket timeout is just a transient problem, meaning
+                // nothing was available in the configured timeout period,
+                // but something else might become available later.
+                // Take this opportunity to check the overall
+                // timeout (below). One reason for this timeout is
+                // servers that keep up the connection, 'keep-alive', even
+                // though we asked them to not keep the connection open.
+                if (logger.isLoggable(Level.FINE)) {
+                    logger.log(Level.FINE, "socket timeout", e);
+                }
+                // check for interrupt
+                if (Thread.interrupted()) {
+                    throw new InterruptedException("Interrupted during IO");
+                }
+                // check for overall timeout
+                recordingOutputStream.checkLimits();
+            } catch (SocketException se) {
+                throw se;
+            } catch (NullPointerException e) {
+                // [ 896757 ] NPEs in Andy's Th-Fri Crawl.
+                // A crawl was showing NPE's in this part of the code but can
+                // not reproduce. Adding this rethrowing catch block w/
+                // diagnostics to help should we come across the problem in the
+                // future.
+                NullPointerException diagnostic = new NullPointerException(
+                    "Stream " + this.in + ", " +
+                    e.getMessage() + " " + Thread.currentThread().getName());
+                // keep the original exception (and its stack trace) attached
+                diagnostic.initCause(e);
+                throw diagnostic;
+            }
+
+            // if have read 'enough', just finish
+            if (softMaxLength > 0 && totalBytes >= softMaxLength) {
+                break; // return
+            }
+        }
+    }
+
+    /**
+     * @return the value of the internal recording stream's getSize()
+     */
+    public long getSize() {
+        return this.recordingOutputStream.getSize();
+    }
+
+    /**
+     * Tell the internal recording stream that the message body begins at
+     * the current position (calls its markMessageBodyBegin()).
+     */
+    public void markContentBegin() {
+        this.recordingOutputStream.markMessageBodyBegin();
+    }
+
+    /**
+     * @return the value of the internal recording stream's
+     * getMessageBodyBegin()
+     */
+    public long getContentBegin() {
+        return this.recordingOutputStream.getMessageBodyBegin();
+    }
+
+    /**
+     * Calls startDigest() on the internal recording stream.
+     */
+    public void startDigest() {
+        this.recordingOutputStream.startDigest();
+    }
+
+    /**
+     * Convenience method for setting SHA1 digest.
+     */
+    public void setSha1Digest() {
+        this.recordingOutputStream.setSha1Digest();
+    }
+
+    /**
+     * Sets a digest algorithm which may be applied to recorded data.
+     * As usually only a subset of the recorded data should
+     * be fed to the digest, you must also call startDigest()
+     * to begin digesting.
+     *
+     * @param algorithm name of the digest algorithm, passed through to the
+     * internal recording stream
+     */
+    public void setDigest(String algorithm) {
+        this.recordingOutputStream.setDigest(algorithm);
+    }
+
+    /**
+     * Sets a digest function which may be applied to recorded data.
+     * As usually only a subset of the recorded data should
+     * be fed to the digest, you must also call startDigest()
+     * to begin digesting.
+     *
+     * @param md digest instance, passed through to the internal recording
+     * stream
+     */
+    public void setDigest(MessageDigest md) {
+        this.recordingOutputStream.setDigest(md);
+    }
+
+    /**
+     * Return the digest value for any recorded, digested data. Call
+     * only after all data has been recorded; otherwise, the running
+     * digest state is ruined.
+     *
+     * @return the digest final value
+     */
+    public byte[] getDigestValue() {
+        return this.recordingOutputStream.getDigestValue();
+    }
+
+    /**
+     * @return the value of the internal recording stream's
+     * getResponseContentLength()
+     */
+    public long getResponseContentLength() {
+        return this.recordingOutputStream.getResponseContentLength();
+    }
+
+    /**
+     * Calls closeRecorder() on the internal recording stream; the wrapped
+     * input stream is not touched.
+     *
+     * @throws IOException if the recording stream fails to close
+     */
+    public void closeRecorder() throws IOException {
+        this.recordingOutputStream.closeRecorder();
+    }
+
+    /**
+     * @return True if we've been opened, i.e. a wrapped stream is currently
+     * set (it is assigned in open() and cleared in close()).
+     */
+    public boolean isOpen()
+    {
+        return this.in != null;
+    }
+
+    /**
+     * Mark the current position in both the wrapped stream and the
+     * recording. Does nothing when this stream is not open, instead of
+     * failing with a NullPointerException.
+     *
+     * @param readlimit passed through to the wrapped stream's mark()
+     */
+    @Override
+    public synchronized void mark(int readlimit) {
+        if (!isOpen()) {
+            return;
+        }
+        this.in.mark(readlimit);
+        this.recordingOutputStream.mark();
+    }
+
+    /**
+     * @return whether the wrapped stream supports mark/reset; false when
+     * this stream is not open
+     */
+    @Override
+    public boolean markSupported() {
+        return isOpen() && this.in.markSupported();
+    }
+
+    /**
+     * Reset both the wrapped stream and the recording to the last mark.
+     *
+     * @throws IOException if this stream is not open (same message as the
+     * read methods) or the wrapped stream cannot be reset
+     */
+    @Override
+    public synchronized void reset() throws IOException {
+        if (!isOpen()) {
+            throw new IOException("Stream closed " +
+                Thread.currentThread().getName());
+        }
+        this.in.reset();
+        this.recordingOutputStream.reset();
+    }
+
+    /**
+     * Set limits to be enforced by internal recording-out
+     *
+     * @param hardMax passed as the length limit to the recording stream
+     * @param timeoutMs passed as the time limit to the recording stream
+     * @param maxRateKBps passed as the rate limit to the recording stream
+     */
+    public void setLimits(long hardMax, long timeoutMs, long maxRateKBps) {
+        recordingOutputStream.setLimits(hardMax, timeoutMs, maxRateKBps);
+    }
+
+    /**
+     * Expose the amount of in-memory buffering used by the internal
+     * recording stream.
+     * @return int buffer size
+     */
+    public int getRecordedBufferLength() {
+        return recordingOutputStream.getBufferLength();
+    }
+
+    /**
+     * Calls clearForReuse() on the internal recording stream; the wrapped
+     * input stream reference is not changed here.
+     *
+     * @throws IOException if the recording stream fails while clearing
+     */
+    public void clearForReuse() throws IOException {
+        recordingOutputStream.clearForReuse();
+    }
+}
diff --git a/src/main/java/org/archive/io/RecordingOutputStream.java b/src/main/java/org/archive/io/RecordingOutputStream.java
new file mode 100644
index 00000000..4d0713da
--- /dev/null
+++ b/src/main/java/org/archive/io/RecordingOutputStream.java
@@ -0,0 +1,576 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+
+/**
+ * An output stream that records all writes to wrapped output
+ * stream.
+ *
+ * A RecordingOutputStream can be wrapped around any other
+ * OutputStream to record all bytes written to it. You can
+ * then request a ReplayInputStream to read those bytes.
+ *
+ *
+ * <p>The RecordingOutputStream uses an in-memory buffer and
+ * backing disk file to allow it to record streams of
+ * arbitrary length limited only by available disk space.
+ *
+ *
+ * <p>As long as the stream recorded is smaller than the
+ * in-memory buffer, no disk access will occur.
+ *
+ *
+ * <p>Recorded content can be recovered as a ReplayInputStream
+ * (via getReplayInputStream() or, for only the content after
+ * the message-body-begin mark, getMessageBodyReplayInputStream() )
+ * or as a ReplayCharSequence (via getReplayCharSequence()).
+ *
+ *
+ * <p>This class is also used as a straight output stream
+ * by {@link RecordingInputStream} to which it records all reads.
+ * {@link RecordingInputStream} is exploiting the file backed buffer
+ * facility of this class passing <code>null</code> for the stream
+ * to wrap. TODO: Make a FileBackedOutputStream class that is
+ * subclassed by RecordingInputStream.
+ *
+ * @author gojomo
+ *
+ */
+public class RecordingOutputStream extends OutputStream {
+ protected static Logger logger =
+ Logger.getLogger(RecordingOutputStream.class.getName());
+
+ /**
+ * Size of recording.
+ *
+ * Later passed to ReplayInputStream on creation. It uses it to know when
+ * EOS.
+ */
+ protected long size = 0;
+
+ protected String backingFilename;
+ protected OutputStream diskStream = null;
+
+ /**
+ * Buffer we write recordings to.
+ *
+ * We write all recordings here first till its full. Thereafter we
+ * write the backing file.
+ */
+ private byte[] buffer;
+
+ /** current virtual position in the recording */
+ private long position;
+
+ /** flag to disable recording */
+ private boolean recording;
+
+ /**
+ * Reusable buffer for FastBufferedOutputStream
+ */
+ protected byte[] bufStreamBuf =
+ new byte [ FastBufferedOutputStream.DEFAULT_BUFFER_SIZE ];
+
+ /**
+ * True if we're to digest content.
+ */
+ private boolean shouldDigest = false;
+
+ /**
+ * Digest instance.
+ */
+ private MessageDigest digest = null;
+
+ /**
+ * Name of the SHA1 algorithm.
+ */
+ private static final String SHA1 = "SHA1";
+
+ /**
+ * Maximum amount of header material to accept without the content
+ * body beginning -- if more, throw a RecorderTooMuchHeaderException.
+ * TODO: make configurable? make smaller?
+ */
+ protected static final long MAX_HEADER_MATERIAL = 1024*1024; // 1MB
+
+ // configurable max length, max time limits
+ /** maximum length of material to record before throwing exception */
+ protected long maxLength = Long.MAX_VALUE;
+ /** maximum time to record before throwing exception */
+ protected long timeoutMs = Long.MAX_VALUE;
+ /** maximum rate to record (adds delays to hit target rate) */
+ protected long maxRateBytesPerMs = Long.MAX_VALUE;
+ /** time recording begins for timeout, rate calculations */
+ protected long startTime = Long.MAX_VALUE;
+
+ /**
+ * When recording HTTP, where the content-body starts.
+ */
+ protected long messageBodyBeginMark;
+
+ /**
+ * Stream to record.
+ */
+ private OutputStream out = null;
+
+ // mark/reset support
+ /** furthest position reached before any reset()s */
+ private long maxPosition = 0;
+ /** remembered position to reset() to */
+ private long markPosition = 0;
+
+    /**
+     * Create a new RecordingOutputStream.
+     *
+     * Allocates the in-memory buffer and remembers the backing file name;
+     * the backing file itself is not opened here.
+     *
+     * @param bufferSize Buffer size to use.
+     * @param backingFilename Name of backing file to use.
+     */
+    public RecordingOutputStream(int bufferSize, String backingFilename) {
+        this.recording = true;
+        this.backingFilename = backingFilename;
+        this.buffer = new byte[bufferSize];
+    }
+
+    /**
+     * Open this stream for recording only, with no wrapped stream to pass
+     * data along to (equivalent to {@code open(null)}).
+     *
+     * @throws IOException If failed creation of backing file.
+     */
+    public void open() throws IOException {
+        this.open(null);
+    }
+
+    /**
+     * Wrap the given stream, both recording and passing along any data written
+     * to this RecordingOutputStream. Recording state is cleared first and the
+     * start time used for timeout and rate checks is set to now.
+     *
+     * @param wrappedStream Stream to wrap. May be null for case where we
+     * want to write to a file backed stream only.
+     *
+     * @throws IOException If failed creation of backing file, or if this
+     * stream is already open.
+     */
+    public void open(OutputStream wrappedStream) throws IOException {
+        if (isOpen()) {
+            // error; should not be opening/wrapping while an earlier
+            // stream remains open
+            throw new IOException("ROS already open for "
+                + Thread.currentThread().getName());
+        }
+        clearForReuse();
+        this.out = wrappedStream;
+        if (this.diskStream == null) {
+            // TODO: Fix so we only make file when its actually needed.
+            this.diskStream = new RecyclingFastBufferedOutputStream(
+                new FileOutputStream(this.backingFilename), bufStreamBuf);
+        }
+        startTime = System.currentTimeMillis();
+    }
+
+    /**
+     * Record a single byte and pass it along to the wrapped stream, if any.
+     * A byte written while the position is behind the furthest position
+     * already reached (i.e. after a reset()) only advances the position.
+     *
+     * @param b byte to write
+     * @throws IOException on a failed write, or if a configured limit is hit
+     */
+    public void write(int b) throws IOException {
+        if (position < maxPosition) {
+            // revisiting previous content; do nothing but advance position
+            position++;
+            return;
+        }
+        if (recording) {
+            record(b);
+        }
+        if (this.out != null) {
+            this.out.write(b);
+        }
+        checkLimits();
+    }
+
+    /**
+     * Record a byte-array range and pass it along to the wrapped stream, if
+     * any. The part of the range that lies behind the furthest position
+     * already reached (i.e. after a reset()) only advances the position.
+     *
+     * @param b source array
+     * @param off offset of the first byte to write
+     * @param len number of bytes to write
+     * @throws IOException on a failed write, or if a configured limit is hit
+     */
+    public void write(byte[] b, int off, int len) throws IOException {
+        if (position < maxPosition) {
+            if (position + len <= maxPosition) {
+                // revisiting; do nothing but advance position
+                position += len;
+                return;
+            }
+            // consume part of the array doing nothing but advancing position
+            long consumeRange = maxPosition - position;
+            position += consumeRange;
+            off += consumeRange;
+            len -= consumeRange;
+        }
+        if (recording) {
+            record(b, off, len);
+        }
+        if (this.out != null) {
+            this.out.write(b, off, len);
+        }
+        checkLimits();
+    }
+
+    /**
+     * Check any enforced limits.
+     *
+     * Order of checks: header size (only while no message-body mark is set),
+     * total length, elapsed time since open, then rate. Exceeding the rate
+     * does not throw; the calling thread sleeps long enough to bring the
+     * average rate back down to the configured maximum.
+     *
+     * @throws RecorderTooMuchHeaderException if more than
+     * MAX_HEADER_MATERIAL bytes were seen before the message body was marked
+     * @throws RecorderLengthExceededException if the position exceeds the
+     * configured maximum length
+     * @throws RecorderTimeoutException if the configured timeout has elapsed
+     */
+    protected void checkLimits() throws RecorderIOException {
+        // too much material before finding end of headers?
+        if (messageBodyBeginMark < 0) {
+            // no mark yet
+            if (position > MAX_HEADER_MATERIAL) {
+                throw new RecorderTooMuchHeaderException();
+            }
+        }
+        // overlong?
+        if (position > maxLength) {
+            throw new RecorderLengthExceededException();
+        }
+        // taking too long?
+        long duration = System.currentTimeMillis() - startTime;
+        duration = Math.max(duration, 1); // !divzero
+        if (duration > timeoutMs) {
+            throw new RecorderTimeoutException();
+        }
+        // need to throttle reading to hit max configured rate?
+        if (position / duration > maxRateBytesPerMs) {
+            long desiredDuration = position / maxRateBytesPerMs;
+            try {
+                Thread.sleep(desiredDuration - duration);
+            } catch (InterruptedException e) {
+                logger.log(Level.WARNING,
+                    "bandwidth throttling sleep interrupted", e);
+                // Restore the interrupt flag so callers that poll
+                // Thread.interrupted() still see the interruption.
+                Thread.currentThread().interrupt();
+            }
+        }
+    }
+
+    /**
+     * Record the given byte for later recovery
+     *
+     * The byte goes into the in-memory buffer while there is room, and to
+     * the backing disk stream once the position has passed the buffer's end.
+     *
+     * @param b Int to record.
+     *
+     * @exception IOException Failed write to backing file, or no backing
+     * disk stream is open when one is needed.
+     */
+    private void record(int b) throws IOException {
+        if (this.shouldDigest) {
+            this.digest.update((byte)b);
+        }
+        if (this.position >= this.buffer.length) {
+            // Its possible to call write w/o having first opened a
+            // stream. Fail the same way tailRecord() does instead of
+            // relying on an assert that is disabled by default.
+            if (this.diskStream == null) {
+                throw new IOException("diskstream is null");
+            }
+            this.diskStream.write(b);
+        } else {
+            this.buffer[(int) this.position] = (byte) b;
+        }
+        this.position++;
+    }
+
+    /**
+     * Record the given byte-array range for recovery later
+     *
+     * The range is fed to the digest first (when digesting is switched on)
+     * and then stored via tailRecord().
+     *
+     * @param b Buffer to record.
+     * @param off Offset into buffer at which to start recording.
+     * @param len Length of buffer to record.
+     *
+     * @exception IOException Failed write to backing file.
+     */
+    private void record(byte[] b, int off, int len) throws IOException {
+        if(this.shouldDigest) {
+            assert this.digest != null: "Digest is null.";
+            this.digest.update(b, off, len);
+        }
+        tailRecord(b, off, len);
+    }
+
+    /**
+     * Record without digesting.
+     *
+     * Bytes fill the in-memory buffer first; whatever does not fit is
+     * written to the backing disk stream.
+     *
+     * @param b Buffer to record.
+     * @param off Offset into buffer at which to start recording.
+     * @param len Length of buffer to record.
+     *
+     * @exception IOException Failed write to backing file.
+     */
+    private void tailRecord(byte[] b, int off, int len) throws IOException {
+        int from = off;
+        int remaining = len;
+        if (this.position < this.buffer.length) {
+            // in-memory part
+            assert this.buffer != null: "Buffer is null";
+            int toCopy = (int)Math.min(this.buffer.length - this.position, len);
+            assert b != null: "Passed buffer is null";
+            System.arraycopy(b, off, this.buffer, (int)this.position, toCopy);
+            this.position += toCopy;
+            if (toCopy >= len) {
+                // everything fit in memory
+                return;
+            }
+            from += toCopy;
+            remaining -= toCopy;
+        }
+        // on-disk part
+        // TODO: Its possible to call write w/o having first opened a
+        // stream. Lets protect ourselves against this.
+        if (this.diskStream == null) {
+            throw new IOException("diskstream is null");
+        }
+        this.diskStream.write(b, from, remaining);
+        this.position += remaining;
+    }
+
+    /**
+     * Close the wrapped stream (if any) and then the recorder.
+     *
+     * The recorder is closed even when closing the wrapped stream throws, so
+     * the backing disk stream is not left open.
+     *
+     * @throws IOException if closing the wrapped stream or the recorder fails
+     */
+    public void close() throws IOException {
+        if (messageBodyBeginMark < 0) {
+            // if unset, consider 0 posn as content-start
+            // (so that a -1 never survives to replay step)
+            messageBodyBeginMark = 0;
+        }
+        try {
+            if (this.out != null) {
+                try {
+                    this.out.close();
+                } finally {
+                    this.out = null;
+                }
+            }
+        } finally {
+            closeRecorder();
+        }
+    }
+
+    /**
+     * Close and forget the backing disk stream, if one is open.
+     *
+     * @throws IOException if closing the disk stream fails
+     */
+    protected synchronized void closeDiskStream()
+        throws IOException {
+        if (this.diskStream != null) {
+            this.diskStream.close();
+            this.diskStream = null;
+        }
+    }
+
+    /**
+     * Stop recording: switch recording off, close the backing disk stream
+     * and, if no size has been set yet, fix the recorded size at the
+     * current position. The wrapped stream is not touched.
+     *
+     * @throws IOException if closing the disk stream fails
+     */
+    public void closeRecorder() throws IOException {
+        recording = false;
+        closeDiskStream(); // if any
+        // This setting of size is important. Its passed to ReplayInputStream
+        // on creation. It uses it to know EOS.
+        if (this.size == 0) {
+            this.size = this.position;
+        }
+    }
+
+    /* (non-Javadoc)
+     * @see java.io.OutputStream#flush()
+     *
+     * Flushes the wrapped stream and the backing disk stream, whichever
+     * of them are currently set.
+     */
+    public void flush() throws IOException {
+        if (this.out != null) {
+            this.out.flush();
+        }
+        if (this.diskStream != null) {
+            this.diskStream.flush();
+        }
+    }
+
+    /**
+     * Return a replay stream positioned at the start of the recording.
+     *
+     * @throws IOException
+     * @return a replay stream over the recorded data
+     */
+    public ReplayInputStream getReplayInputStream() throws IOException {
+        return getReplayInputStream(0);
+    }
+
+    /**
+     * Return a replay stream after asking it to skip the given number of
+     * bytes.
+     *
+     * @param skip number of bytes passed to the replay stream's skip()
+     * @throws IOException
+     * @return a replay stream over the recorded data
+     */
+    public ReplayInputStream getReplayInputStream(long skip) throws IOException {
+        // If this method is being called, then assumption must be that the
+        // stream is closed. If it ain't, then the stream gotten won't work
+        // -- the size will zero so any attempt at a read will get back EOF.
+        assert this.out == null: "Stream is still open.";
+        ReplayInputStream replay = new ReplayInputStream(this.buffer,
+            this.size, this.messageBodyBeginMark, this.backingFilename);
+        // NOTE(review): the value returned by skip() is ignored -- confirm
+        // that ReplayInputStream.skip always skips the full amount.
+        replay.skip(skip);
+        return replay;
+    }
+
+    /**
+     * Return a replay stream, cued up to beginning of content
+     *
+     * @throws IOException
+     * @return An RIS.
+     */
+    public ReplayInputStream getMessageBodyReplayInputStream() throws IOException {
+        return getReplayInputStream(this.messageBodyBeginMark);
+    }
+
+    /**
+     * @return the recorded size; this field is reset to 0 by
+     * clearForReuse() and assigned from the position in closeRecorder()
+     */
+    public long getSize() {
+        return this.size;
+    }
+
+    /**
+     * Remember the current position as the start of the "message
+     * body". Useful when recording HTTP traffic as a way to start
+     * replays after the headers. Also starts digesting, if a digest
+     * has been set.
+     */
+    public void markMessageBodyBegin() {
+        this.messageBodyBeginMark = this.position;
+        startDigest();
+    }
+
+    /**
+     * Return stored message-body-begin-mark (which is also end-of-headers)
+     */
+    public long getMessageBodyBegin() {
+        return this.messageBodyBeginMark;
+    }
+
+    /**
+     * Starts digesting recorded data, if a MessageDigest has been
+     * set. The digest is reset first, so anything it held before is
+     * discarded.
+     */
+    public void startDigest() {
+        if (this.digest != null) {
+            this.digest.reset();
+            this.shouldDigest = true;
+        }
+    }
+
+    /**
+     * Convenience method for setting SHA1 digest.
+     * @see #setDigest(String)
+     */
+    public void setSha1Digest() {
+        setDigest(SHA1);
+    }
+
+
+    /**
+     * Sets a digest function which may be applied to recorded data.
+     * The difference between calling this method and {@link #setDigest(MessageDigest)}
+     * is that this method tries to reuse the MessageDigest instance if one is
+     * already allocated and of the requested algorithm.
+     *
+     * If the algorithm is not available the failure is logged and the
+     * previously set digest (if any) is left in place.
+     *
+     * @param algorithm Message digest algorithm to use.
+     * @see #setDigest(MessageDigest)
+     */
+    public void setDigest(String algorithm) {
+        try {
+            // Reuse extant digest if it already uses the requested algorithm.
+            if (this.digest == null ||
+                    !this.digest.getAlgorithm().equals(algorithm)) {
+                setDigest(MessageDigest.getInstance(algorithm));
+            }
+        } catch (NoSuchAlgorithmException e) {
+            // report through the class logger rather than stderr
+            logger.log(Level.SEVERE,
+                "digest algorithm not available: " + algorithm, e);
+        }
+    }
+
+    /**
+     * Sets a digest function which may be applied to recorded data.
+     *
+     * As usually only a subset of the recorded data should
+     * be fed to the digest, you must also call startDigest()
+     * to begin digesting.
+     *
+     * @param md Message digest function to use.
+     */
+    public void setDigest(MessageDigest md) {
+        this.digest = md;
+    }
+
+    /**
+     * Return the digest value for any recorded, digested data. Call
+     * only after all data has been recorded; otherwise, the running
+     * digest state is ruined.
+     *
+     * @return the digest final value, or null if no digest has been set
+     */
+    public byte[] getDigestValue() {
+        if(this.digest == null) {
+            return null;
+        }
+        return this.digest.digest();
+    }
+
+    /**
+     * @return the recorded size minus the message-body-begin mark
+     */
+    public long getResponseContentLength() {
+        return this.size - this.messageBodyBeginMark;
+    }
+
+    /**
+     * @return True if this ROS is open, which here means a wrapped stream
+     * is set; after open() with a null wrapped stream this stays false.
+     */
+    public boolean isOpen() {
+        return this.out != null;
+    }
+
+    /**
+     * @return length of the in-memory recording buffer
+     */
+    public int getBufferLength() {
+        return this.buffer.length;
+    }
+
+    /**
+     * When used alongside a mark-supporting RecordingInputStream, remember
+     * a position reachable by a future reset().
+     */
+    public void mark() {
+        // remember this position for subsequent reset()
+        this.markPosition = position;
+    }
+
+    /**
+     * When used alongside a mark-supporting RecordingInputStream, reset
+     * the position to that saved by previous mark(). Until the position
+     * again reaches "new" material, none of the bytes pushed to this
+     * stream will be digested or recorded.
+     */
+    public void reset() {
+        // take note of furthest-position-reached to avoid double-recording
+        maxPosition = Math.max(maxPosition, position);
+        // reset to previous position
+        position = markPosition;
+    }
+
+    /**
+     * Set limits on length, time, and rate to enforce.
+     * A zero or negative argument disables the corresponding limit.
+     *
+     * @param length maximum position before checkLimits() throws
+     * RecorderLengthExceededException
+     * @param milliseconds maximum time since open before checkLimits()
+     * throws RecorderTimeoutException
+     * @param rateKBps maximum rate; stored internally as bytes per
+     * millisecond, computed as rateKBps*1024/1000 in integer arithmetic
+     */
+    public void setLimits(long length, long milliseconds, long rateKBps) {
+        maxLength = (length>0) ? length : Long.MAX_VALUE;
+        timeoutMs = (milliseconds>0) ? milliseconds : Long.MAX_VALUE;
+        maxRateBytesPerMs = (rateKBps>0) ? rateKBps*1024/1000 : Long.MAX_VALUE;
+    }
+
+    /**
+     * Reset limits to effectively-unlimited defaults
+     */
+    public void resetLimits() {
+        maxLength = Long.MAX_VALUE;
+        timeoutMs = Long.MAX_VALUE;
+        maxRateBytesPerMs = Long.MAX_VALUE;
+    }
+
+    /**
+     * Return number of bytes that could be recorded without hitting
+     * length limit
+     *
+     * @return long byte count (maximum length minus current position)
+     */
+    public long getRemainingLength() {
+        return maxLength - position;
+    }
+
+    /**
+     * Return this stream to its pre-open state: forget the wrapped stream,
+     * zero the position, mark and size bookkeeping, unset the message-body
+     * mark, switch recording on and digesting off, and close any open
+     * backing disk stream. Configured limits and the digest instance are
+     * left as they are.
+     *
+     * @throws IOException if closing the disk stream fails
+     */
+    public void clearForReuse() throws IOException {
+        this.out = null;
+
+        // position bookkeeping
+        this.size = 0;
+        this.position = 0;
+        this.maxPosition = 0;
+        this.markPosition = 0;
+        this.messageBodyBeginMark = -1;
+
+        // Always begins false; must use startDigest() to begin
+        this.shouldDigest = false;
+        // ensure recording turned on
+        this.recording = true;
+
+        if (this.diskStream != null) {
+            closeDiskStream();
+        }
+    }
+}
+
diff --git a/src/main/java/org/archive/io/RecoverableIOException.java b/src/main/java/org/archive/io/RecoverableIOException.java
new file mode 100644
index 00000000..5ce2251a
--- /dev/null
+++ b/src/main/java/org/archive/io/RecoverableIOException.java
@@ -0,0 +1,83 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.IOException;
+import java.io.PrintStream;
+import java.io.PrintWriter;
+
+/**
+ * A decorator on IOException for IOEs that are likely not fatal or at least
+ * merit retry.
+ * @author stack
+ * @version $Date$, $Revision$
+ */
+public class RecoverableIOException extends IOException {
+    private static final long serialVersionUID = 6194776587381865451L;
+
+    /** The exception every overridden Throwable method forwards to. */
+    private final IOException decoratedIOException;
+
+    /**
+     * Decorate a fresh IOException carrying the given message.
+     *
+     * @param message message for the decorated IOException
+     */
+    public RecoverableIOException(final String message) {
+        this(new IOException(message));
+    }
+
+    /**
+     * Decorate the given exception.
+     *
+     * @param ioe exception to forward to
+     */
+    public RecoverableIOException(final IOException ioe) {
+        decoratedIOException = ioe;
+    }
+
+    // ---- message and cause: answered by the decorated exception ----
+
+    @Override
+    public String getMessage() {
+        return decoratedIOException.getMessage();
+    }
+
+    @Override
+    public String getLocalizedMessage() {
+        return decoratedIOException.getLocalizedMessage();
+    }
+
+    @Override
+    public Throwable getCause() {
+        return decoratedIOException.getCause();
+    }
+
+    @Override
+    public synchronized Throwable initCause(Throwable cause) {
+        return decoratedIOException.initCause(cause);
+    }
+
+    @Override
+    public String toString() {
+        return decoratedIOException.toString();
+    }
+
+    // ---- stack trace: answered by the decorated exception ----
+
+    @Override
+    public StackTraceElement[] getStackTrace() {
+        return decoratedIOException.getStackTrace();
+    }
+
+    @Override
+    public void setStackTrace(StackTraceElement[] stackTrace) {
+        decoratedIOException.setStackTrace(stackTrace);
+    }
+
+    @Override
+    public void printStackTrace() {
+        decoratedIOException.printStackTrace();
+    }
+
+    @Override
+    public void printStackTrace(PrintStream s) {
+        decoratedIOException.printStackTrace(s);
+    }
+
+    @Override
+    public void printStackTrace(PrintWriter s) {
+        decoratedIOException.printStackTrace(s);
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/RecyclingFastBufferedOutputStream.java b/src/main/java/org/archive/io/RecyclingFastBufferedOutputStream.java
new file mode 100644
index 00000000..a3b76e46
--- /dev/null
+++ b/src/main/java/org/archive/io/RecyclingFastBufferedOutputStream.java
@@ -0,0 +1,37 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
+
+import java.io.OutputStream;
+
+/**
+ * FastBufferedOutputStream that accepts a passed-in buffer (avoiding
+ * reallocation).
+ */
+public class RecyclingFastBufferedOutputStream extends FastBufferedOutputStream {
+    /**
+     * Wrap the given stream, buffering through the caller-supplied array.
+     *
+     * @param os stream to write to
+     * @param buffer array installed as this stream's buffer
+     */
+    public RecyclingFastBufferedOutputStream( final OutputStream os, final byte[] buffer ) {
+        super(os);
+        // swap in the caller's array for the superclass's inherited
+        // buffer field
+        this.buffer = buffer;
+        // NOTE(review): presumably 'avail' is the superclass's count of free
+        // bytes in the buffer -- confirm against fastutil's
+        // FastBufferedOutputStream.
+        avail = buffer.length;
+    }
+}
+
+
diff --git a/src/main/java/org/archive/io/ReplayCharSequence.java b/src/main/java/org/archive/io/ReplayCharSequence.java
new file mode 100644
index 00000000..aa9b9587
--- /dev/null
+++ b/src/main/java/org/archive/io/ReplayCharSequence.java
@@ -0,0 +1,77 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+
+import com.google.common.base.Charsets;
+
+
+/**
+ * CharSequence interface with addition of a {@link #close()} method.
+ *
+ * Users of implementations of this interface must call {@link #close()} so
+ * implementations get a chance at cleaning up after themselves.
+ *
+ * @author stack
+ * @version $Revision$, $Date$
+ */
+public interface ReplayCharSequence extends CharSequence, Closeable {
+
+ /** charset to use in replay when declared value
+ * is absent/illegal/unavailable */
+ public Charset FALLBACK_CHARSET = Charsets.ISO_8859_1; // TODO: should this be UTF-8?
+
+ /**
+ * Call this method when done so implementation has chance to clean up
+ * resources.
+ *
+ * @throws IOException Problem cleaning up file system resources.
+ */
+ public void close() throws IOException;
+
+ /**
+ * Report count of decoder errors silently eaten during ReplayCharSequence
+ * use. May be less than the number of individual decoding anomalies in
+ * underlying content (if decoding method doesn't allow counting individual
+ * errors).
+ */
+ public long getDecodeExceptionCount();
+
+ /**
+ * Return the first coding-exception encountered, if the count > 0.
+ * @return CharacterCodingException
+ */
+ public CharacterCodingException getCodingException();
+
+ /**
+ * @return false if {@link #close()} has been called
+ */
+ public boolean isOpen();
+
+ /**
+ * Return the effective Charset used to create this CharSequence from
+ * (raw byte) source material.
+ */
+ public Charset getCharset();
+}
diff --git a/src/main/java/org/archive/io/ReplayInputStream.java b/src/main/java/org/archive/io/ReplayInputStream.java
new file mode 100644
index 00000000..fccf5fd3
--- /dev/null
+++ b/src/main/java/org/archive/io/ReplayInputStream.java
@@ -0,0 +1,325 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.archive.util.ArchiveUtils;
+import org.archive.util.FileUtils;
+
+
+/**
+ * Replays the bytes recorded from a RecordingInputStream or
+ * RecordingOutputStream.
+ *
+ * This InputStream supports mark and reset.
+ *
+ * @author gojomo
+ */
+public class ReplayInputStream extends SeekInputStream
+{
+ private static final int DEFAULT_BUFFER_SIZE = 256*1024; // 256KiB
+ private BufferedSeekInputStream diskStream;
+ private byte[] buffer;
+ private long position;
+
+ /**
+ * Total size of stream content.
+ *
+ * Size of data to replay.
+ */
+ private long size = -1;
+
+ /**
+ * Where the response body starts, if marked
+ */
+ protected long responseBodyStart = -1;
+
+
+ /**
+ * Constructor.
+ *
+ * @param buffer Buffer to read from.
+ * @param size Size of data to replay.
+ * @param responseBodyStart Start of the response body.
+ * @param backingFilename Backing file that sits behind the buffer. If
+ * size > than buffer then we go to backing file to read
+ * data that is beyond buffer.length.
+ *
+ * @throws IOException If we fail to open an input stream on
+ * backing file.
+ */
+ public ReplayInputStream(byte[] buffer, long size, long responseBodyStart,
+ String backingFilename)
+ throws IOException
+ {
+ this(buffer, size, backingFilename);
+ this.responseBodyStart = responseBodyStart;
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param buffer Buffer to read from.
+ * @param size Size of data to replay.
+ * @param backingFilename Backing file that sits behind the buffer. If
+ * size > than buffer then we go to backing file to read
+ * data that is beyond buffer.length.
+ * @throws IOException If we fail to open an input stream on
+ * backing file.
+ */
+ public ReplayInputStream(byte[] buffer, long size, String backingFilename)
+ throws IOException
+ {
+ this.buffer = buffer;
+ this.size = size;
+ if (size > buffer.length) {
+ setupDiskStream(new File(backingFilename));
+ }
+ }
+
+ protected void setupDiskStream(File backingFile) throws IOException {
+ RandomAccessInputStream rais = new RandomAccessInputStream(backingFile);
+ diskStream = new BufferedSeekInputStream(rais, 4096);
+ }
+
+ protected File backingFile;
+
+ /**
+ * Create a ReplayInputStream from the given source stream. Requires
+ * reading the entire stream (and possibly overflowing to a temporary
+ * file). Primary reason for doing so would be to have a repositionable
+ * version of the original stream's contents.
+ *
+ * If created via this constructor, use the destroy() method to ensure
+ * prompt deletion of any associated tmp file when done.
+ *
+ * @param fillStream
+ * @throws IOException
+ */
+ public ReplayInputStream(InputStream fillStream) throws IOException {
+ this.buffer = new byte[DEFAULT_BUFFER_SIZE];
+ long count = ArchiveUtils.readFully(fillStream, buffer);
+ if(fillStream.available()>0) {
+ this.backingFile = File.createTempFile("tid"+Thread.currentThread().getId(), "ris");
+ count += FileUtils.readFullyToFile(fillStream, backingFile);
+ setupDiskStream(backingFile);
+ }
+ this.size = count;
+ }
+
+ /**
+ * Close & destroy any internally-generated temporary files.
+ */
+ public void destroy() {
+ IOUtils.closeQuietly(this);
+ if(backingFile!=null) {
+ FileUtils.deleteSoonerOrLater(backingFile);
+ }
+ }
+
+ public long setToResponseBodyStart() throws IOException {
+ position(responseBodyStart);
+ return this.position;
+ }
+
+
+ /* (non-Javadoc)
+ * @see java.io.InputStream#read()
+ */
+ public int read() throws IOException {
+ if (position == size) {
+ return -1; // EOF
+ }
+ if (position < buffer.length) {
+ // Convert to unsigned int.
+ int c = buffer[(int) position] & 0xFF;
+ position++;
+ return c;
+ }
+ int c = diskStream.read();
+ if (c >= 0) {
+ position++;
+ }
+ return c;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.io.InputStream#read(byte[], int, int)
+ */
+ public int read(byte[] b, int off, int len) throws IOException {
+ if (position == size) {
+ return -1; // EOF
+ }
+ if (position < buffer.length) {
+ int toCopy = (int)Math.min(size - position,
+ Math.min(len, buffer.length - position));
+ System.arraycopy(buffer, (int)position, b, off, toCopy);
+ if (toCopy > 0) {
+ position += toCopy;
+ }
+ return toCopy;
+ }
+ // into disk zone
+ int read = diskStream.read(b,off,len);
+ if(read>0) {
+ position += read;
+ }
+ return read;
+ }
+
+ public void readFullyTo(OutputStream os) throws IOException {
+ byte[] buf = new byte[4096];
+ int c = read(buf);
+ while (c != -1) {
+ os.write(buf,0,c);
+ c = read(buf);
+ }
+ }
+
+ /*
+ * Like 'readFullyTo', but only reads the header-part.
+ * Starts from the beginning each time it is called.
+ */
+ public void readHeaderTo(OutputStream os) throws IOException {
+ position = 0;
+ byte[] buf = new byte[(int)responseBodyStart];
+ int c = read(buf,0,buf.length);
+ if(c != -1) {
+ os.write(buf,0,c);
+ }
+ }
+
+ /*
+ * Like 'readFullyTo', but only reads the content-part.
+ */
+ public void readContentTo(OutputStream os) throws IOException {
+ setToResponseBodyStart();
+ byte[] buf = new byte[4096];
+ int c = read(buf);
+ while (c != -1) {
+ os.write(buf,0,c);
+ c = read(buf);
+ }
+ }
+
+ /**
+  * Copy at most maxSize bytes of content (body start onward) to target stream.
+  * @param os stream to write content to
+  * @param maxSize maximum count of bytes to copy; nothing is copied if <= 0
+  * @throws IOException if repositioning, reading or writing fails
+  */
+ public void readContentTo(OutputStream os, long maxSize) throws IOException {
+     setToResponseBodyStart();
+     byte[] buf = new byte[4096];
+     long left = maxSize; // bytes still allowed out; each read is capped to it
+     while (left > 0) {
+         int c = read(buf, 0, (int) Math.min(buf.length, left));
+         if (c == -1) { break; } // EOF reached before the cap
+         os.write(buf,0,c);
+         left -= c;
+     }
+ }
+
+ /* (non-Javadoc)
+ * @see java.io.InputStream#close()
+ */
+ public void close() throws IOException {
+ super.close();
+ if(diskStream != null) {
+ diskStream.close();
+ }
+ }
+
+ /**
+ * Total size of stream content.
+ * @return Returns the size.
+ */
+ public long getSize()
+ {
+ return size;
+ }
+
+ /**
+ * Total size of header.
+ * @return the size of the header.
+ */
+ public long getHeaderSize()
+ {
+ return responseBodyStart;
+ }
+
+ /**
+ * Total size of content.
+ * @return the size of the content.
+ */
+ public long getContentSize()
+ {
+ return size - responseBodyStart;
+ }
+
+ /**
+ * @return Amount THEORETICALLY remaining (TODO: It's not theoretical
+ * seemingly. The class implementation depends on it being exact).
+ */
+ public long remaining() {
+ return size - position;
+ }
+
+
+ /**
+ * Reposition the stream.
+ *
+ * @param p the new position for this stream
+ * @throws IOException if an IO error occurs
+ */
+ public void position(long p) throws IOException {
+     if (p < 0) {
+         throw new IOException("Negative seek offset.");
+     }
+     if (p > size) {
+         throw new IOException("Desired position exceeds size.");
+     }
+     if (p < buffer.length) {
+         // Only seek file if necessary
+         if (position > buffer.length) {
+             diskStream.position(0);
+         }
+     } else if (diskStream != null) { // null when all content fits in the buffer
+         diskStream.position(p - buffer.length);
+     }
+     this.position = p;
+ }
+
+
+ public long position() throws IOException {
+ return position;
+ }
+
+ protected byte[] getBuffer() {
+ return buffer;
+ }
+}
diff --git a/src/main/java/org/archive/io/RepositionableInputStream.java b/src/main/java/org/archive/io/RepositionableInputStream.java
new file mode 100644
index 00000000..6f885130
--- /dev/null
+++ b/src/main/java/org/archive/io/RepositionableInputStream.java
@@ -0,0 +1,133 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import it.unimi.dsi.fastutil.io.RepositionableStream;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Wrapper around an {@link InputStream} to make a primitive Repositionable
+ * stream. Uses a {@link BufferedInputStream}. Calls mark on every read so
+ * we'll remember at least the last thing read (You can only backup on the
+ * last thing read -- not last 2 or 3 things read). Used by
+ * {@link GzippedInputStream} when reading streams over a network. Wraps a
+ * HTTP, etc., stream so we can back it up if needs be after the
+ * GZIP inflater has done a fill of its full buffer though it only needed
+ * the first few bytes to finish decompressing the current GZIP member.
+ *
+ * TODO: More robust implementation. Tried to use the it.unimi.dsi.io
+ * FastBufferdInputStream but relies on FileChannel ByteBuffers and if not
+ * present -- as would be the case reading from a network stream, the main
+ * application for this instance -- then it expects the underlying stream
+ * implements RepositionableStream interface so chicken or egg problem.
+ * @author stack
+ */
+public class RepositionableInputStream extends BufferedInputStream implements
+ RepositionableStream {
+ private long position = 0;
+ private long markPosition = -1;
+
+ public RepositionableInputStream(InputStream in) {
+ super(in);
+ }
+
+ public RepositionableInputStream(InputStream in, int size) {
+ super(in, size);
+ }
+
+ public int read(byte[] b) throws IOException {
+     // Delegate straight to read(byte[], int, int). The inherited
+     // FilterInputStream.read(byte[]) dispatches to that overridden method,
+     // which already advances 'position'; adding the count here as well
+     // (as this method used to) counted every byte twice.
+     return read(b, 0, b.length);
+ }
+
+ public synchronized int read(byte[] b, int offset, int ct)
+ throws IOException {
+     // Mark the underlying stream so that we'll remember what we are about
+     // to read unless a mark has been set in this RepositionableStream
+     // (We have two levels of mark). In this latter case we want the
+     // underlying stream to preserve its mark position so it aligns with
+     // this RS when reset is called.
+     if (!isMarked()) {
+         super.mark(ct); // read limit is the byte count; 'offset' only indexes into b
+     }
+     int read = super.read(b, offset, ct);
+     if (read != -1) {
+         position += read;
+     }
+     return read;
+ }
+
+ public int read() throws IOException {
+ // Mark the underlying stream so that we'll remember what we are about
+ // to read unless a mark has been set in this RepositionableStream
+ // (We have two levels of mark). In this latter case we want the
+ // underlying stream to preserve its mark position so aligns with
+ // this RS when reset is called.
+ if (!isMarked()) {
+ super.mark(1);
+ }
+ int c = super.read();
+ if (c != -1) {
+ position++;
+ }
+ return c;
+ }
+
+ public void position(final long offset) {
+ if (this.position == offset) {
+ return;
+ }
+ int diff = (int)(offset - this.position);
+ long lowerBound = this.position - this.pos;
+ long upperBound = lowerBound + this.count;
+ if (offset < lowerBound || offset >= upperBound) {
+ throw new IllegalAccessError("Offset goes outside " +
+ "current this.buf (TODO: Do buffer fills if positive)");
+ }
+ this.position = offset;
+ this.pos += diff;
+ // Clear any mark.
+ this.markPosition = -1;
+ }
+
+ public void mark(int readlimit) {
+ this.markPosition = this.position;
+ super.mark(readlimit);
+ }
+
+ public void reset() throws IOException {
+ super.reset();
+ this.position = this.markPosition;
+ this.markPosition = -1;
+ }
+
+ protected boolean isMarked() {
+ return this.markPosition != -1;
+ }
+
+ public long position() {
+ return this.position;
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/SafeSeekInputStream.java b/src/main/java/org/archive/io/SafeSeekInputStream.java
new file mode 100644
index 00000000..0d8f83b1
--- /dev/null
+++ b/src/main/java/org/archive/io/SafeSeekInputStream.java
@@ -0,0 +1,124 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+import java.io.IOException;
+
+
+/**
+ * Enables multiple concurrent streams based on the same underlying stream.
+ *
+ * @author pjack
+ */
+public class SafeSeekInputStream extends SeekInputStream {
+
+
+ /**
+ * The underlying stream.
+ */
+ private SeekInputStream input;
+
+
+ /**
+ * The expected position of the underlying stream.
+ */
+ private long expected;
+
+
+ /**
+ * Constructor. Starts tracking from the given stream's current position
+ * (the stream itself is not repositioned here).
+ *
+ * @param input the underlying input stream
+ * @throws IOException if an IO error occurs
+ */
+ public SafeSeekInputStream(SeekInputStream input) throws IOException {
+ this.input = input;
+ this.expected = input.position();
+ }
+
+
+ /**
+ * Ensures that the underlying stream's position is what we expect to be.
+ *
+ * @throws IOException if an IO error occurs
+ */
+ private void ensure() throws IOException {
+ if (expected != input.position()) {
+ input.position(expected);
+ }
+ }
+
+
+ @Override
+ public int read() throws IOException {
+ ensure();
+ int c = input.read();
+ if (c >= 0) {
+ expected++;
+ }
+ return c;
+ }
+
+
+ @Override
+ public int read(byte[] buf, int ofs, int len) throws IOException {
+ ensure();
+ int r = input.read(buf, ofs, len);
+ if (r > 0) {
+ expected += r;
+ }
+ return r;
+ }
+
+
+ @Override
+ public int read(byte[] buf) throws IOException {
+ ensure();
+ int r = input.read(buf);
+ if (r > 0) {
+ expected += r;
+ }
+ return r;
+ }
+
+
+ @Override
+ public long skip(long c) throws IOException {
+ ensure();
+ long r = input.skip(c);
+ if (r > 0) {
+ expected += r;
+ }
+ return r;
+ }
+
+
+ public void position(long p) throws IOException {
+ input.position(p);
+ expected = p;
+ }
+
+
+ public long position() throws IOException {
+ return expected;
+ }
+
+}
diff --git a/src/main/java/org/archive/io/SeekInputStream.java b/src/main/java/org/archive/io/SeekInputStream.java
new file mode 100644
index 00000000..177724ec
--- /dev/null
+++ b/src/main/java/org/archive/io/SeekInputStream.java
@@ -0,0 +1,81 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+import it.unimi.dsi.fastutil.io.RepositionableStream;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+
+/**
+ * Base class for repositionable input streams.
+ *
+ * @author pjack
+ */
+public abstract class SeekInputStream extends InputStream
+implements RepositionableStream {
+
+
+ /**
+ * The marked file position. A value less than zero
+ * indicates that no mark has been set.
+ */
+ private long mark = -1;
+
+
+ /**
+ * Marks the current position of the stream. The limit parameter is
+ * ignored; the mark will remain valid until reset is called or the
+ * stream is closed.
+ *
+ * @param limit ignored
+ */
+ public void mark(int limit) {
+ try {
+ this.mark = position();
+ } catch (IOException e) {
+ mark = -1;
+ }
+ }
+
+
+ /**
+ * Resets this stream to its marked position.
+ *
+ * @throws IOException if there is no mark, or if an IO error occurs
+ */
+ public void reset() throws IOException {
+ if (mark < 0) {
+ throw new IOException("No mark.");
+ }
+ position(mark);
+ }
+
+
+ /**
+ * Returns true, since SeekInputStreams support mark/reset by default.
+ *
+ * @return true
+ */
+ public boolean markSupported() {
+ return true;
+ }
+}
diff --git a/src/main/java/org/archive/io/SeekReader.java b/src/main/java/org/archive/io/SeekReader.java
new file mode 100644
index 00000000..4abf7847
--- /dev/null
+++ b/src/main/java/org/archive/io/SeekReader.java
@@ -0,0 +1,84 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+
+import java.io.IOException;
+import java.io.Reader;
+
+import it.unimi.dsi.fastutil.io.RepositionableStream;
+
+
+/**
+ * Base class for repositionable readers.
+ *
+ * @author pjack
+ */
+public abstract class SeekReader extends Reader
+implements RepositionableStream {
+
+
+ /**
+ * The marked file position. A value less than zero
+ * indicates that no mark has been set.
+ */
+ private long mark = -1;
+
+
+ /**
+ * Marks the current position of the stream. The limit parameter is
+ * ignored; the mark will remain valid until reset is called or the
+ * stream is closed.
+ *
+ * @param limit ignored
+ */
+ @Override
+ public void mark(int limit) {
+ try {
+ this.mark = position();
+ } catch (IOException e) {
+ mark = -1;
+ }
+ }
+
+
+ /**
+ * Resets this stream to its marked position.
+ *
+ * @throws IOException if there is no mark, or if an IO error occurs
+ */
+ @Override
+ public void reset() throws IOException {
+ if (mark < 0) {
+ throw new IOException("No mark.");
+ }
+ position(mark);
+ }
+
+
+ /**
+ * Returns true, since SeekReaders support mark/reset by default.
+ *
+ * @return true
+ */
+ @Override
+ public boolean markSupported() {
+ return true;
+ }
+}
diff --git a/src/main/java/org/archive/io/SeekReaderCharSequence.java b/src/main/java/org/archive/io/SeekReaderCharSequence.java
new file mode 100644
index 00000000..a9b4880f
--- /dev/null
+++ b/src/main/java/org/archive/io/SeekReaderCharSequence.java
@@ -0,0 +1,56 @@
+package org.archive.io;
+
+import java.io.IOException;
+
+public class SeekReaderCharSequence implements CharSequence {
+
+
+ final private SeekReader reader;
+ final private int size;
+
+
+ public SeekReaderCharSequence(SeekReader reader, int size) {
+ this.reader = reader;
+ this.size = size;
+ }
+
+
+ public int length() {
+ return size;
+ }
+
+
+ public char charAt(int index) {
+     if ((index < 0) || (index >= length())) {
+         throw new IndexOutOfBoundsException(Integer.toString(index));
+     }
+     try {
+         reader.position(index);
+         int r = reader.read();
+         if (r < 0) {
+             throw new IllegalStateException("EOF");
+         }
+         return (char)r; // r IS the char at index; a second read() returned index+1
+     } catch (IOException e) {
+         throw new RuntimeException(e);
+     }
+ }
+
+
+ public CharSequence subSequence(int start, int end) {
+ return new CharSubSequence(this, start, end);
+ }
+
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ try {
+ reader.position(0);
+ for (int ch = reader.read(); ch >= 0; ch = reader.read()) {
+ sb.append((char)ch);
+ }
+ return sb.toString();
+ } catch (IOException e) {
+ throw new IllegalStateException(e);
+ }
+ }
+}
diff --git a/src/main/java/org/archive/io/SinkHandlerLogThread.java b/src/main/java/org/archive/io/SinkHandlerLogThread.java
new file mode 100644
index 00000000..0070785e
--- /dev/null
+++ b/src/main/java/org/archive/io/SinkHandlerLogThread.java
@@ -0,0 +1,34 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+
+/**
+ * Implemented by threads that provide extra information.
+ *
+ * TODO: rename class, rename getCurrentProcessorName()
+ */
+public interface SinkHandlerLogThread {
+
+ String getName();
+ String getCurrentProcessorName();
+ int getSerialNumber();
+
+}
diff --git a/src/main/java/org/archive/io/UTF8Bytes.java b/src/main/java/org/archive/io/UTF8Bytes.java
new file mode 100644
index 00000000..c280b08d
--- /dev/null
+++ b/src/main/java/org/archive/io/UTF8Bytes.java
@@ -0,0 +1,37 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.UnsupportedEncodingException;
+
+/**
+ * Marker Interface for instances that can be serialized as UTF8 bytes.
+ * TODO: Do we need a UTF8Stream Marker Interface?
+ * @author stack
+ * @version $Date$ $Version$
+ */
+public interface UTF8Bytes {
+ public static final String UTF8 = "UTF-8";
+
+ /**
+ * @return Instance as UTF-8 bytes.
+ * @throws UnsupportedEncodingException
+ */
+ public byte [] getUTF8Bytes() throws UnsupportedEncodingException;
+}
diff --git a/src/main/java/org/archive/io/WriterPool.java b/src/main/java/org/archive/io/WriterPool.java
new file mode 100644
index 00000000..2dc385a1
--- /dev/null
+++ b/src/main/java/org/archive/io/WriterPool.java
@@ -0,0 +1,343 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.LinkedList;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+
+/**
+ * Pool of Writers.
+ *
+ * Abstract. Override and pass in the Constructor a factory that creates
+ * {@link WriterPoolMember} implementations.
+ *
+ * @author stack
+ */
+public abstract class WriterPool {
+ private final Logger logger = Logger.getLogger(this.getClass().getName());
+
+ /**
+ * Used to generate unique filename sequences.
+ */
+ final protected AtomicInteger serialNo;
+
+ /**
+ * Default maximum active number of files in the pool.
+ */
+ public static final int DEFAULT_MAX_ACTIVE = 1;
+
+ /** Assumed largest possible value of maxActive; pool will have this
+ * maximum capacity, so dynamic changes beyond this number won't work. */
+ protected static final int LARGEST_MAX_ACTIVE = 255;
+
+ /**
+ * Maximum time to wait on a free file before considering
+ * making a new one (if not already at max); presumably milliseconds, like maxWait
+ */
+ public static final int DEFAULT_MAX_WAIT_FOR_IDLE = 500;
+
+ /**
+ * File settings.
+ * Keep in data structure rather than as individual values.
+ */
+ protected final WriterPoolSettings settings;
+
+ /** maximum number of writers to create at a time */
+ protected int maxActive;
+ /** maximum ms to wait before considering creation of a writer */
+ protected int maxWait;
+ /** current count of active writers; only read/mutated in synchronized blocks */
+ protected int currentActive = 0;
+ /** round-robin (fair FIFO) queue of idle writers awaiting checkout */
+ protected BlockingQueue<WriterPoolMember> availableWriters;
+
+ /** system time when writer was last wanted (because one was not ready in time) */
+ protected long lastWriterNeededTime;
+ /** system time when writer was last 'rolled over' (imminent creation of new file) */
+ protected long lastWriterRolloverTime;
+
+ /**
+ * Constructor. Logs the initial configuration and creates the fair queue of
+ * idle writers with a fixed capacity of LARGEST_MAX_ACTIVE.
+ * @param serial Used to generate unique filename sequences
+ * @param settings Settings for this pool.
+ * @param poolMaximumActive maximum number of writers to create
+ * @param poolMaximumWait maximum ms to wait for an idle writer before considering a new one
+ */
+ public WriterPool(final AtomicInteger serial,
+ final WriterPoolSettings settings,
+ final int poolMaximumActive, final int poolMaximumWait) {
+ logger.info("Initial configuration:" +
+ " prefix=" + settings.getPrefix() +
+ ", template=" + settings.getTemplate() +
+ ", compress=" + settings.getCompress() +
+ ", maxSize=" + settings.getMaxFileSizeBytes() +
+ ", maxActive=" + poolMaximumActive +
+ ", maxWait=" + poolMaximumWait);
+ this.settings = settings;
+ this.maxActive = poolMaximumActive;
+ this.maxWait = poolMaximumWait;
+ availableWriters = new ArrayBlockingQueue<WriterPoolMember>(LARGEST_MAX_ACTIVE, true);
+ this.serialNo = serial;
+ }
+
+ /**
+ * Check out a {@link WriterPoolMember}, blocking until one can be had.
+ *
+ * Waits up to maxWait ms for an idle writer; when none shows up, a new
+ * writer is created if the pool is still below maxActive, otherwise the
+ * wait starts over. Every checkout must be balanced by a call to
+ * {@link #returnFile(WriterPoolMember)} or
+ * {@link #invalidateFile(WriterPoolMember)} else pool starts leaking.
+ *
+ * @return Writer checked out of a pool of files or created
+ * @throws IOException declared for callers; not raised by the code in this method
+ */
+ public WriterPoolMember borrowFile()
+ throws IOException {
+ WriterPoolMember borrowed = null;
+ do {
+ try {
+ borrowed = availableWriters.poll(maxWait,TimeUnit.MILLISECONDS);
+ } catch (InterruptedException ignored) {
+ // interruption is deliberately not propagated; fall through
+ }
+ if(borrowed==null) {
+ borrowed = makeNewWriterIfAppropriate();
+ }
+ } while(borrowed == null);
+ return borrowed;
+ }
+
+ /**
+ * Create a new writer instance, if still below maxActive count.
+ * Remember times to help make later decision when writer should
+ * be discarded.
+ *
+ * @return WriterPoolMember or null if already at max
+ */
+ protected synchronized WriterPoolMember makeNewWriterIfAppropriate() {
+ long now = System.currentTimeMillis();
+ lastWriterNeededTime = now;
+ if(currentActive < maxActive) {
+ currentActive++;
+ lastWriterRolloverTime = now;
+ return makeWriter();
+ }
+ return null;
+ }
+
+ /**
+ * @return new WriterPoolMember of appropriate type (called by makeNewWriterIfAppropriate())
+ */
+ protected abstract WriterPoolMember makeWriter();
+
+ /**
+ * Discard a previously-used writer, cleanly closing it and leaving it out
+ * of the pool. The active count is decremented before close() is attempted.
+ * @param writer writer to retire
+ * @throws IOException if closing the writer fails
+ */
+ public synchronized void destroyWriter(WriterPoolMember writer) throws IOException {
+ currentActive--;
+ writer.close();
+ }
+ /**
+ * Return a writer, for likely reuse unless (1) writer's current file has
+ * reached its target size; and (2) there's been no demand for additional
+ * writers since the last time a new writer-file was rolled-over. In that
+ * case, the possibly-superfluous writer instance is discarded.
+ * @param writer Writer to return to the pool.
+ * @throws IOException Problem returning File to pool.
+ */
+ public void returnFile(WriterPoolMember writer)
+ throws IOException {
+ synchronized(this) {
+ if(writer.isOversize()) {
+ // maybe retire writer rather than recycle
+ if(lastWriterNeededTime<=lastWriterRolloverTime) {
+ // no timeouts waiting for recycled writer since last writer rollover
+ destroyWriter(writer);
+ return;
+ } else {
+ // reuse writer instance, causing new file to be created
+ lastWriterRolloverTime = System.currentTimeMillis();
+ }
+ }
+ }
+ if(!availableWriters.offer(writer)) {
+ logger.log(Level.WARNING, "writer unreturnable to available pool; closing early");
+ destroyWriter(writer);
+ }
+ }
+
+ /**
+ * Close and discard a writer that experienced a potentially-corrupting
+ * error, then rename its file (if it has one) with INVALID_SUFFIX appended.
+ * @param f writer with problem
+ * @throws IOException if destroying the writer fails (original exception kept as cause)
+ */
+ public synchronized void invalidateFile(WriterPoolMember f)
+ throws IOException {
+ try {
+ destroyWriter(f);
+ } catch (Exception e) {
+ throw new IOException(e.getMessage(), e); // convert, but keep the cause
+ }
+ // It'll have been closed. Rename with an '.invalid' suffix so it gets attention.
+ File file = f.getFile();
+ if (file != null && !file.renameTo(new File(file.getAbsoluteFile() +
+ WriterPoolMember.INVALID_SUFFIX))) {
+ logger.warning("Failed to rename " + file + " with invalid-file suffix");
+ }
+ }
+
+ /**
+ * @return Number of {@link WriterPoolMember}s checked out of pool, computed
+ * as currentActive minus the idle count (the declared exception is not thrown here).
+ */
+ public synchronized int getNumActive()
+ throws UnsupportedOperationException {
+ return currentActive - getNumIdle();
+ }
+
+ /**
+ * @return Number of {@link WriterPoolMember} instances still in the pool,
+ * i.e. the size of availableWriters (the declared exception is not thrown here).
+ */
+ public int getNumIdle()
+ throws UnsupportedOperationException {
+ return availableWriters.size();
+ }
+
+ /**
+ * Close all {@link WriterPoolMember}s in pool; close failures are logged, not thrown.
+ */
+ public void close() {
+ Collection<WriterPoolMember> writers = drainAllWriters();
+ for (WriterPoolMember writer: writers) {
+ try {
+ destroyWriter(writer);
+ } catch (IOException e) {
+ logger.log(Level.WARNING,"problem closing writer",e);
+ }
+ }
+ }
+
+ /**
+ * @return the settings object supplied to the constructor
+ */
+ public WriterPoolSettings getSettings() {
+ return this.settings;
+ }
+
+ /**
+ * Summarise the pool for diagnostics, in the form
+ * "Active A of max M, idle I".
+ * @return State of the pool string
+ */
+ protected String getPoolState() {
+ StringBuilder state = new StringBuilder();
+ state.append("Active ").append(getNumActive());
+ state.append(" of max ").append(maxActive);
+ state.append(", idle ").append(getNumIdle());
+ return state.toString();
+ }
+
+ /**
+ * Returns the atomic integer used to generate serial numbers
+ * for files (the instance supplied to the constructor).
+ *
+ * @return the serial number generator
+ */
+ public AtomicInteger getSerialNo() {
+ return serialNo;
+ }
+
+ /**
+ * Drains all the active writers from {@link #availableWriters}, blocking to
+ * wait for any writers currently in use to become available.
+ * NOTE(review): the pool monitor is held while waiting, and returnFile()
+ * also synchronizes on the pool before re-queueing a writer, so a writer
+ * checked out at this moment may be unable to come back; confirm callers.
+ * When finished with writers, call availableWriters.addAll(...) to put them
+ * back into the rotation.
+ * @return all the active writers (fewer if the wait was interrupted)
+ */
+ protected synchronized Collection<WriterPoolMember> drainAllWriters() {
+ LinkedList<WriterPoolMember> writers = new LinkedList<WriterPoolMember>();
+ availableWriters.drainTo(writers);
+
+ while (writers.size() < currentActive) {
+ try {
+ WriterPoolMember w = availableWriters.take();
+ writers.add(w);
+ } catch (InterruptedException e) {
+ logger.severe("caught " + e + " while waiting for writers to free up; returning only "
+ + writers.size() + " of " + currentActive + " active writers");
+ Thread.currentThread().interrupt(); // keep the interrupt visible to callers
+ break;
+ }
+ }
+ return writers;
+ }
+
+ /** Flush every writer obtained from drainAllWriters(), then requeue them; failures are logged only. */
+ public void flush() {
+ Collection<WriterPoolMember> writers = drainAllWriters();
+ for (WriterPoolMember writer: writers) {
+ try {
+ writer.flush();
+ } catch (IOException e) {
+ logger.log(Level.WARNING, "problem flushing writer " + writer, e);
+ }
+ }
+
+ availableWriters.addAll(writers);
+ }
+
+ /** Report each writer's file and position as JSON; writers are re-queued even if building the report fails. */
+ public JSONArray jsonStatus() throws JSONException {
+ Collection<WriterPoolMember> writers = drainAllWriters();
+ try {
+ JSONArray ja = new JSONArray();
+ for (WriterPoolMember w: writers) {
+ ja.put(new JSONObject()
+ .put("file", w.getFile())
+ .put("position", w.getPosition()));
+ }
+ return ja;
+ } finally {
+ availableWriters.addAll(writers);
+ }
+ }
+}
diff --git a/src/main/java/org/archive/io/WriterPoolMember.java b/src/main/java/org/archive/io/WriterPoolMember.java
new file mode 100644
index 00000000..6ea6b295
--- /dev/null
+++ b/src/main/java/org/archive/io/WriterPoolMember.java
@@ -0,0 +1,487 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Properties;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.logging.Logger;
+import java.util.zip.GZIPOutputStream;
+
+import org.archive.util.ArchiveUtils;
+import org.archive.util.FileUtils;
+import org.archive.util.PropertyUtils;
+
+
+
+/**
+ * Member of {@link WriterPool}.
+ * Implements rotating off files, file naming with some guarantee of
+ * uniqueness, and position in file. Subclass to pick up functionality for a
+ * particular Writer type.
+ * @author stack
+ * @version $Date$ $Revision$
+ */
+public abstract class WriterPoolMember implements ArchiveFileConstants {
+ private final Logger logger = Logger.getLogger(this.getClass().getName());
+
+ public static final String UTF8 = "UTF-8";
+
+ /**
+ * Default archival-aggregate filename template.
+ *
+ * Under usual assumptions -- hostnames aren't shared among crawling hosts;
+ * processes have unique PIDs and admin ports; timestamps inside one process
+ * don't repeat (see UniqueTimestampService); clocks are generally
+ * accurate -- will generate a unique name.
+ *
+ * Stands for Internet Archive Heritrix.
+ */
+ public static final String DEFAULT_TEMPLATE =
+ "${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}";
+
+ /**
+ * Default for file prefix.
+ */
+ public static final String DEFAULT_PREFIX = "WEB";
+
+ /**
+ * Reference to file we're currently writing.
+ */
+ protected File f = null;
+
+ /** Output stream for file. */
+ protected OutputStream out = null;
+ /** Counting stream for metering */
+ protected MiserOutputStream countOut = null;
+
+ /** reusable buffer for recycling scenarios */
+ protected byte[] rebuf;
+
+ protected WriterPoolSettings settings; // assigned once, in the constructor
+ private final String extension; // null when the stream-taking constructor is used
+
+ /**
+ * Creation date for the current file.
+ * Set by generateNewBasename(), which {@link #createFile()} calls.
+ */
+ protected String currentTimestamp = "UNSET!!!";
+
+ protected String currentBasename; // result of the last generateNewBasename()
+
+ /**
+ * A running sequence used making unique file names.
+ */
+ final private AtomicInteger serialNo;
+
+ /**
+ * Directories round-robin index; static, and updated in getNextDirectory() without locking.
+ */
+ protected static int roundRobinIndex = 0;
+
+ /**
+ * NumberFormat instance for formatting serial number.
+ * Used inside the synchronized section of generateNewBasename().
+ * Pads serial number with zeros.
+ */
+ protected static NumberFormat serialNoFormatter = new DecimalFormat("00000");
+
+
+ /**
+ * Buffer to reuse writing streams.
+ */
+ protected final byte [] scratchbuffer = new byte[4 * 1024];
+
+
+ /**
+ * Constructor.
+ * Takes a stream. Use with caution. There is no upperbound check on size.
+ * Will just keep writing.
+ *
+ * @param serialNo used to create unique filename sequences
+ * @param out Where to write; wrapped in a MiserOutputStream unless it
+ * already is one.
+ * @param file File the out is connected to.
+ * @param settings pool settings; supplies the frequent-flushes flag for the wrapper
+ * @throws IOException
+ */
+ protected WriterPoolMember(AtomicInteger serialNo,
+ final OutputStream out, final File file,
+ final WriterPoolSettings settings)
+ throws IOException {
+ this(serialNo, settings, null);
+ this.countOut = (out instanceof MiserOutputStream)
+ ? (MiserOutputStream)out
+ : new MiserOutputStream(out, settings.getFrequentFlushes());
+ this.out = this.countOut;
+ this.f = file;
+ }
+
+ /**
+ * Constructor. Only stores its arguments; no file or stream is opened here.
+ *
+ * @param serialNo used to create unique filename sequences
+ * @param settings pool settings consulted later for prefix, template,
+ * compression, maximum size, output directories, flush policy and
+ * write buffer size
+ * @param extension Extension to give file; the other constructor of this
+ * class passes null here
+ *
+ */
+ public WriterPoolMember(AtomicInteger serialNo,
+ final WriterPoolSettings settings, final String extension) {
+ this.settings = settings;
+ this.extension = extension;
+ this.serialNo = serialNo;
+ }
+
+ /**
+ * Call this method just before/after any significant write.
+ *
+ * Call at the end of the writing of a record or just before we start
+ * writing a new record. Will close current file and open a new file
+ * if file size has passed the maximum size.
+ *
+ * Creates and opens a file if none already open. One use of this method
+ * then is after construction, call this method to add the metadata, then
+ * call {@link #getPosition()} to find offset of first record.
+ *
+ * TODO: perhaps this should be called checkForNewOpen? because it also
+ * handles initial open, even when not rolling oversize
+ *
+ * @exception IOException
+ */
+ public void checkSize() throws IOException {
+ if (this.out == null || isOversize()) {
+ createFile();
+ }
+ }
+
+ /** Check if underlying file has already reached its target size.
+ * @return boolean true if file has reached target size and due to be closed
+ */
+ public boolean isOversize() {
+ return settings.getMaxFileSizeBytes() != -1 && (this.getPosition() > settings.getMaxFileSizeBytes());
+ }
+
+ /**
+ * Create a new file: mint a basename, pick the next output directory and
+ * rotate off the current Writer so the new file takes subsequent writes.
+ * Usually called from {@link #checkSize()}.
+ * @return Name of file created.
+ * @throws IOException
+ */
+ protected String createFile() throws IOException {
+ generateNewBasename();
+ String gzSuffix = settings.getCompress() ? DOT_COMPRESSED_FILE_EXTENSION : "";
+ String occupiedName = currentBasename + '.' + this.extension + gzSuffix + OCCUPIED_SUFFIX;
+ File targetDir = getNextDirectory(settings.calcOutputDirs());
+ File target = new File(targetDir, occupiedName);
+ return createFile(target);
+ }
+
+ protected String createFile(final File file) throws IOException { // returns the new file's simple name
+ close(); // finish whatever was open before (no-op when nothing is open)
+ this.f = file;
+ FileOutputStream fos = new FileOutputStream(this.f);
+ if(rebuf==null) {
+ rebuf = new byte[settings.getWriteBufferSize()]; // allocated once, reused for later files
+ }
+ this.countOut = new MiserOutputStream(new RecyclingFastBufferedOutputStream(fos,rebuf),settings.getFrequentFlushes());
+ this.out = this.countOut;
+ logger.fine("Opened " + this.f.getAbsolutePath());
+ return this.f.getName();
+ }
+
+ /**
+ * Pick the directory for the next file using the static round-robin index.
+ * @param dirs List of File objects that point at directories.
+ * @return directory at the round-robin index if usable; else, when the list has several entries, the first usable one
+ * @throws IOException if no usable directory was found
+ */
+ protected File getNextDirectory(List dirs)
+ throws IOException {
+ if (WriterPoolMember.roundRobinIndex >= dirs.size()) {
+ WriterPoolMember.roundRobinIndex = 0;
+ }
+ File d = null;
+ try {
+ d = checkWriteable((File)dirs.
+ get(WriterPoolMember.roundRobinIndex));
+ } catch (IndexOutOfBoundsException e) {
+ // Dirs list might be altered underneath us.
+ // If so, we get this exception -- just keep on going.
+ }
+ if (d == null && dirs.size() > 1) {
+ for (Iterator i = dirs.iterator(); d == null && i.hasNext();) {
+ d = checkWriteable((File)i.next());
+ }
+ } else {
+ WriterPoolMember.roundRobinIndex++;
+ }
+ if (d == null) {
+ throw new IOException("Directories unusable.");
+ }
+ return d;
+ }
+
+ protected File checkWriteable(File d) {
+ // a null candidate stays null; otherwise null signals "not usable"
+ if (d == null) {
+ return null;
+ }
+ try {
+ FileUtils.ensureWriteableDirectory(d);
+ return d;
+ } catch(IOException e) {
+ logger.warning("Directory " + d.getPath() + " is not" +
+ " writeable or cannot be created: " + e.getMessage());
+ return null;
+ }
+ }
+
+ /**
+ * Generate a new basename by interpolating values in the configured
+ * template. Values come from local state (prefix, serialno, timestamp17,
+ * timestamp14) and global system properties. Stores the result in
+ * currentBasename and the 17-digit timestamp in currentTimestamp.
+ */
+ protected void generateNewBasename() {
+ Properties localProps = new Properties();
+ localProps.setProperty("prefix", settings.getPrefix());
+ synchronized(this.getClass()) { // lock is the runtime class of this instance
+ // ensure that serialNo and timestamp are minted together (never inverted sort order)
+ String paddedSerialNumber = WriterPoolMember.serialNoFormatter.format(serialNo.getAndIncrement());
+ String timestamp17 = ArchiveUtils.getUnique17DigitDate();
+ String timestamp14 = ArchiveUtils.getUnique14DigitDate();
+ currentTimestamp = timestamp17;
+ localProps.setProperty("serialno", paddedSerialNumber);
+ localProps.setProperty("timestamp17", timestamp17);
+ localProps.setProperty("timestamp14", timestamp14);
+ }
+ currentBasename = PropertyUtils.interpolateWithProperties(settings.getTemplate(),
+ localProps, System.getProperties());
+ }
+
+
+ /**
+ * Get the file name
+ * @return the filename, as if uncompressed
+ */
+ protected String getBaseFilename() {
+ String name = this.f.getName();
+ if (!settings.getCompress()) {
+ return name;
+ }
+ // strip the compressed extension, with or without the occupied marker
+ for (String tail : new String[] {DOT_COMPRESSED_FILE_EXTENSION,
+ DOT_COMPRESSED_FILE_EXTENSION + OCCUPIED_SUFFIX}) {
+ if (name.endsWith(tail)) {
+ return name.substring(0, name.length() - tail.length());
+ }
+ }
+ return name;
+ }
+
+ /**
+ * Get this file.
+ *
+ * Used by junit test to test for creation and when {@link WriterPool} wants
+ * to invalidate a file.
+ *
+ * @return The current file; null until one has been set.
+ */
+ public File getFile() {
+ return this.f;
+ }
+
+ /**
+ * Pre write tasks.
+ *
+ * Has side effects. Will open a new file if none is open yet.
+ * If we're writing compressed files, it will wrap output stream with a
+ * GZIP writer with side effect that GZIP header is written out on the
+ * stream.
+ *
+ * @exception IOException
+ */
+ protected void preWriteRecordTasks()
+ throws IOException {
+ if (this.out == null) {
+ createFile();
+ }
+ if (settings.getCompress()) {
+ // Wrap stream in GZIP Writer.
+ // The below construction immediately writes the GZIP 'default'
+ // header out on the underlying stream.
+ this.out = new CompressedStream(this.out);
+ }
+ }
+
+ /**
+ * Post file write tasks.
+ * If compressed, finishes up compression, flushes, releases the deflater
+ * and unwraps the stream so any subsequent checks get good reading.
+ * @exception IOException
+ */
+ protected void postWriteRecordTasks()
+ throws IOException {
+ if (!settings.getCompress()) {
+ return;
+ }
+ CompressedStream gz = (CompressedStream)this.out;
+ gz.finish();
+ gz.flush();
+ gz.end();
+ this.out = gz.getWrappedStream();
+ }
+
+ /**
+ * @return byte count reported by the counting stream, or 0 while countOut is null
+ */
+ public long getPosition() {
+ if (countOut == null) {
+ return 0L;
+ }
+ return this.countOut.getCount();
+ }
+
+ public boolean isCompressed() { // mirrors settings.getCompress()
+ return settings.getCompress();
+ }
+
+ protected void write(final byte [] b) throws IOException { // delegates to the current stream; out must be non-null
+ this.out.write(b);
+ }
+
+ protected void flush() throws IOException { // delegates to the current stream; out must be non-null
+ this.out.flush();
+ }
+
+ protected void write(byte[] b, int off, int len) throws IOException { // delegates; out must be non-null
+ this.out.write(b, off, len);
+ }
+
+ protected void write(int b) throws IOException { // delegates; out must be non-null
+ this.out.write(b);
+ }
+
+ /**
+ * Copy bytes from the provided InputStream to the target file/stream being
+ * written; at most {@code recordLength} bytes are written.
+ *
+ * @return number of bytes read from {@code is}; equals {@code recordLength}
+ * unless the stream ended early or the last read overshot it
+ * @param is InputStream to copy bytes from
+ * @param recordLength
+ * expected number of bytes to copy
+ * @param enforceLength
+ * whether to throw an exception if too many/too few bytes are
+ * available from stream
+ * @throws IOException on a read/write failure, or on a length mismatch when enforced
+ */
+ protected long copyFrom(final InputStream is, final long recordLength,
+ boolean enforceLength) throws IOException {
+ int read = scratchbuffer.length;
+ long tot = 0; // counts bytes read, which may exceed bytes written
+ while ((tot < recordLength)
+ && (read = is.read(scratchbuffer)) != -1) {
+ int write = read;
+ // never write more than enforced length
+ write = (int) Math.min(write, recordLength - tot);
+ tot += read;
+ write(scratchbuffer, 0, write);
+ }
+ if (enforceLength && tot != recordLength) {
+ // throw exception if desired for read vs. declared mismatches
+ throw new IOException("Read " + tot + " but expected "
+ + recordLength);
+ }
+
+ return tot;
+ }
+
+ /** Close the stream and, if the file carries the occupied suffix, rename it to its final name. */
+ public void close() throws IOException {
+ if (this.out == null) {
+ return;
+ }
+ this.out.close();
+ this.out = null;
+ if (this.f != null && this.f.exists()) {
+ String path = this.f.getAbsolutePath();
+ if (path.endsWith(OCCUPIED_SUFFIX)) {
+ File finalFile = new File(path.substring(0,
+ path.length() - OCCUPIED_SUFFIX.length()));
+ if (finalFile.exists() && !finalFile.delete()) {
+ logger.warning("Failed delete of " + finalFile);
+ }
+ if (!this.f.renameTo(finalFile)) {
+ logger.warning("Failed rename of " + path);
+ }
+ this.f = finalFile;
+ }
+ logger.fine("Closed " + this.f.getAbsolutePath() +
+ ", size " + this.f.length());
+ }
+ }
+
+ protected OutputStream getOutputStream() { // current stream; null when no file is open
+ return this.out;
+ }
+
+ /**
+ * An override so we get access to underlying output stream.
+ * and offer an end() that does not accompany closing underlying
+ * stream. Static: it needs no reference to the enclosing writer.
+ * @author stack
+ */
+ private static class CompressedStream extends GZIPOutputStream {
+ public CompressedStream(OutputStream out)
+ throws IOException {
+ super(out);
+ }
+
+ /**
+ * @return Reference to stream being compressed.
+ */
+ OutputStream getWrappedStream() {
+ return this.out;
+ }
+
+ /**
+ * Release the deflater's native process resources,
+ * which otherwise would not occur until either
+ * finalization or DeflaterOutputStream.close()
+ * (which would also close underlying stream).
+ */
+ public void end() {
+ def.end();
+ }
+ }
+}
diff --git a/src/main/java/org/archive/io/WriterPoolSettings.java b/src/main/java/org/archive/io/WriterPoolSettings.java
new file mode 100644
index 00000000..d0805cdc
--- /dev/null
+++ b/src/main/java/org/archive/io/WriterPoolSettings.java
@@ -0,0 +1,39 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.File;
+import java.util.List;
+
+/**
+ * Settings object for a {@link WriterPool}.
+ * Used creating {@link WriterPoolMember}s.
+ * @author stack
+ * @version $Date$, $Revision$
+ */
+public interface WriterPoolSettings {
+ public long getMaxFileSizeBytes(); // -1 disables the size check in WriterPoolMember.isOversize()
+ public String getPrefix();
+ public String getTemplate();
+ public List<File> calcOutputDirs(); // candidate directories for new files
+ public boolean getCompress();
+ public List getMetadata();
+ public boolean getFrequentFlushes();
+ public int getWriteBufferSize();
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/arc/ARC2WCDX.java b/src/main/java/org/archive/io/arc/ARC2WCDX.java
new file mode 100644
index 00000000..19010131
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARC2WCDX.java
@@ -0,0 +1,243 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io.arc;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.zip.GZIPOutputStream;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HeaderGroup;
+import org.apache.commons.httpclient.util.DateParseException;
+import org.apache.commons.httpclient.util.DateUtil;
+import org.archive.io.ArchiveRecord;
+import org.archive.util.ArchiveUtils;
+import org.archive.util.SURT;
+
+/**
+ * Create a 'Wide' CDX from an ARC. Takes one argument, the path to the ARC.
+ * Writes a .wcdx.gz file (named after the ARC) in the same directory.
+ *
+ * @author gojomo
+ */
+public class ARC2WCDX {
+ final public static String WCDX_VERSION="0.1";
+
+ public static void main(String[] args) throws IOException {
+ // single required argument: path of the ARC to index
+ createWcdx(args[0]);
+ }
+
+ public static Object[] createWcdx(String arcFilename) throws IOException {
+ ARCReader reader = ARCReaderFactory.get(arcFilename);
+ Object[] retVal = createWcdx(reader);
+ reader.close();
+ return retVal;
+ }
+
+ /** Index every record of the reader into a gzipped wcdx file; returns {wcdxPath, number of records written}. */
+ public static Object[] createWcdx(ARCReader reader) {
+ reader.setDigest(true);
+ String wcdxPath = reader.getReaderIdentifier().replaceAll("\\.arc(\\.gz)?$",".wcdx.gz");
+ File wcdxFile = new File(wcdxPath+".open"); // written under a '.open' name, renamed on success
+ PrintStream writer = null;
+ long count = 0;
+ try {
+ writer = new PrintStream(new GZIPOutputStream(new FileOutputStream(wcdxFile)));
+
+ // write header: legend + timestamp
+ StringBuilder legend = new StringBuilder();
+ appendField(legend,"CDX");
+ appendField(legend,"surt-uri");
+ appendField(legend,"b"); // ARC timestamp
+ appendField(legend,"http-date");
+ appendField(legend,"s"); // status code
+ appendField(legend,"m"); // media type
+ appendField(legend,"sha1"); // content sha1
+ appendField(legend,"g"); // ARC name
+ appendField(legend,"V"); // start offset
+ appendField(legend,"end-offset"); // TODO: implement
+ appendField(legend,"n"); // ARC record length TODO: verify
+ appendField(legend,"http-content-length");
+ appendField(legend,"http-last-modified");
+ appendField(legend,"http-expires");
+ appendField(legend,"http-etag");
+ appendField(legend,"http-location");
+ appendField(legend,"e"); // IP
+ appendField(legend,"a"); // original URL
+ // WCDX version+creation time: crude version control
+ appendField(legend,WCDX_VERSION+"@"+ArchiveUtils.get14DigitDate());
+ writer.println(legend.toString());
+
+ Iterator iter = reader.iterator();
+ count = 0;
+ while(iter.hasNext()) {
+ ARCRecord record = (ARCRecord) iter.next();
+ record.close(); // NOTE(review): closing first presumably completes the digest over the whole record; confirm
+ ARCRecordMetaData h = (ARCRecordMetaData) record.getHeader();
+ Header[] httpHeaders = record.getHttpHeaders();
+ if(httpHeaders==null) {
+ httpHeaders = new Header[0];
+ }
+ HeaderGroup hg = new HeaderGroup();
+ hg.setHeaders(httpHeaders);
+ StringBuilder builder = new StringBuilder();
+
+ // SURT-form URI
+ appendField(builder,SURT.fromURI(h.getUrl()));
+ // record timestamp ('b')
+ appendField(builder,h.getDate());
+ // http header date
+ appendTimeField(builder,hg.getFirstHeader("Date"));
+ // response code ('s')
+ appendField(builder,h.getStatusCode());
+ // media type ('m')
+ appendField(builder,h.getMimetype());
+ // content checksum (like 'c', but here Base32 SHA1)
+ appendField(builder,record.getDigestStr());
+ // arc name ('g')
+ appendField(builder,reader.getFileName());
+ // compressed start offset ('V')
+ appendField(builder,h.getOffset());
+
+ // compressed end offset (?)
+// appendField(builder,
+// reader.getInputStream() instanceof RepositionableStream
+// ? ((GzippedInputStream)reader.getInputStream()).vPosition()
+// : "-");
+ // TODO; leave unavail for now
+ appendField(builder, "-");
+
+ // uncompressed (declared in ARC headerline) record length
+ appendField(builder,h.getLength());
+ // http header content-length
+ appendField(builder,hg.getFirstHeader("Content-Length"));
+
+ // http header mod-date
+ appendTimeField(builder,hg.getFirstHeader("Last-Modified"));
+ // http header expires
+ appendTimeField(builder,hg.getFirstHeader("Expires"));
+
+ // http header etag
+ appendField(builder,hg.getFirstHeader("ETag"));
+ // http header redirect ('Location' header?)
+ appendField(builder,hg.getFirstHeader("Location"));
+ // ip ('e')
+ appendField(builder,h.getIp());
+ // original URI
+ appendField(builder,h.getUrl());
+ // TODO MAYBE - a title from inside content?
+
+ writer.println(builder.toString());
+ count++;
+ }
+ wcdxFile.renameTo(new File(wcdxPath)); // result ignored; happens before the writer is closed in finally
+ } catch (IOException e) {
+ // soldier on: but leave '.open' wcdx file as indicator of error
+ if(!wcdxFile.exists()) {
+ try {
+ wcdxFile.createNewFile();
+ } catch (IOException e1) {
+ // TODO Auto-generated catch block
+ throw new RuntimeException(e1);
+ }
+ }
+ } catch (RuntimeException e) {
+ // soldier on: but leave '.open' wcdx file as indicator of error
+ if(!wcdxFile.exists()) {
+ try {
+ wcdxFile.createNewFile();
+ } catch (IOException e1) {
+ // TODO Auto-generated catch block
+ throw new RuntimeException(e1);
+ }
+ }
+ } finally {
+ if(writer!=null) {
+ writer.close();
+ }
+ }
+
+ return new Object[] {wcdxPath, count};
+ }
+
+ /** Append one space-delimited field; null, empty, or a Header without a value is written as "-". */
+ protected static void appendField(StringBuilder builder, Object obj) {
+ if(builder.length()>0) {
+ builder.append(' '); // delimiter between fields
+ }
+ if(obj instanceof Header) {
+ String value = ((Header)obj).getValue();
+ obj = (value == null) ? null : value.trim();
+ }
+ builder.append((obj==null||obj.toString().length()==0)?"-":obj);
+ }
+
+ /** Append a date field as a 14-digit timestamp; "-" when absent, "e" when a Header's date cannot be parsed. */
+ protected static void appendTimeField(StringBuilder builder, Object obj) {
+ if(builder.length()>0) {
+ builder.append(' '); // delimiter between fields
+ }
+ String raw = (obj instanceof Header) ? ((Header)obj).getValue() : null;
+ if(obj==null || (obj instanceof Header && raw==null)) {
+ builder.append("-");
+ return;
+ }
+ if(obj instanceof Header) {
+ try {
+ Date date = DateUtil.parseDate(raw.trim());
+ String d = ArchiveUtils.get14DigitDate(date);
+ if(d.startsWith("209")) {
+ // presumably a two-digit 199x year parsed into the 2090s; pull it back
+ d = "199"+d.substring(3);
+ }
+ obj = d;
+ } catch (DateParseException e) {
+ builder.append('e'); // marks an unparseable date
+ return;
+ }
+ }
+ builder.append(obj);
+ }
+}
+
+//'wide' CDX
+//a original url
+//b timestamp
+//s resp code
+//m type
+//? content md5 (full 'k'? 'c'?)
+//g arc name
+//V compressed start offset
+//? compressed length
+//n? uncompressed length
+//? mod date
+//? expires
+//? server 'date' hdr
+//? etag
+//r redirect ('Location'?)
+//e ip
+//MAYBE:
+//? TITLE from HTML or other format?
+
+
diff --git a/src/main/java/org/archive/io/arc/ARCConstants.java b/src/main/java/org/archive/io/arc/ARCConstants.java
new file mode 100644
index 00000000..c44cfef7
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCConstants.java
@@ -0,0 +1,29 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io.arc;
+
+
+/**
+ * Constants used by ARC files and in ARC file processing.
+ *
+ * <p>This interface declares no members of its own; it only extends
+ * {@link org.archive.format.arc.ARCConstants}.
+ *
+ * @author stack
+ * @deprecated Presumably superseded by
+ * {@link org.archive.format.arc.ARCConstants} -- confirm before migrating.
+ */
+public interface ARCConstants extends org.archive.format.arc.ARCConstants {
+}
diff --git a/src/main/java/org/archive/io/arc/ARCLocation.java b/src/main/java/org/archive/io/arc/ARCLocation.java
new file mode 100644
index 00000000..c6c64437
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCLocation.java
@@ -0,0 +1,37 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io.arc;
+
+/**
+ * Data structure to hold an ARC record location: the name of the ARC and
+ * the offset of the record within it.
+ * Used by wayback machine.
+ * @author stack
+ */
+public interface ARCLocation {
+ /**
+ * @return Returns the ARC filename. Can be full path to ARC, URL to an
+ * ARC or just the portion of an ARC name that is unique to a collection.
+ */
+ public String getName();
+
+ /**
+ * @return Returns the offset into the ARC.
+ */
+ public long getOffset();
+}
diff --git a/src/main/java/org/archive/io/arc/ARCReader.java b/src/main/java/org/archive/io/arc/ARCReader.java
new file mode 100644
index 00000000..7f85cc2a
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCReader.java
@@ -0,0 +1,553 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.arc;
+
+import java.io.ByteArrayOutputStream;
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.logging.Logger;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.cli.PosixParser;
+import org.archive.io.ArchiveReader;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.ArchiveRecordHeader;
+import org.archive.io.RecoverableIOException;
+import org.archive.io.WriterPoolMember;
+import org.archive.util.ArchiveUtils;
+
+
+/**
+ * Get an iterator on an ARC file or get a record by absolute position.
+ *
+ * ARC files are described here:
+ * Arc
+ * File Format.
+ *
+ * <p>This class knows how to parse an ARC file. Pass it a file path
+ * or an URL to an ARC. It can parse ARC Version 1 and 2.
+ *
+ * <p>Iterator returns ARCRecord
+ * though {@link Iterator#next()} is returning
+ * java.lang.Object. Cast the return.
+ *
+ * <p>Profiling java.io vs. memory-mapped ByteBufferInputStream shows the
+ * latter slightly slower -- but not by much. TODO: Test more. Just
+ * change {@link #getInputStream(File, long)}.
+ *
+ * @author stack
+ * @version $Date$ $Revision$
+ */
+public abstract class ARCReader extends ArchiveReader
+implements ARCConstants, Closeable {
+ private final Logger logger = Logger.getLogger(ARCReader.class.getName());
+
+ /**
+ * Set to true if we are aligned on first record of Archive file.
+ * We used to depend on offset. If offset was zero, then we were
+ * aligned on first record. This is no longer necessarily the case when
+ * Reader is created at an offset into an Archive file: The offset is zero
+ * but it is relative to where we started reading.
+ */
+ private boolean alignedOnFirstRecord = true;
+
+ private boolean parseHttpHeaders = true;
+
+ /**
+ * Creates a reader. Does nothing beyond calling the superclass
+ * constructor; subclasses set the input stream themselves.
+ */
+ protected ARCReader() {
+ super();
+ }
+
+ /**
+ * Skip over any trailing new lines at end of the record so we're lined up
+ * ready to read the next.
+ *
+ * <p>Returns immediately when the input stream reports nothing available.
+ * Otherwise bytes are consumed for as long as they equal LINE_SEPARATOR.
+ * On the first other byte, the stream is reset to just before that byte
+ * when it supports mark/reset; when it does not, an IOException is thrown.
+ *
+ * @param record Record just read; its header fields are added to the
+ * error message when a current record is set.
+ * @throws IOException If reading fails, or an unexpected byte is read and
+ * the stream cannot be backed up.
+ */
+ protected void gotoEOR(ArchiveRecord record) throws IOException {
+ if (getIn().available() <= 0) {
+ return;
+ }
+
+ // Remove any trailing LINE_SEPARATOR
+ int c = -1;
+ while (getIn().available() > 0) {
+ // Remember the position so a single overread byte can be undone.
+ if (getIn().markSupported()) {
+ getIn().mark(1);
+ }
+ c = getIn().read();
+ if (c != -1) {
+ if (c == LINE_SEPARATOR) {
+ continue;
+ }
+ if (getIn().markSupported()) {
+ // We've overread. We're probably in next record. There is
+ // no way of telling for sure. It may be dross at end of
+ // current record. Backup.
+ getIn().reset();
+ break;
+ }
+ // NOTE(review): the guard tests getCurrentRecord() but the header
+ // is taken from the 'record' argument -- confirm they always match.
+ ArchiveRecordHeader h = (getCurrentRecord() != null)?
+ record.getHeader(): null;
+ throw new IOException("Read " + (char)c +
+ " when only " + LINE_SEPARATOR + " expected. " +
+ getReaderIdentifier() + ((h != null)?
+ h.getHeaderFields().toString(): ""));
+ }
+ }
+ }
+
+ /**
+ * Create new arc record.
+ *
+ * Encapsulate housekeeping that has to do w/ creating a new record.
+ *
+ * <p>Call this method at end of constructor to read in the
+ * arcfile header. Will be problems reading subsequent arc records
+ * if you don't since arcfile header has the list of metadata fields for
+ * all records that follow.
+ *
+ * <p>When parsing through ARCs writing out CDX info, we spend about
+ * 38% of CPU in here -- about 30% of which is in getTokenizedHeaderLine
+ * -- of which 16% is reading.
+ *
+ * <p>A {@link RecoverableIOException} is rethrown untouched. Any other
+ * IOException is rethrown as a new IOException whose message has the
+ * offset appended, which carries the original stack trace and has the
+ * original exception attached as its cause.
+ *
+ * @param is InputStream to use.
+ * @param offset Absolute offset into arc file.
+ * @return An arc record.
+ * @throws IOException If the record cannot be created.
+ */
+ protected ARCRecord createArchiveRecord(InputStream is, long offset)
+ throws IOException {
+     try {
+         String version = super.getVersion();
+         ARCRecord record = new ARCRecord(is, getReaderIdentifier(), offset,
+                 isDigest(), isStrict(), isParseHttpHeaders(),
+                 isAlignedOnFirstRecord(), version);
+         // NOTE(review): 'version' was just read from super.getVersion(),
+         // so this condition looks unreachable -- confirm intent.
+         if (version != null && super.getVersion() == null)
+             super.setVersion(version);
+         currentRecord(record);
+     } catch (IOException e) {
+         if (e instanceof RecoverableIOException) {
+             // Don't mess with RecoverableIOExceptions. Let them out.
+             throw e;
+         }
+         IOException newE = new IOException(e.getMessage() + " (Offset " +
+                 offset + ").");
+         // Keep the original exception reachable (its concrete type is
+         // otherwise lost), and keep reporting the original stack trace.
+         newE.initCause(e);
+         newE.setStackTrace(e.getStackTrace());
+         throw newE;
+     }
+     return (ARCRecord)getCurrentRecord();
+ }
+
+ /**
+ * Returns the version of this ARC file, falling back to "1.1".
+ * The version is usually read from the first record of the ARC. When
+ * reading started elsewhere -- e.g. random access into the middle of an
+ * ARC -- no version has been set and the default is returned. Later, if
+ * more than one version of ARC exists, the meta line could be examined
+ * to tell which version this is.
+ * @return Version of this ARC file.
+ */
+ public String getVersion() {
+     final String declared = super.getVersion();
+     if (declared != null) {
+         return declared;
+     }
+     return "1.1";
+ }
+
+ /**
+ * @return Current value of the alignedOnFirstRecord flag.
+ */
+ protected boolean isAlignedOnFirstRecord() {
+ return alignedOnFirstRecord;
+ }
+
+ /**
+ * @param alignedOnFirstRecord New value of the alignedOnFirstRecord flag.
+ */
+ protected void setAlignedOnFirstRecord(boolean alignedOnFirstRecord) {
+ this.alignedOnFirstRecord = alignedOnFirstRecord;
+ }
+
+ /**
+ * @return Current value of the parseHttpHeaders flag, which is passed to
+ * each ARCRecord this reader creates.
+ */
+ public boolean isParseHttpHeaders() {
+ return this.parseHttpHeaders;
+ }
+
+ /**
+ * @param parse New value of the parseHttpHeaders flag, which is passed to
+ * each ARCRecord this reader creates.
+ */
+ public void setParseHttpHeaders(boolean parse) {
+ this.parseHttpHeaders = parse;
+ }
+
+ /**
+ * @return The ARC_FILE_EXTENSION constant.
+ */
+ public String getFileExtension() {
+ return ARC_FILE_EXTENSION;
+ }
+
+ /**
+ * @return The DOT_ARC_FILE_EXTENSION constant.
+ */
+ public String getDotFileExtension() {
+ return DOT_ARC_FILE_EXTENSION;
+ }
+
+ /**
+ * Outputs the whole archive in the given format via the superclass.
+ * The NOHEAD and HEADER formats only make sense for a single record, so
+ * they are rejected here when the superclass did not handle them.
+ *
+ * @param format Output format name.
+ * @return Whatever the superclass implementation returned.
+ * @throws IOException If the superclass fails, or the format is NOHEAD or
+ * HEADER and the superclass did not handle it.
+ * @throws java.text.ParseException Propagated from the superclass.
+ */
+ protected boolean output(final String format)
+ throws IOException, java.text.ParseException {
+     final boolean handled = super.output(format);
+     if (handled) {
+         return handled;
+     }
+     if (format.equals(NOHEAD) || format.equals(HEADER)) {
+         throw new IOException(format +
+                 " format only supported for single Records");
+     }
+     return handled;
+ }
+
+ /**
+ * Outputs a single record in the given format.
+ * The superclass gets first refusal. When it declines, NOHEAD dumps the
+ * record with its HTTP header skipped and HEADER dumps only the HTTP
+ * header; both turn digesting off first.
+ *
+ * @param format Output format name.
+ * @return True if the format was handled here or by the superclass.
+ * @throws IOException If reading or writing the record fails.
+ */
+ public boolean outputRecord(final String format) throws IOException {
+     if (super.outputRecord(format)) {
+         return true;
+     }
+     final boolean bodyOnly = format.equals(NOHEAD);
+     if (!bodyOnly && !format.equals(HEADER)) {
+         return false;
+     }
+
+     // No point digesting if dumping content.
+     setDigest(false);
+     final ARCRecord rec = (ARCRecord) get();
+     if (bodyOnly) {
+         rec.skipHttpHeader();
+         rec.dump();
+     } else {
+         rec.dumpHttpHeader();
+     }
+     return true;
+ }
+
+ /**
+ * Re-writes every record of this ARC to System.out through an ARCWriter.
+ * The content of the first record is read into memory and handed to the
+ * writer as metadata; each later record is written with the URL,
+ * mimetype, IP, date and length taken from its own metadata.
+ *
+ * @param compress Passed to the writer settings.
+ * @throws IOException If reading or writing fails.
+ * @throws java.text.ParseException If a record's 14-digit date cannot be
+ * parsed.
+ */
+ public void dump(final boolean compress)
+ throws IOException, java.text.ParseException {
+ // No point digesting if we're doing a dump.
+ setDigest(false);
+ boolean firstRecord = true;
+ ARCWriter writer = null;
+ for (Iterator ii = iterator(); ii.hasNext();) {
+ ARCRecord r = (ARCRecord)ii.next();
+ // We're to dump the arc on stdout.
+ // Get the first record's data if any.
+ ARCRecordMetaData meta = r.getMetaData();
+ if (firstRecord) {
+ firstRecord = false;
+ // Get an ARCWriter.
+ ByteArrayOutputStream baos =
+ new ByteArrayOutputStream(r.available());
+ // This is slow but done only once at top of ARC.
+ while (r.available() > 0) {
+ baos.write(r.read());
+ }
+ List listOfMetadata = new ArrayList();
+ listOfMetadata.add(baos.toString(WriterPoolMember.UTF8));
+ // Assume getArc returns full path to file. ARCWriter
+ // or new File will complain if it is otherwise.
+ List outDirs = new ArrayList();
+ WriterPoolSettingsData settings =
+ new WriterPoolSettingsData("","",-1L,compress,outDirs,listOfMetadata);
+ writer = new ARCWriter(new AtomicInteger(), System.out,
+ new File(meta.getArc()), settings);
+ // The first record only supplies the writer's metadata.
+ continue;
+ }
+
+ writer.write(meta.getUrl(), meta.getMimetype(), meta.getIp(),
+ ArchiveUtils.parse14DigitDate(meta.getDate()).getTime(),
+ (int)meta.getLength(), r);
+ }
+ // NOTE(review): the writer is never closed or flushed here -- confirm
+ // that ARCWriter leaves nothing buffered when writing to System.out.
+ // System.out.println(System.currentTimeMillis() - start);
+ }
+
+ /**
+ * Wraps this reader in one that deletes a local file on close. Used
+ * when we bring Archive files local and need to clean up afterward.
+ * Every other call on the returned reader is forwarded to this reader.
+ *
+ * @param f Local file to delete when the returned reader is closed.
+ * @return an ArchiveReader that will delete a local file on close.
+ */
+ public ARCReader getDeleteFileOnCloseReader(final File f) {
+     final ARCReader d = this;
+     return new ARCReader() {
+         private final ARCReader delegate = d;
+         private File archiveFile = f;
+
+         public void close() throws IOException {
+             try {
+                 this.delegate.close();
+             } finally {
+                 // Remove the local copy even when closing the delegate
+                 // fails; otherwise the temporary file would be leaked.
+                 if (this.archiveFile != null) {
+                     if (archiveFile.exists()) {
+                         archiveFile.delete();
+                     }
+                     this.archiveFile = null;
+                 }
+             }
+         }
+
+         public ArchiveRecord get(long o) throws IOException {
+             return this.delegate.get(o);
+         }
+
+         public boolean isDigest() {
+             return this.delegate.isDigest();
+         }
+
+         public boolean isStrict() {
+             return this.delegate.isStrict();
+         }
+
+         public Iterator iterator() {
+             return this.delegate.iterator();
+         }
+
+         public void setDigest(boolean d) {
+             this.delegate.setDigest(d);
+         }
+
+         public void setStrict(boolean s) {
+             this.delegate.setStrict(s);
+         }
+
+         public List validate() throws IOException {
+             return this.delegate.validate();
+         }
+
+         @Override
+         public ArchiveRecord get() throws IOException {
+             return this.delegate.get();
+         }
+
+         @Override
+         public String getVersion() {
+             return this.delegate.getVersion();
+         }
+
+         @Override
+         public List validate(int noRecords) throws IOException {
+             return this.delegate.validate(noRecords);
+         }
+
+         @Override
+         protected ARCRecord createArchiveRecord(InputStream is,
+                 long offset)
+         throws IOException {
+             return this.delegate.createArchiveRecord(is, offset);
+         }
+
+         @Override
+         protected void gotoEOR(ArchiveRecord record) throws IOException {
+             this.delegate.gotoEOR(record);
+         }
+
+         @Override
+         public void dump(boolean compress)
+         throws IOException, java.text.ParseException {
+             this.delegate.dump(compress);
+         }
+
+         @Override
+         public String getDotFileExtension() {
+             return this.delegate.getDotFileExtension();
+         }
+
+         @Override
+         public String getFileExtension() {
+             return this.delegate.getFileExtension();
+         }
+     };
+ }
+
+ // Static methods follow.
+
+ /**
+ * Prints the command-line help and then terminates the JVM via
+ * System.exit, so this method never returns to its caller.
+ *
+ * @param formatter Help formatter instance.
+ * @param options Usage options.
+ * @param exitCode Exit code.
+ */
+ private static void usage(HelpFormatter formatter, Options options,
+ int exitCode) {
+ formatter.printHelp("java org.archive.io.arc.ARCReader" +
+ " [--digest=true|false] \\\n" +
+ " [--format=cdx|cdxfile|dump|gzipdump|header|nohead]" +
+ " [--offset=#] \\\n[--strict] [--parse] ARC_FILE|ARC_URL",
+ options);
+ System.exit(exitCode);
+ }
+
+ /**
+ * Write out the arcfile.
+ *
+ * @param reader Reader positioned on the ARC to output.
+ * @param format Format to use outputting.
+ * @throws IOException If the reader reports the format as unsupported
+ * (returns false), or if output itself fails.
+ * @throws java.text.ParseException Propagated from the reader.
+ */
+ protected static void output(ARCReader reader, String format)
+ throws IOException, java.text.ParseException {
+ if (!reader.output(format)) {
+ throw new IOException("Unsupported format: " + format);
+ }
+ }
+
+ /**
+ * Generate a CDX index file for an ARC file.
+ * The reader is opened non-strict, with HTTP header parsing and digesting
+ * both enabled, and output in the CDX_FILE format.
+ *
+ * @param urlOrPath The ARC file to generate a CDX index for
+ * @throws IOException If the ARC cannot be opened or output fails.
+ * @throws java.text.ParseException Propagated from the output step.
+ */
+ public static void createCDXIndexFile(String urlOrPath)
+ throws IOException, java.text.ParseException {
+ ARCReader r = ARCReaderFactory.get(urlOrPath);
+ r.setStrict(false);
+ r.setParseHttpHeaders(true);
+ r.setDigest(true);
+ // NOTE(review): the reader is not closed here -- confirm whether
+ // output() closes it.
+ output(r, CDX_FILE);
+ }
+
+ /**
+ * Command-line interface to ARCReader.
+ *
+ * <p>Here is the command-line interface:
+ * <pre>
+ * usage: java org.archive.io.arc.ARCReader [--offset=#] ARCFILE
+ * -h,--help Prints this message and exits.
+ * -o,--offset Outputs record at this offset into arc file.
+ * </pre>
+ *
+ * <p>See in $HERITRIX_HOME/bin/arcreader for a script that'll
+ * take care of classpaths and the calling of ARCReader.
+ *
+ * <p>Outputs using a pseudo-CDX format as described in the CDX
+ * Legend and Example documents.
+ * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
+ * Hash is hard-coded straight SHA-1 hash of content.
+ *
+ * <p>With a non-negative offset exactly one ARC must be named and only
+ * the record at that offset is output. Otherwise every named ARC is
+ * output in full; a RuntimeException on any of them is reported on
+ * System.err and ends the run with exit code 1.
+ *
+ * @param args Command-line arguments.
+ * @throws ParseException Failed parse of the command line.
+ * @throws IOException If an ARC cannot be read or output fails.
+ * @throws java.text.ParseException Propagated from the output step.
+ */
+ @SuppressWarnings("unchecked")
+ public static void main(String [] args)
+ throws ParseException, IOException, java.text.ParseException {
+     Options options = getOptions();
+     options.addOption(new Option("p","parse", false, "Parse headers."));
+     PosixParser parser = new PosixParser();
+     CommandLine cmdline = parser.parse(options, args, false);
+     // getArgList() is untyped; the element type is needed for the
+     // for-each loop over String below.
+     List<String> cmdlineArgs = cmdline.getArgList();
+     Option [] cmdlineOptions = cmdline.getOptions();
+     HelpFormatter formatter = new HelpFormatter();
+
+     // If no args, print help.
+     if (cmdlineArgs.size() <= 0) {
+         usage(formatter, options, 0);
+     }
+
+     // Now look at options passed.
+     long offset = -1;
+     boolean digest = false;
+     boolean strict = false;
+     boolean parse = false;
+     String format = CDX;
+     for (int i = 0; i < cmdlineOptions.length; i++) {
+         switch(cmdlineOptions[i].getId()) {
+             case 'h':
+                 usage(formatter, options, 0);
+                 break;
+
+             case 'o':
+                 offset =
+                     Long.parseLong(cmdlineOptions[i].getValue());
+                 break;
+
+             case 's':
+                 strict = true;
+                 break;
+
+             case 'p':
+                 parse = true;
+                 break;
+
+             case 'd':
+                 digest = getTrueOrFalse(cmdlineOptions[i].getValue());
+                 break;
+
+             case 'f':
+                 format = cmdlineOptions[i].getValue().toLowerCase();
+                 boolean match = false;
+                 // List of supported formats.
+                 final String [] supportedFormats =
+                     {CDX, DUMP, GZIP_DUMP, HEADER, NOHEAD, CDX_FILE};
+                 for (int ii = 0; ii < supportedFormats.length; ii++) {
+                     if (supportedFormats[ii].equals(format)) {
+                         match = true;
+                         break;
+                     }
+                 }
+                 if (!match) {
+                     usage(formatter, options, 1);
+                 }
+                 break;
+
+             default:
+                 // Show the option letter rather than its numeric id.
+                 throw new RuntimeException("Unexpected option: " +
+                     (char) cmdlineOptions[i].getId());
+         }
+     }
+
+     if (offset >= 0) {
+         if (cmdlineArgs.size() != 1) {
+             System.out.println("Error: Pass one arcfile only.");
+             usage(formatter, options, 1);
+         }
+         ARCReader arc = ARCReaderFactory.get(cmdlineArgs.get(0),
+             offset);
+         arc.setStrict(strict);
+         // We must parse headers if we need to skip them.
+         if (format.equals(NOHEAD) || format.equals(HEADER)) {
+             parse = true;
+         }
+         arc.setParseHttpHeaders(parse);
+         outputRecord(arc, format);
+     } else {
+         for (String urlOrPath : cmdlineArgs) {
+             try {
+                 ARCReader r = ARCReaderFactory.get(urlOrPath);
+                 r.setStrict(strict);
+                 r.setParseHttpHeaders(parse);
+                 r.setDigest(digest);
+                 output(r, format);
+             } catch (RuntimeException e) {
+                 // Write out name of file we failed on to help with
+                 // debugging, print the stack trace, then stop with a
+                 // failure exit code. Remaining ARCs are not processed.
+                 System.err.println("Exception processing " + urlOrPath +
+                     ": " + e.getMessage());
+                 e.printStackTrace(System.err);
+                 System.exit(1);
+             }
+         }
+     }
+ }
+}
diff --git a/src/main/java/org/archive/io/arc/ARCReaderFactory.java b/src/main/java/org/archive/io/arc/ARCReaderFactory.java
new file mode 100644
index 00000000..e7dc1625
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCReaderFactory.java
@@ -0,0 +1,454 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.arc;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Iterator;
+import java.util.logging.Level;
+
+import org.archive.io.ArchiveReader;
+import org.archive.io.ArchiveReaderFactory;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.ArchiveRecordHeader;
+import org.archive.util.FileUtils;
+import org.archive.util.zip.GZIPMembersInputStream;
+import org.archive.util.zip.GzipHeader;
+import org.archive.util.zip.NoGzipMagicException;
+
+import com.google.common.io.CountingInputStream;
+
+
+/**
+ * Factory that returns an ARCReader.
+ *
+ * Can handle compressed and uncompressed ARCs.
+ *
+ * @author stack
+ */
+public class ARCReaderFactory extends ArchiveReaderFactory
+implements ARCConstants {
+ /**
+ * This factory instance.
+ */
+ private static final ARCReaderFactory factory = new ARCReaderFactory();
+
+ /**
+ * Protected constructor: outside code cannot call it directly and is
+ * expected to go through the static get methods, which use the shared
+ * factory instance.
+ */
+ protected ARCReaderFactory() {
+ super();
+ }
+
+ /**
+ * Get an ARCReader for the passed file path or URL, using the shared
+ * factory instance.
+ *
+ * @param arcFileOrUrl Path or URL of an ARC.
+ * @return An ARCReader.
+ * @throws MalformedURLException
+ * @throws IOException
+ */
+ public static ARCReader get(String arcFileOrUrl)
+ throws MalformedURLException, IOException {
+ return (ARCReader)ARCReaderFactory.factory.
+ getArchiveReader(arcFileOrUrl);
+ }
+
+ /**
+ * Get an ARCReader for the passed file path or URL at the given offset,
+ * using the shared factory instance.
+ *
+ * @param arcFileOrUrl Path or URL of an ARC.
+ * @param offset Offset passed through to the factory.
+ * @return An ARCReader.
+ * @throws MalformedURLException
+ * @throws IOException
+ */
+ public static ARCReader get(String arcFileOrUrl, final long offset)
+ throws MalformedURLException, IOException {
+ return (ARCReader)ARCReaderFactory.factory.
+ getArchiveReader(arcFileOrUrl, offset);
+ }
+
+ /**
+ * Get an ARCReader for the passed file, using the shared factory
+ * instance.
+ *
+ * @param f An arcfile to read.
+ * @return An ARCReader.
+ * @throws IOException
+ */
+ public static ARCReader get(final File f) throws IOException {
+ return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f);
+ }
+
+ /**
+ * Get an ARCReader for the passed file at the given offset, using the
+ * shared factory instance.
+ *
+ * @param f An arcfile to read.
+ * @param offset Offset at which the returned reader starts.
+ * @return An ARCReader.
+ * @throws IOException
+ */
+ public static ARCReader get(final File f, final long offset)
+ throws IOException {
+ return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f, offset);
+ }
+
+ /**
+ * Delegates to the three-argument overload, always passing true for
+ * skipSuffixTest.
+ *
+ * @param f An arcfile to read.
+ * @param offset Offset at which the returned reader starts.
+ * @return An ARCReader.
+ * @throws IOException
+ */
+ protected ArchiveReader getArchiveReader(final File f, final long offset)
+ throws IOException {
+ return getArchiveReader(f, true, offset);
+ }
+
+ /**
+ * @param f An arcfile to read.
+ * @param skipSuffixTest Set to true to skip the file-name suffix check
+ * when testing whether the ARC is compressed; false enforces
+ * the check. Pass true to open compressed ARCs with the
+ * .open or otherwise suffix.
+ * @param offset Have returned ARCReader set to start reading at passed
+ * offset.
+ * @return An ARCReader.
+ * @throws IOException
+ */
+ public static ARCReader get(final File f,
+ final boolean skipSuffixTest, final long offset)
+ throws IOException {
+ return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f,
+ skipSuffixTest, offset);
+ }
+
+ /**
+ * Builds a compressed or uncompressed reader for the passed file,
+ * depending on the result of testCompressedARCFile.
+ *
+ * @param arcFile An arcfile to read.
+ * @param skipSuffixTest Passed to testCompressedARCFile as its
+ * skipSuffixCheck argument.
+ * @param offset Offset at which the returned reader starts.
+ * @return A CompressedARCReader or an UncompressedARCReader.
+ * @throws IOException If the file tests as not compressed and also fails
+ * the uncompressed-ARC check, or on read failure.
+ */
+ protected ArchiveReader getArchiveReader(final File arcFile,
+ final boolean skipSuffixTest, final long offset)
+ throws IOException {
+ boolean compressed = testCompressedARCFile(arcFile, skipSuffixTest);
+ if (!compressed) {
+ // NOTE(review): this check is given ARC_FILE_EXTENSION regardless
+ // of skipSuffixTest -- confirm that is intended.
+ if (!FileUtils.isReadableWithExtensionAndMagic(arcFile,
+ ARC_FILE_EXTENSION, ARC_MAGIC_NUMBER)) {
+ throw new IOException(arcFile.getAbsolutePath() +
+ " is not an Internet Archive ARC file.");
+ }
+ }
+ return compressed?
+ (ARCReader)ARCReaderFactory.factory.
+ new CompressedARCReader(arcFile, offset):
+ (ARCReader)ARCReaderFactory.factory.
+ new UncompressedARCReader(arcFile, offset);
+ }
+
+ /**
+ * Get a reader over an already-open stream, using the shared factory
+ * instance.
+ *
+ * @param s Identifier for the ARC the stream comes from.
+ * @param is Stream to read the ARC from.
+ * @param atFirstRecord Whether the stream is positioned at the first
+ * record; passed through to the factory.
+ * @return An ArchiveReader.
+ * @throws IOException
+ */
+ public static ArchiveReader get(final String s, final InputStream is,
+ final boolean atFirstRecord)
+ throws IOException {
+ return ARCReaderFactory.factory.getArchiveReader(s, is,
+ atFirstRecord);
+ }
+
+ /**
+ * Builds a compressed or uncompressed reader over the passed stream.
+ * The start of the stream is sniffed with testCompressedARCStream and the
+ * stream is then rewound, so the chosen reader sees it from the
+ * beginning.
+ *
+ * @param arc Identifier for the ARC the stream comes from.
+ * @param is Stream to read the ARC from; wrapped in a
+ * BufferedInputStream when it does not support mark/reset.
+ * @param atFirstRecord Passed to the compressed reader only.
+ * @return A CompressedARCReader or an UncompressedARCReader.
+ * @throws IOException
+ */
+ protected ArchiveReader getArchiveReader(final String arc,
+ final InputStream is, final boolean atFirstRecord)
+ throws IOException {
+
+ // We do this mark() reset() stuff, wrapping in a BufferedInputStream if
+ // necessary to make it work, because testCompressedARCStream() consumes
+ // some bytes from the input stream
+ InputStream possiblyWrapped;
+ if (is.markSupported()) {
+ possiblyWrapped = is;
+ } else {
+ possiblyWrapped = new BufferedInputStream(is);
+ }
+
+ // NOTE(review): assumes the sniff reads at most 100 bytes -- confirm
+ // against GzipHeader for headers with long optional fields.
+ possiblyWrapped.mark(100);
+ boolean compressed = testCompressedARCStream(possiblyWrapped);
+ possiblyWrapped.reset();
+
+ if (compressed) {
+ return new CompressedARCReader(arc, possiblyWrapped, atFirstRecord);
+ } else {
+ return new UncompressedARCReader(arc, possiblyWrapped);
+ }
+ }
+
+ /**
+ * Get an ARCReader aligned at offset. This version of get
+ * will not bring the ARC local but will try to stream across the net making
+ * an HTTP 1.1 Range request on remote http server (RFC 2616 Section 14.35).
+ *
+ * @param arcUrl HTTP URL for an ARC (All ARCs considered remote).
+ * @param offset Offset into ARC at which to start fetching.
+ * @return An ARCReader aligned at offset.
+ * @throws IOException
+ */
+ public static ARCReader get(final URL arcUrl, final long offset)
+ throws IOException {
+ return (ARCReader)ARCReaderFactory.factory.getArchiveReader(arcUrl,
+ offset);
+ }
+
+ /**
+ * Get an ARCReader.
+ * Pulls the ARC local into wherever the System Property
+ * java.io.tmpdir points. It then hands back an ARCReader that
+ * points at this local copy. A close on this ARCReader instance will
+ * remove the local copy.
+ * @param arcUrl An URL that points at an ARC.
+ * @return An ARCReader.
+ * @throws IOException
+ */
+ public static ARCReader get(final URL arcUrl)
+ throws IOException {
+ return (ARCReader)ARCReaderFactory.factory.getArchiveReader(arcUrl);
+ }
+
+ /**
+ * Instance-level wrapper around the static testCompressedARCFile(File),
+ * which enforces the file-name suffix check.
+ *
+ * @param arcFile File to test.
+ * @return True if arcFile is compressed ARC.
+ * @throws IOException
+ */
+ public boolean isCompressed(File arcFile) throws IOException {
+ return testCompressedARCFile(arcFile);
+ }
+
+ /**
+ * Check file is compressed and in ARC GZIP format.
+ *
+ * @param arcFile File to test if its Internet Archive ARC file
+ * GZIP compressed.
+ *
+ * @return True if this is an Internet Archive GZIP'd ARC file (It begins
+ * w/ the Internet Archive GZIP header and has the
+ * COMPRESSED_ARC_FILE_EXTENSION suffix).
+ *
+ * @exception IOException If file does not exist or is not readable.
+ */
+ public static boolean testCompressedARCFile(File arcFile)
+ throws IOException {
+ return testCompressedARCFile(arcFile, false);
+ }
+
+ /**
+ * Check file is compressed and in ARC GZIP format.
+ *
+ * @param arcFile File to test if its Internet Archive ARC file
+ * GZIP compressed.
+ * @param skipSuffixCheck Set to true if we're not to test on the
+ * '.arc.gz' suffix.
+ *
+ * @return True if this is an Internet Archive GZIP'd ARC file (It begins
+ * w/ the Internet Archive GZIP header). False, without opening the file,
+ * when the suffix is checked and does not match.
+ *
+ * @exception IOException If file does not exist or is not readable.
+ */
+ public static boolean testCompressedARCFile(File arcFile,
+ boolean skipSuffixCheck)
+ throws IOException {
+ boolean compressedARCFile = false;
+ FileUtils.assertReadable(arcFile);
+ // Suffix comparison is case-insensitive.
+ if(!skipSuffixCheck && !arcFile.getName().toLowerCase()
+ .endsWith(COMPRESSED_ARC_FILE_EXTENSION)) {
+ return compressedARCFile;
+ }
+
+ final InputStream is = new FileInputStream(arcFile);
+ try {
+ compressedARCFile = testCompressedARCStream(is);
+ } finally {
+ // Always release the file handle, even if the sniff throws.
+ is.close();
+ }
+ return compressedARCFile;
+ }
+
+ /**
+ * Tests whether a name ends, ignoring case, with either the compressed
+ * or the plain dotted ARC file extension.
+ *
+ * @param arcName Name to test; may be null.
+ * @return True if the name carries an ARC suffix; false for null.
+ */
+ public static boolean isARCSuffix(final String arcName) {
+     if (arcName == null) {
+         return false;
+     }
+     final String lowered = arcName.toLowerCase();
+     return lowered.endsWith(DOT_COMPRESSED_ARC_FILE_EXTENSION)
+         || lowered.endsWith(DOT_ARC_FILE_EXTENSION);
+ }
+
+ /**
+ * Tests whether the passed stream is a gzip stream by reading in the
+ * gzip header. The stream is not repositioned afterwards; that is left
+ * up to the caller.
+ * @param is An InputStream.
+ * @return True if compressed stream.
+ * @throws IOException
+ */
+ public static boolean testCompressedARCStream(final InputStream is)
+ throws IOException {
+     final GzipHeader header;
+     try {
+         header = new GzipHeader(is);
+     } catch (NoGzipMagicException e) {
+         // No gzip magic at all: not compressed.
+         return false;
+     }
+
+     final byte[] extra = header.getFextra();
+     if (extra == null) {
+         // Some old arcs don't have an extra header at all, but they're
+         // still compressed.
+         return true;
+     }
+
+     // Make sure the extra field is the IA GZIP comment.
+     // ARC_GZIP_EXTRA_FIELD includes its own two length bytes, so the
+     // length check subtracts two and the comparison starts at +2.
+     // Some Alexa ARC files changed the extra field slightly after the
+     // first two bytes, so only the leading 'LX' extension is compared.
+     return extra.length >= ARC_GZIP_EXTRA_FIELD.length - 2
+         && extra[0] == ARC_GZIP_EXTRA_FIELD[2]
+         && extra[1] == ARC_GZIP_EXTRA_FIELD[3];
+ }
+
+ /**
+ * Uncompressed arc file reader.
+ * @author stack
+ */
+ public class UncompressedARCReader extends ARCReader {
+     /**
+      * Constructor.
+      * @param f Uncompressed arcfile to read.
+      * @throws IOException
+      */
+     public UncompressedARCReader(final File f)
+     throws IOException {
+         this(f, 0);
+     }
+
+     /**
+      * Constructor.
+      *
+      * @param f Uncompressed arcfile to read.
+      * @param offset Offset at which to position ARCReader.
+      * @throws IOException
+      */
+     public UncompressedARCReader(final File f, final long offset)
+     throws IOException {
+         // Arc file has been tested for existence by time it has come
+         // to here.
+         setIn(new CountingInputStream(getInputStream(f, offset)));
+         skipFully(offset);
+         initialize(f.getAbsolutePath());
+     }
+
+     /**
+      * Constructor.
+      *
+      * @param f Uncompressed arc to read.
+      * @param is InputStream.
+      */
+     public UncompressedARCReader(final String f, final InputStream is) {
+         // Arc file has been tested for existence by time it has come
+         // to here.
+         setIn(new CountingInputStream(is));
+         initialize(f);
+     }
+
+     /**
+      * Skips the given number of bytes of the input stream, or up to end
+      * of stream if that comes first. InputStream.skip may skip fewer
+      * bytes than asked, so a single call cannot be relied on to reach
+      * the requested offset.
+      *
+      * @param count Number of bytes to skip; zero or less skips nothing.
+      * @throws IOException If the underlying stream fails.
+      */
+     private void skipFully(final long count) throws IOException {
+         long remaining = count;
+         while (remaining > 0) {
+             final long skipped = getIn().skip(remaining);
+             if (skipped > 0) {
+                 remaining -= skipped;
+                 continue;
+             }
+             // skip() made no progress; read one byte to tell end of
+             // stream apart from a stream that just declined to skip.
+             if (getIn().read() == -1) {
+                 return;
+             }
+             remaining--;
+         }
+     }
+ }
+
+ /**
+ * Compressed arc file reader. Reads through a GZIPMembersInputStream and
+ * iterates the ARC one gzip member at a time.
+ *
+ * @author stack
+ */
+ public class CompressedARCReader extends ARCReader {
+
+ /**
+ * Constructor.
+ *
+ * @param f
+ * Compressed arcfile to read.
+ * @throws IOException
+ */
+ public CompressedARCReader(final File f) throws IOException {
+ this(f, 0);
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param f Compressed arcfile to read.
+ * @param offset Position at where to start reading file.
+ * @throws IOException
+ */
+ public CompressedARCReader(final File f, final long offset)
+ throws IOException {
+ // Arc file has been tested for existence by time it has come
+ // to here.
+ setIn(new GZIPMembersInputStream(getInputStream(f, offset)));
+ ((GZIPMembersInputStream)getIn()).compressedSeek(offset);
+ // The compressed flag is only set when reading starts at offset 0.
+ setCompressed((offset == 0)); // TODO: does this make sense???
+ initialize(f.getAbsolutePath());
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param f Compressed arcfile.
+ * @param is InputStream to use.
+ * @param atFirstRecord Stored as this reader's alignedOnFirstRecord flag.
+ * @throws IOException
+ */
+ public CompressedARCReader(final String f, final InputStream is,
+ final boolean atFirstRecord)
+ throws IOException {
+ // Arc file has been tested for existence by time it has come
+ // to here.
+ setIn(new GZIPMembersInputStream(is));
+ setCompressed(true);
+ setAlignedOnFirstRecord(atFirstRecord);
+ initialize(f);
+ }
+
+ /**
+ * Get record at passed offset.
+ * Any current record is cleaned up first, then the compressed stream is
+ * repositioned to the offset.
+ *
+ * @param offset
+ * Byte index into arcfile at which a record starts.
+ * @return An ARCRecord reference.
+ * @throws IOException
+ */
+ public ARCRecord get(long offset) throws IOException {
+ cleanupCurrentRecord();
+ ((GZIPMembersInputStream)getIn()).compressedSeek(offset);
+ return createArchiveRecord(getIn(), offset);
+ }
+
+ /**
+ * @return An iterator that yields one record per gzip member of the
+ * underlying stream.
+ */
+ public Iterator iterator() {
+ /**
+ * Override ARCRecordIterator so can base returned iterator on
+ * GzippedInputStream iterator.
+ */
+ return new ArchiveRecordIterator() {
+ private GZIPMembersInputStream gis =
+ (GZIPMembersInputStream)getIn();
+
+ private Iterator gzipIterator = this.gis.memberIterator();
+
+ protected boolean innerHasNext() {
+ return this.gzipIterator.hasNext();
+ }
+
+ protected ArchiveRecord innerNext() throws IOException {
+ InputStream is = this.gzipIterator.next();
+ // NOTE(review): the record offset is the larger of the current
+ // member's start and end -- presumably whichever is up to date
+ // at this point; confirm against GZIPMembersInputStream.
+ return createArchiveRecord(is, Math.max(gis.getCurrentMemberStart(), gis.getCurrentMemberEnd()));
+ }
+ };
+ }
+
+ /**
+ * Consumes the rest of the current record. Leading LINE_SEPARATOR bytes
+ * are expected and ignored. Anything else is counted as trailing bytes
+ * and read to the end; that raises an IOException in strict mode and
+ * is logged as a warning otherwise.
+ *
+ * @param rec Record just read; its header is added to the message when
+ * a current record is set.
+ * @throws IOException On read failure, or on trailing bytes in strict
+ * mode.
+ */
+ protected void gotoEOR(ArchiveRecord rec) throws IOException {
+ int c;
+ // Empty loop body: just swallow the run of line separators.
+ while ((c = getIn().read())==LINE_SEPARATOR);
+ if(c==-1) {
+ return;
+ }
+ // One unexpected byte has already been read above.
+ long skipped = 1;
+ while (getIn().read()>-1) {
+ skipped++;
+ }
+ // Report on system error the number of unexpected characters
+ // at the end of this record.
+ ArchiveRecordHeader meta = (getCurrentRecord() != null)?
+ rec.getHeader(): null;
+ String message = "Record STARTING at " +
+ ((GZIPMembersInputStream)getIn()).getCurrentMemberStart() +
+ " has " + skipped + " trailing byte(s): " +
+ ((meta != null)? meta.toString(): "");
+ if (isStrict()) {
+ throw new IOException(message);
+ }
+ logStdErr(Level.WARNING, message);
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java
new file mode 100644
index 00000000..21bea07c
--- /dev/null
+++ b/src/main/java/org/archive/io/arc/ARCRecord.java
@@ -0,0 +1,835 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.arc;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.StatusLine;
+import org.apache.commons.httpclient.util.EncodingUtil;
+import org.apache.commons.lang.StringUtils;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.ArchiveRecordHeader;
+import org.archive.io.RecoverableIOException;
+import org.archive.util.InetAddressUtil;
+import org.archive.util.LaxHttpParser;
+import org.archive.util.TextUtils;
+
+/**
+ * An ARC file record.
+ * Does not encompass the ARCRecord metadata line, just the record content.
+ * @author stack
+ */
+public class ARCRecord extends ArchiveRecord implements ARCConstants {
+ /**
+ * Http status line object.
+ *
+ * May be null if record is not http.
+ */
+ private StatusLine httpStatus = null;
+
+ /**
+ * Http header bytes.
+ *
+ * If non-null and bytes available, give out its contents before we
+ * go back to the underlying stream.
+ */
+ private InputStream httpHeaderStream = null;
+
+ /**
+ * Http headers.
+ *
+ * Only populated after reading of headers.
+ */
+ private Header [] httpHeaders = null;
+
+ /**
+ * Array of field names.
+ *
+ * Used to initialize headerFieldNameKeys.
+ */
+ private final String [] headerFieldNameKeysArray = {
+ URL_FIELD_KEY,
+ IP_HEADER_FIELD_KEY,
+ DATE_FIELD_KEY,
+ MIMETYPE_FIELD_KEY,
+ LENGTH_FIELD_KEY
+ };
+
+ /**
+ * An array of the header field names found in the ARC file header on
+ * the 3rd line.
+ *
+ * We used to read these in from the arc file first record 3rd line but
+ * now we hardcode them for sake of improved performance.
+ */
+ private final List headerFieldNameKeys =
+ Arrays.asList(this.headerFieldNameKeysArray);
+
+ /**
+ * Http header bytes read while trying to read http header
+ */
+ public long httpHeaderBytesRead = -1;
+
+ /**
+ * record length from metadata line
+ */
+ public long recordDeclaredLength;
+
+ /**
+ * compressed size in bytes; NOTE(review): a primitive long cannot be null -- confirm the value used when source was not compressed
+ */
+ public long compressedBytes;
+
+ /**
+ * actual payload data (not including trailing newline),
+ * should match record-declared-length
+ */
+ public long uncompressedBytes;
+
+ /**
+ * content-length header, iff HTTP and present; NOTE(review): a primitive long cannot be null -- confirm the value used otherwise
+ */
+ public long httpPayloadDeclaredLength;
+
+ /**
+ * actual http payload length, should match http-payload-declared-length
+ */
+ public long httpPayloadActualLength;
+
+ /**
+ * errors encountered reading record
+ */
+ public List errors = new ArrayList();
+
+ /**
+ * ARC record header string; getTokenizedHeaderLine() stores the line's tokens re-joined with " "
+ */
+ private String headerString;
+ public String getHeaderString() {
+ return this.headerString;
+ }
+
+ /**
+ * Constructor. Delegates with bodyOffset 0, digest true, strict false
+ * and parseHttpHeaders true.
+ * @param in Stream cued up to be at the start of the record this instance
+ * is to represent.
+ * @param metaData Meta data.
+ * @throws IOException
+ */
+ public ARCRecord(InputStream in, ArchiveRecordHeader metaData)
+ throws IOException {
+ this(in, metaData, 0, true, false, true);
+ }
+
+ /**
+ * Constructor. If parseHttpHeaders is set, readHttpHeader() is called
+ * during construction and its result kept as the HTTP header stream.
+ * @param in Stream cued up to be at the start of the record this instance
+ * is to represent.
+ * @param metaData Meta data.
+ * @param bodyOffset Offset into the body. Usually 0.
+ * @param digest True if we're to calculate digest for this record. Not
+ * digesting saves about ~15% of cpu during an ARC parse.
+ * @param strict Be strict parsing (Parsing stops if ARC improperly
+ * formatted).
+ * @param parseHttpHeaders True if we are to parse HTTP headers. Costs
+ * about ~20% of CPU during an ARC parse.
+ * @throws IOException
+ */
+ public ARCRecord(InputStream in, ArchiveRecordHeader metaData,
+ int bodyOffset, boolean digest, boolean strict,
+ final boolean parseHttpHeaders)
+ throws IOException {
+ super(in, metaData, bodyOffset, digest, strict);
+ if (parseHttpHeaders) {
+ this.httpHeaderStream = readHttpHeader();
+ }
+ }
+
+ /**
+ * Constructor. Passes a null header to the superclass, then parses the
+ * metadata line(s) from in and installs the result via setHeader().
+ * @param in Stream cued up to be at the start of the record's metadata
+ * this instance is to represent.
+ * @param identifier Identifier for the hosting Reader.
+ * @param offset Current offset into in (Used to keep
+ * position properly aligned). Usually 0.
+ * @param digest True if we're to calculate digest for this record. Not
+ * digesting saves about ~15% of cpu during an ARC parse.
+ * @param strict Be strict parsing (Parsing stops if ARC improperly
+ * formatted).
+ * @param parseHttpHeaders True if we are to parse HTTP headers. Costs
+ * about ~20% of CPU during an ARC parse.
+ * @param isAlignedOnFirstRecord True if this is the first record to be
+ * read from an archive
+ * @param version Version information to be returned to the
+ * ARCReader constructing this record
+ *
+ * @throws IOException
+ */
+ public ARCRecord(InputStream in, final String identifier,
+ final long offset, boolean digest, boolean strict,
+ final boolean parseHttpHeaders,
+ final boolean isAlignedOnFirstRecord, String version)
+ throws IOException {
+ super(in, null, 0, digest, strict);
+ setHeader(parseHeaders(in, identifier, offset, strict, isAlignedOnFirstRecord, version));
+ if (parseHttpHeaders) {
+ this.httpHeaderStream = readHttpHeader();
+ }
+ }
+
+ /**
+ * Constructor. Delegates to the eight-argument constructor with
+ * isAlignedOnFirstRecord false and a null version.
+ * @param in Stream cued up to be at the start of the record's metadata
+ * this instance is to represent.
+ * @param identifier Identifier for the hosting Reader.
+ * @param offset Current offset into in (Used to keep
+ * position properly aligned). Usually 0.
+ * @param digest True if we're to calculate digest for this record. Not
+ * digesting saves about ~15% of cpu during an ARC parse.
+ * @param strict Be strict parsing (Parsing stops if ARC improperly
+ * formatted).
+ * @param parseHttpHeaders True if we are to parse HTTP headers. Costs
+ * about ~20% of CPU during an ARC parse.
+ *
+ * @throws IOException
+ */
+ public ARCRecord(InputStream in, final String identifier,
+ final long offset, boolean digest, boolean strict,
+ final boolean parseHttpHeaders)
+ throws IOException {
+ this(in, identifier, offset, digest, strict, parseHttpHeaders,
+ false, null);
+ }
+
+ /**
+ * Parse this record's ARC header line into metadata and set the body offset.
+ * For the file's first record the two extra version-block lines are consumed too.
+ * @throws IOException If a header line cannot be read or tokenized.
+ */
+ private ArchiveRecordHeader parseHeaders(final InputStream in,
+ final String identifier, final long offset, final boolean strict,
+ final boolean isAlignedOnFirstRecord, String version)
+ throws IOException {
+ ArrayList<String> firstLineValues = new ArrayList<String>(20);
+ getTokenizedHeaderLine(in, firstLineValues);
+ final String recordHeaderLine = this.headerString;
+ int bodyOffset = 0;
+ if (offset == 0 && isAlignedOnFirstRecord) {
+ // If offset is zero and we were aligned at first record on creation
+ // (See #alignedOnFirstRecord), no records have been read yet and this
+ // is the record of ARC file meta info. It's special: in ARC versions
+ // 1.x it has three lines of meta info. We've just read the first. The
+ // second has misc. info, of which we use the first two fields to build
+ // the version string. The third is the list of field names. Example:
+ //
+ // filedesc://testIsBoundary-JunitIAH200401070157520.arc 0.0.0.0 \\
+ // 20040107015752 text/plain 77
+ // 1 0 InternetArchive
+ // URL IP-address Archive-date Content-type Archive-length
+ ArrayList<String> secondLineValues = new ArrayList<String>(20);
+ bodyOffset += getTokenizedHeaderLine(in, secondLineValues);
+ version = secondLineValues.get(0) + "." + secondLineValues.get(1);
+ // Just read over the 3rd line. We used to parse it and use
+ // values found here but now we just hardcode them to avoid
+ // having to read this 3rd line even for random arc file accesses.
+ bodyOffset += getTokenizedHeaderLine(in, null);
+ // Both calls above overwrote headerString; restore this record's own line.
+ this.headerString = recordHeaderLine;
+ }
+ setBodyOffset(bodyOffset);
+
+ return computeMetaData(this.headerFieldNameKeys, firstLineValues, version, offset, identifier);
+ }
+
+ /**
+ * Get a record header line as list of tokens.
+ *
+ * We keep reading till we find a LINE_SEPARATOR or we reach the end
+ * of file w/o finding a LINE_SEPARATOR or the line length is crazy.
+ * Side effect: this.headerString is overwritten on every call.
+ * @param stream InputStream to read from.
+ * @param list List that gets filled w/ string tokens, or null to just read over the line.
+ * @return Count of bytes read from stream (separators and skipped empty lines included).
+ * @exception IOException If problem reading stream or no line separator
+ * found or EOF before EOL or we didn't get minimum header fields.
+ */
+ private int getTokenizedHeaderLine(final InputStream stream,
+ List list) throws IOException {
+ // Preallocate usual line size.
+ StringBuilder buffer = new StringBuilder(2048 + 20);
+ int read = 0;
+ int previous = -1;
+ for (int c = -1; true;) {
+ previous = c;
+ c = stream.read();
+ if (c == -1) {
+ throw new RecoverableIOException("Hit EOF before header EOL.");
+ }
+ c &= 0xff;
+ read++;
+ if (read > MAX_HEADER_LINE_LENGTH) {
+ throw new IOException("Header line longer than max allowed " +
+ " -- " + String.valueOf(MAX_HEADER_LINE_LENGTH) +
+ " -- or passed buffer doesn't contain a line (Read: " +
+ buffer.length() + "). Here's" +
+ " some of what was read: " +
+ buffer.substring(0, Math.min(buffer.length(), 256)));
+ }
+
+ if (c == LINE_SEPARATOR) {
+ if (buffer.length() == 0) {
+ // Buffer is empty: blank line, or (NOTE review) EOL right after a field separator. Skip it and try again.
+ continue;
+ }
+
+ if (list != null) {
+ list.add(buffer.toString());
+ }
+ // LOOP TERMINATION.
+ break;
+ } else if (c == HEADER_FIELD_SEPARATOR) {
+ if (!isStrict() && previous == HEADER_FIELD_SEPARATOR) {
+ // Early ARCs sometimes had multiple spaces between fields.
+ continue;
+ }
+ if (list != null) {
+ list.add(buffer.toString());
+ }
+ // reset to empty
+ buffer.setLength(0);
+ } else {
+ buffer.append((char)c);
+ }
+ }
+
+ // List must have at least 3 elements in it and no more than 100. If
+ // it has other than this, then bogus parse.
+ if (list != null && (list.size() < 3 || list.size() > 100)) {
+ throw new IOException("Unparseable header line: " + list);
+ }
+
+ // save header tokens re-joined with single spaces (presumably null when list is null -- see StringUtils.join)
+ this.headerString = StringUtils.join(list," ");
+
+ return read;
+ }
+
+ /**
+ * Compute metadata fields.
+ *
+ * Here we check the meta field has right number of items in it.
+ *
+ * @param keys Keys to use composing headerFields map.
+ * @param values Values to set into the headerFields map.
+ * @param v The version of this ARC file.
+ * @param offset Offset into arc file.
+ *
+ * @return Metadata structure for this record.
+ *
+ * @exception IOException If no. of keys doesn't match no. of values.
+ */
+ private ARCRecordMetaData computeMetaData(List keys,
+ List values, String v, long offset, final String identifier)
+ throws IOException {
+ if (keys.size() != values.size()) {
+ List originalValues = values;
+ if (!isStrict()) {
+ values = fixSpaceInURL(values, keys.size());
+ // If values still doesn't match key size, try and do
+ // further repair.
+ if (keys.size() != values.size()) {
+ // Early ARCs had a space in mimetype.
+ if (values.size() == (keys.size() + 1) &&
+ values.get(4).toLowerCase().startsWith("charset=")) {
+ List nuvalues =
+ new ArrayList(keys.size());
+ nuvalues.add(0, values.get(0));
+ nuvalues.add(1, values.get(1));
+ nuvalues.add(2, values.get(2));
+ nuvalues.add(3, values.get(3) + values.get(4));
+ nuvalues.add(4, values.get(5));
+ values = nuvalues;
+ } else if((values.size() + 1) == keys.size() &&
+ isLegitimateIPValue(values.get(1)) &&
+ isDate(values.get(2)) && isNumber(values.get(3))) {
+ // Mimetype is empty.
+ List nuvalues =
+ new ArrayList