From 3efbffdd3628b165616707e7fa849380460618b2 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Thu, 13 Mar 2014 18:59:12 -0700
Subject: [PATCH 001/252] avoid pulling in logback, which is wreaking havoc on
logging in apps using this library
---
pom.xml | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 9c8698c7..4e635475 100644
--- a/pom.xml
+++ b/pom.xml
@@ -48,7 +48,6 @@
git@github.com:iipc/webarchive-commons.git
-
UTF-8
${maven.build.timestamp}
@@ -165,6 +164,12 @@
dsiutils
2.0.12
compile
+
+
+ ch.qos.logback
+ logback-classic
+
+
org.apache.httpcomponents
From b45ea54b82c363d0987ee32893cb33c96ea0f701 Mon Sep 17 00:00:00 2001
From: Andrew Jackson
Date: Fri, 4 Apr 2014 12:02:49 +0100
Subject: [PATCH 002/252] Added potential test case and lots of debug logging.
---
.../java/org/archive/io/ArchiveRecord.java | 5 +
.../org/archive/io/arc/ARCReaderFactory.java | 2 +-
.../java/org/archive/io/arc/ARCRecord.java | 4 +
.../archive/io/arc/ARCReaderFactoryTest.java | 61 +
...080430204825-00000-blackbook-truncated.arc | 1006 +++++++++++++++++
5 files changed, 1077 insertions(+), 1 deletion(-)
create mode 100644 src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java
create mode 100644 src/test/resources/org/archive/format/arc/IAH-20080430204825-00000-blackbook-truncated.arc
diff --git a/src/main/java/org/archive/io/ArchiveRecord.java b/src/main/java/org/archive/io/ArchiveRecord.java
index 63bfe628..a3cab4ba 100644
--- a/src/main/java/org/archive/io/ArchiveRecord.java
+++ b/src/main/java/org/archive/io/ArchiveRecord.java
@@ -292,10 +292,13 @@ public String getDigestStr() {
}
protected void incrementPosition() {
+ System.err.println("incrementPostion()");
this.position++;
}
protected void incrementPosition(final long incr) {
+ new Exception().printStackTrace();
+ System.err.println("incrementPostion("+incr+")");
this.position += incr;
}
@@ -404,6 +407,8 @@ public boolean hasContentHeaders() {
}
protected void setBodyOffset(int bodyOffset) {
+ new Exception().printStackTrace();
+ System.err.println("setBodyOffset("+bodyOffset+")");
this.position = bodyOffset;
}
}
diff --git a/src/main/java/org/archive/io/arc/ARCReaderFactory.java b/src/main/java/org/archive/io/arc/ARCReaderFactory.java
index e7dc1625..ce12c4bb 100644
--- a/src/main/java/org/archive/io/arc/ARCReaderFactory.java
+++ b/src/main/java/org/archive/io/arc/ARCReaderFactory.java
@@ -147,7 +147,7 @@ protected ArchiveReader getArchiveReader(final String arc,
possiblyWrapped.mark(100);
boolean compressed = testCompressedARCStream(possiblyWrapped);
possiblyWrapped.reset();
-
+
if (compressed) {
return new CompressedARCReader(arc, possiblyWrapped, atFirstRecord);
} else {
diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java
index 21bea07c..7f3bf653 100644
--- a/src/main/java/org/archive/io/arc/ARCRecord.java
+++ b/src/main/java/org/archive/io/arc/ARCRecord.java
@@ -344,6 +344,8 @@ private int getTokenizedHeaderLine(final InputStream stream,
// save verbatim header String
this.headerString = StringUtils.join(list," ");
+ System.err.println("This "+this.headerString);
+
return read;
}
@@ -589,6 +591,7 @@ private InputStream readHttpHeader() throws IOException {
statusLine = EncodingUtil.getString(statusBytes, 0,
statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
+ System.err.println("statusLine: "+statusLine);
// If a null or DELETED break immediately
if ((statusLine == null) || statusLine.startsWith("DELETED")) {
@@ -602,6 +605,7 @@ private InputStream readHttpHeader() throws IOException {
}
// Add bytes read to error "offset" to add to position
+ System.err.println("BYTES: "+new String(statusBytes));
errOffset += statusBytes.length;
}
diff --git a/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java b/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java
new file mode 100644
index 00000000..090ccef7
--- /dev/null
+++ b/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java
@@ -0,0 +1,61 @@
+package org.archive.io.arc;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.InputStream;
+import java.io.RandomAccessFile;
+
+import org.archive.io.ArchiveReader;
+import org.archive.io.ArchiveRecord;
+
+import junit.framework.TestCase;
+
+/**
+ * Exercises {@link ARCReaderFactory} on an input stream that is already positioned at a record offset.
+ * Based on https://github.com/iipc/openwayback/pull/104/files
+ *
+ * @author csr@statsbiblioteket.dk (Colin Rosenthal)
+ *
+ */
+public class ARCReaderFactoryTest extends TestCase {
+
+ private File testfile1 = new File("src/test/resources/org/archive/format/arc/IAH-20080430204825-00000-blackbook-truncated.arc");
+
+ /**
+ * Reads two records of the uncompressed ARC test file by byte offset and checks their URLs, for issue
+ * https://github.com/iipc/openwayback/issues/101
+ * @throws Exception propagated from offsetResourceTest
+ */
+ public void testGetResource() throws Exception {
+ this.offsetResourceTest(testfile1, 1515, "archive.org/robots.txt" );
+ this.offsetResourceTest(testfile1, 36420, "archive.org/services/collection-rss.php" );
+ }
+
+ private void offsetResourceTest( File testfile, long offset, String uri ) throws Exception {
+ RandomAccessFile raf = new RandomAccessFile(testfile, "r");
+ raf.seek(offset);
+ InputStream is = new FileInputStream(raf.getFD());
+ String fPath = testfile.getAbsolutePath();
+ ArchiveReader reader = ARCReaderFactory.get(fPath, is, false);
+ // Alternative that works (kept for reference): open by file and offset instead of a pre-seeked stream.
+ //ArchiveReader reader = ARCReaderFactory.get(testfile, offset);
+ ArchiveRecord record = reader.get();
+ System.out.println("Position:"+record.getPosition());
+
+ final String url = record.getHeader().getUrl();
+ System.out.println("Got URL: "+url);
+ assertEquals("URL of record is not as expected.", uri, url);
+
+ final long position = record.getPosition();
+ final long recordLength = record.getHeader().getLength();
+ System.out.println("Position:"+position);
+ System.out.println("Length:"+recordLength);
+ assertTrue("Position " + position + " is after end of record " + recordLength, position <= recordLength);
+
+ // Clean up. NOTE(review): not reached if an assertion above fails; 'is' and 'reader' are not closed explicitly.
+ if( raf != null )
+ raf.close();
+ }
+
+}
diff --git a/src/test/resources/org/archive/format/arc/IAH-20080430204825-00000-blackbook-truncated.arc b/src/test/resources/org/archive/format/arc/IAH-20080430204825-00000-blackbook-truncated.arc
new file mode 100644
index 00000000..3cbffb81
--- /dev/null
+++ b/src/test/resources/org/archive/format/arc/IAH-20080430204825-00000-blackbook-truncated.arc
@@ -0,0 +1,1006 @@
+filedesc://IAH-20080430204825-00000-blackbook-truncated.arc 0.0.0.0 20080430204825 text/plain 1300
+1 1 InternetArchive
+URL IP-address Archive-date Content-type Archive-length
+
+
+Heritrix @VERSION@ http://crawler.archive.org
+blackbook
+192.168.1.13
+archive.org-shallow
+archive.org shallow
+Admin
+2008-04-30T20:48:24+00:00
+Mozilla/5.0 (compatible; heritrix/1.14.0 +http://crawler.archive.org)
+archive-crawler-agent@lists.sourceforge.net
+classic
+ARC file version 1.1
+http://www.archive.org/web/researcher/ArcFileFormat.php
+
+dns:www.archive.org 68.87.76.178 20080430204825 text/dns 56
+20080430204825
+www.archive.org. 589 IN A 207.241.229.39
+http://www.archive.org/robots.txt 207.241.229.39 20080430204825 text/plain 782
+HTTP/1.1 200 OK
+Date: Wed, 30 Apr 2008 20:48:24 GMT
+Server: Apache/2.0.54 (Ubuntu) PHP/5.0.5-2ubuntu1.4 mod_ssl/2.0.54 OpenSSL/0.9.7g
+Last-Modified: Sat, 02 Feb 2008 19:40:44 GMT
+ETag: "47c3-1d3-11134700"
+Accept-Ranges: bytes
+Content-Length: 467
+Connection: close
+Content-Type: text/plain; charset=UTF-8
+
+##############################################
+#
+# Welcome to the Archive!
+#
+##############################################
+# Please crawl our files.
+# We appreciate if you can crawl responsibly.
+# Stay open!
+##############################################
+User-agent: *
+Disallow: /nothing---please-crawl-us--
+
+# slow down the ask jeeves crawler which was hitting our SE a little too fast
+# via collection pages. --Feb2008 tracey--
+User-agent: Teoma
+Crawl-Delay: 10
+http://www.archive.org/ 207.241.229.39 20080430204826 text/html 680
+HTTP/1.1 200 OK
+Date: Wed, 30 Apr 2008 20:48:25 GMT
+Server: Apache/2.0.54 (Ubuntu) PHP/5.0.5-2ubuntu1.4 mod_ssl/2.0.54 OpenSSL/0.9.7g
+Last-Modified: Wed, 09 Jan 2008 23:18:29 GMT
+ETag: "47ac-16e-4f9e5b40"
+Accept-Ranges: bytes
+Content-Length: 366
+Connection: close
+Content-Type: text/html; charset=UTF-8
+
+
+
+
+
+
+
+
+Please visit our website at:
+http://www.archive.org
+
+
+http://www.archive.org/index.php 207.241.229.39 20080430204826 text/html 29000
+HTTP/1.1 200 OK
+Date: Wed, 30 Apr 2008 20:48:25 GMT
+Server: Apache/2.0.54 (Ubuntu) PHP/5.0.5-2ubuntu1.4 mod_ssl/2.0.54 OpenSSL/0.9.7g
+X-Powered-By: PHP/5.0.5-2ubuntu1.4
+Set-Cookie: PHPSESSID=657fa9749e9426f2ffa75f14b54ed4ac; path=/; domain=.archive.org
+Connection: close
+Content-Type: text/html; charset=UTF-8
+
+
+
+
+
+
+ Internet Archive
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Upload Anonymous User ( or )
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+The Internet Archive is building a digital library of Internet
+ sites and other cultural artifacts in digital form. Like a paper
+ library, we provide free access to researchers, historians,
+ scholars, and the general public.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
A Few Good G-Men Randall Glass, the maker of "Warthog Jump," re-creates in "A Few Good G-Men" an entire scene from...
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+http://www.archive.org/images/logoc.jpg 207.241.229.39 20080430204829 image/jpeg 1963
+HTTP/1.1 200 OK
+Date: Wed, 30 Apr 2008 20:48:28 GMT
+Server: Apache/2.0.54 (Ubuntu) PHP/5.0.5-2ubuntu1.4 mod_ssl/2.0.54 OpenSSL/0.9.7g
+Last-Modified: Mon, 16 Jun 2003 22:28:51 GMT
+ETag: "34dc-67e-2ed02ec0"
+Accept-Ranges: bytes
+Content-Length: 1662
+Connection: close
+Content-Type: image/jpeg
+
+ JFIF d d Adobe ImageReady Ducky <