diff --git a/src/main/java/org/archive/io/arc/ARCReader.java b/src/main/java/org/archive/io/arc/ARCReader.java
index 7f85cc2a..2809d5d4 100644
--- a/src/main/java/org/archive/io/arc/ARCReader.java
+++ b/src/main/java/org/archive/io/arc/ARCReader.java
@@ -164,7 +164,7 @@ protected ARCRecord createArchiveRecord(InputStream is, long offset)
}
return (ARCRecord)getCurrentRecord();
}
-
+
/**
* Returns version of this ARC file. Usually read from first record of ARC.
* If we're reading without having first read the first record -- e.g.
diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java
index 21bea07c..9aec90f5 100644
--- a/src/main/java/org/archive/io/arc/ARCRecord.java
+++ b/src/main/java/org/archive/io/arc/ARCRecord.java
@@ -137,7 +137,7 @@ public class ARCRecord extends ArchiveRecord implements ARCConstants {
public String getHeaderString() {
return this.headerString;
}
-
+
/**
* Constructor.
*
@@ -233,7 +233,7 @@ public ARCRecord(InputStream in, final String identifier,
this(in, identifier, offset, digest, strict, parseHttpHeaders,
false, null);
}
-
+
private ArchiveRecordHeader parseHeaders(final InputStream in,
final String identifier, final long offset, final boolean strict,
final boolean isAlignedOnFirstRecord, String version)
@@ -241,7 +241,7 @@ private ArchiveRecordHeader parseHeaders(final InputStream in,
ArrayList firstLineValues = new ArrayList(20);
getTokenizedHeaderLine(in, firstLineValues);
-
+
int bodyOffset = 0;
if (offset == 0 && isAlignedOnFirstRecord) {
// If offset is zero and we were aligned at first record on
@@ -343,7 +343,7 @@ private int getTokenizedHeaderLine(final InputStream stream,
// save verbatim header String
this.headerString = StringUtils.join(list," ");
-
+
return read;
}
@@ -569,7 +569,6 @@ private InputStream readHttpHeader() throws IOException {
getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
return null;
}
-
String statusLine;
byte[] statusBytes;
int eolCharCount = 0;
@@ -597,14 +596,19 @@ private InputStream readHttpHeader() throws IOException {
// If it's actually the status line, break, otherwise continue skipping any
// previous header values
- if (!statusLine.contains(":") && StatusLine.startsWithHTTP(statusLine)) {
+ // The old check also required !statusLine.contains(":"), but RFC 2616 section 6.1 allows ':' in the Reason-Phrase
+ if (StatusLine.startsWithHTTP(statusLine)) {
break;
}
-
+
+ if (statusLine.replace("\r", "").isEmpty()) { // No more header lines
+ break;
+ }
+
// Add bytes read to error "offset" to add to position
errOffset += statusBytes.length;
}
-
+
if (errOffset > 0) {
this.incrementPosition(errOffset);
}
diff --git a/src/test/java/org/archive/io/SubInputStream.java b/src/test/java/org/archive/io/SubInputStream.java
new file mode 100644
index 00000000..4dcb15f7
--- /dev/null
+++ b/src/test/java/org/archive/io/SubInputStream.java
@@ -0,0 +1,222 @@
+/*
+ * $Header: $
+ * $Revision$
+ * $Date$
+ *
+ * ====================================================================
+ *
+ * Copyright 1999-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ *
+ */
+/*
+ *
+ */
+package org.archive.io;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Encapsulates another stream, keeping track of local as well as global offsets.
+ * The SubInputStream has a maximum length, and close() fast-forwards the wrapped stream
+ * to that point (as far as the wrapped stream reports bytes as available).
+ *
+ * Note: Calling close on the SubInputStream does not close the wrapped stream.
+ * Note 2: This implementation relies on the wrapped InputStream not to return 0
+ * in {@link java.io.InputStream#available()} until that stream has been depleted.
+ */
+public class SubInputStream extends InputStream {
+ private final InputStream inner;
+ private final long globalPosOrigo;
+
+ private long length;
+ private long mark = -1;
+ private long pos = 0;
+
+ /**
+ * Wraps the inner stream without an effective limit (Long.MAX_VALUE) on bytes read; only the position is tracked.
+ *
+ * Note: The length can be specified later with {@link #setLength(long)}.
+ * @param inner data source; the global start position is taken to be 0.
+ */
+ public SubInputStream(InputStream inner) {
+ this(inner, Long.MAX_VALUE, 0);
+ }
+ /**
+ * Wraps the inner InputStream with a max on bytes read and a global start position of 0.
+ * @param inner data source.
+ * @param length the maximum number of bytes that can be read from inner through this wrapper.
+ */
+ public SubInputStream(InputStream inner, long length) {
+ this(inner, length, 0);
+ }
+ /**
+ * Wraps the inner InputStream with a max on bytes read and an explicit global start position.
+ * @param inner data source.
+ * @param length the maximum number of bytes that can be read from inner through this wrapper.
+ * @param globalPosition the offset added to the local position by {@link #getGlobalPosition()}.
+ */
+ public SubInputStream(InputStream inner, long length, long globalPosition) {
+ this.inner = inner;
+ globalPosOrigo = globalPosition;
+ this.length = length;
+ }
+
+ /**
+ * @return the local position: the offset within this sub-stream, starting at 0 when the wrapper was created.
+ */
+ public long getPosition() {
+ return pos;
+ }
+
+ /**
+ * @return the local position plus the global start offset given at construction (0 if none was given).
+ */
+ public long getGlobalPosition() {
+ return globalPosOrigo + pos;
+ }
+
+    public void setLength(long length) { // sets the virtual length, i.e. the local position at which reads report EOF
+        if (length < pos) { // length == pos is valid: the sub-stream is then simply exhausted
+            throw new IllegalStateException(
+                    "The position is " + pos + " which is past the allowed virtual length " + length);
+        }
+        this.length = length;
+    }
+
+ public long getLength() { // the virtual length: Long.MAX_VALUE unless set via a constructor or setLength(long)
+ return length;
+ }
+
+    /**
+     * Reads through the next {@code '\n'} and decodes the bytes as UTF-8, with every trailing
+     * carriage return {@code '\r'} and newline {@code '\n'} removed.
+     * @return the decoded line, or {@code null} when the stream is already at EOF.
+     * @throws IOException if there was a problem reading bytes from inner.
+     */
+    public String readLine() throws IOException {
+        final byte[] raw = readLineBytes();
+        if (raw == null) {
+            return null;
+        }
+        int end = raw.length; // exclusive end of the payload; not called 'length' so the field is not shadowed
+        while (end > 0 && (raw[end - 1] == '\r' || raw[end - 1] == '\n')) {
+            end--;
+        }
+        return new String(raw, 0, end, "utf-8");
+    }
+    /**
+     * Reads bytes up to and including the next {@code '\n'}, or to EOF if no newline remains.
+     * @return the raw bytes of the line with any delimiting {@code '\n'} kept, or {@code null}
+     *         when not a single byte could be read because the stream is at EOF.
+     * @throws IOException if there was a problem reading bytes from inner.
+     */
+    public byte[] readLineBytes() throws IOException {
+        final ByteArrayOutputStream line = new ByteArrayOutputStream();
+        for (int next = read(); next != -1; next = read()) {
+            line.write(next);
+            if (next == '\n') {
+                break;
+            }
+        }
+        return line.size() == 0 ? null : line.toByteArray(); // an empty buffer can only mean the first read() hit EOF
+    }
+
+ /* Delegates from inner InputStream */
+
+ @Override
+ public int read() throws IOException {
+ if (available() == 0) { // virtual length reached, or inner reports nothing available: treated as EOF
+ return -1;
+ }
+ int c = inner.read();
+ if (c != -1) { // only count a byte that was actually delivered
+ pos++;
+ }
+ return c;
+ }
+
+ @Override
+ public int read(byte[] b) throws IOException {
+ return read(b, 0, b.length); // delegate so the length cap and position tracking live in one place
+ }
+
+ @Override
+ public int read(byte[] b, int off, int len) throws IOException {
+ if (len == 0) {
+ return 0;
+ }
+ if (available() == 0) { // same EOF rule as the single-byte read()
+ return -1;
+ }
+ len = Math.min(len, available()); // cap the request at what available() reports
+ int r = inner.read(b, off, len);
+ if (r != -1) { // not EOF: advance the local position by the bytes actually read
+ pos += r;
+ }
+ return r;
+ }
+
+    @Override
+    public long skip(long n) throws IOException {
+        final long wanted = Math.min(n, available()); // never skip beyond what available() reports
+        if (wanted <= 0) {
+            return 0;
+        }
+        final long skipped = inner.skip(wanted);
+        pos += skipped; // track what inner really skipped, which may be less than requested
+        return skipped;
+    }
+
+ @Override
+ public int available() throws IOException {
+ return (int) Math.min(inner.available(), length - pos); // capped by the bytes left before the virtual length; the result never exceeds inner.available(), an int
+ }
+
+    @SuppressWarnings("ResultOfMethodCallIgnored")
+    @Override
+    public void close() throws IOException { // fast-forwards to the virtual length; does NOT close the wrapped stream
+        while (pos < length && skip(length - pos) > 0) {
+            // keep going: a single skip() may advance fewer bytes than requested; stop once no progress is made
+        }
+    }
+
+ @Override
+ public void mark(int readlimit) {
+ mark = pos; // remember the local position so reset() can restore it alongside inner's own mark
+ inner.mark(readlimit);
+ }
+
+ @Override
+ public void reset() throws IOException {
+ if (mark == -1) { // -1 is the field's initial value, i.e. mark(int) was never called
+ throw new IOException("A mark must be set before reset is called");
+ }
+ inner.reset(); // rewind the wrapped stream first; pos is only restored if this does not throw
+ pos = mark;
+ }
+
+ @Override
+ public boolean markSupported() {
+ return inner.markSupported(); // marking is delegated, so support depends entirely on the wrapped stream
+ }
+}
diff --git a/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java b/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java
index 0721f795..71c444b9 100644
--- a/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java
+++ b/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java
@@ -1,15 +1,14 @@
package org.archive.io.arc;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.InputStream;
-import java.io.RandomAccessFile;
+import java.io.*;
+import java.net.URL;
+import java.util.List;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;
import junit.framework.TestCase;
+import org.archive.io.SubInputStream;
/**
*
@@ -21,6 +20,9 @@
public class ARCReaderFactoryTest extends TestCase {
private File testfile1 = new File("src/test/resources/org/archive/format/arc/IAH-20080430204825-00000-blackbook-truncated.arc");
+ //private File testfile_nl = new File("src/test/resources/org/archive/format/arc/137542-153-20111129020925-00316-kb-prod-har-003.kb.dk_truncated.arc");
+ private File testfile_nl = getResource(
+ "org/archive/format/arc/137542-153-20111129020925-00316-kb-prod-har-003.kb.dk_truncated.arc");
/**
* Test reading uncompressed arcfile for issue
@@ -53,5 +55,63 @@ private void offsetResourceTest( File testfile, long offset, String uri ) throws
if( raf != null )
raf.close();
}
-
+
+ public void testBaseSampleARC() throws IOException {
+ testARCReaderIteration(testfile1, 9, 7); // 9 records expected, 7 of them with a status code other than -1
+ }
+ /*
+ This failed with the old {@code ARCRecord#readHttpHeader}, which rejected status lines containing ':' (see the 404 record in the sample).
+ */
+ public void testNewlinedSampleARC() throws IOException {
+ testARCReaderIteration(testfile_nl, 4, 3); // Status has 2*200 & 1*404
+ }
+
+ // Sanity check of the sample file itself: ARCTestHelper parses the record headers without using ARCReader
+ public void testBaseSampleIntegrity() throws IOException {
+ List urls = ARCTestHelper.getURLs(testfile1);
+ assertEquals("The correct number of URLs should be extracted", 9, urls.size());
+ }
+ public void testVerifyNewlinedSampleIntegrity() throws IOException {
+ List urls = ARCTestHelper.getURLs(testfile_nl); // one entry per record header found by the ARCReader-independent helper
+ assertEquals("The correct number of URLs should be extracted", 4, urls.size());
+ }
+
+ public void testNewlinedSampleARCContentLength() throws IOException {
+ ARCTestHelper.testARCContentLength(testfile_nl); // throws IllegalStateException when a Content-Length header disagrees with the bytes left in its record
+ }
+ public void testBaseSampleARCContentLength() throws IOException {
+ ARCTestHelper.testARCContentLength(testfile1); // throws IllegalStateException when a Content-Length header disagrees with the bytes left in its record
+ }
+// public void testLocalSampleARCContentLength() throws IOException {
+// ARCTestHelper.testARCContentLength(
+// new File("/home/te/tmp/warc/137542-153-20111129020925-00316-kb-prod-har-003.kb.dk.arc"));
+// }
+
+    // Iterates every record of arc and asserts the record count and the number of records carrying a status code
+    private void testARCReaderIteration(File arc, int expectedRecords, int hasStatus) throws IOException {
+        ARCReader reader = ARCReaderFactory.get(arc);
+        int recordCount = 0;
+        int okCount = 0;
+        for (ArchiveRecord record : reader) {
+            if (((ARCRecord)record).getStatusCode() != -1) { // counts ANY status (200, 404, ...); -1 presumably means none was parsed - confirm in ARCRecord
+                okCount++;
+            }
+            SubInputStream sub = new SubInputStream(record);
+            sub.skip(record.getHeader().getContentBegin());
+            // For manual inspection of the first content line: System.out.println(record.getPosition() + "> " + sub.readLine());
+            sub.close();
+            recordCount++;
+        }
+        reader.close();
+        assertEquals("There should be the right number of records in " + arc, expectedRecords, recordCount);
+        assertEquals("There should be the right number of records with a HTTP status code in " + arc, hasStatus, okCount);
+    }
+
+    private static File getResource(String resource) { // resolves a class-path resource to a local File, failing fast if it is missing
+        URL url = Thread.currentThread().getContextClassLoader().getResource(resource);
+        if (url == null) {
+            throw new RuntimeException("The resource '" + resource + "' could not be located in the class path");
+        }
+        return new File(java.net.URI.create(url.toExternalForm())); // File(URI) decodes %-escapes such as %20, which url.getFile() leaves encoded
+    }
}
diff --git a/src/test/java/org/archive/io/arc/ARCTestHelper.java b/src/test/java/org/archive/io/arc/ARCTestHelper.java
new file mode 100644
index 00000000..9cedbfe4
--- /dev/null
+++ b/src/test/java/org/archive/io/arc/ARCTestHelper.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.archive.io.arc;
+
+import org.archive.io.SubInputStream;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Test helpers that parse uncompressed ARC files directly: URL listing and Content-Length cross-checks.
+ */
+public class ARCTestHelper {
+
+    // Extracts the URL (the first header field) of every record in an uncompressed ARC file
+    public static List<String> getURLs(File arc) throws IOException {
+        if (!arc.exists()) {
+            throw new IOException("The file '" + arc + "' does not exist");
+        }
+        // Header layout: URL IP-address Archive-date(14 digits) Content-type Archive-length
+        final Pattern headerLine = Pattern.compile("^(.+) [0-9]{14} .*");
+        List<String> urls = new ArrayList<String>();
+        InputStream fis = new FileInputStream(arc);
+        try {
+            SubInputStream in = new SubInputStream(fis);
+            String line = in.readLine();
+            while (line != null) {
+                Matcher matcher = headerLine.matcher(line);
+                if (!matcher.find()) {
+                    throw new IllegalArgumentException("Unable to extract URL from '" + line + "'");
+                }
+                // A successful match guarantees at least one space; the URL is everything before the first one
+                urls.add(line.substring(0, line.indexOf(' ')));
+                final long delta = getDelta(line);
+                if (in.skip(delta) != delta) {
+                    System.err.println("Could not skip " + delta + " bytes");
+                }
+                // Skip the newline after content; EOF here means this was the last record
+                if (in.read() == -1) {
+                    break;
+                }
+                line = in.readLine();
+            }
+        } finally {
+            // SubInputStream.close() never closes the wrapped stream, so close the file itself - on error paths too
+            fis.close();
+        }
+        return urls;
+    }
+
+    // Checks that the two content lengths (ARC and server-issued) for each record match
+    public static void testARCContentLength(File arc) throws IOException {
+        if (!arc.exists()) {
+            throw new IOException("The file '" + arc + "' does not exist");
+        }
+        final Pattern CONTENT_LENGTH = Pattern.compile("Content-Length: ([0-9]+)[^0-9]*");
+        InputStream fis = new FileInputStream(arc);
+        try { // the finally below releases the file handle on the exception paths as well
+            SubInputStream out = new SubInputStream(fis);
+            String outline;
+            while ((outline = out.readLine()) != null) {
+                if (outline.isEmpty()) {
+                    throw new IllegalStateException("Got unexpected empty line. Next line is\n" + out.readLine());
+                }
+                final long delta = getDelta(outline);
+                SubInputStream sub = new SubInputStream(out, delta, out.getPosition()); // confined to this record's delta bytes
+                long contentLength = -1;
+                String inline;
+                while ((inline = sub.readLine()) != null) {
+                    Matcher clMatcher = CONTENT_LENGTH.matcher(inline);
+                    if (clMatcher.matches()) {
+                        contentLength = Long.parseLong(clMatcher.group(1));
+                    }
+                    if (inline.isEmpty()) { // blank line ends the HTTP headers; readLine() has already stripped any '\r'
+                        break;
+                    }
+                }
+                if (contentLength != -1 && contentLength != sub.available()) {
+                    throw new IllegalStateException(String.format(
+                            "sub_pos=%6d, sub_length=%6d, sub_available=%6d, Content-Length=%6d, header=%s",
+                            sub.getPosition(), sub.getLength(), sub.available(), contentLength, outline));
+                }
+                sub.close();
+                // Newline delimiter
+                if (out.read() == -1) {
+                    break;
+                }
+            }
+        } finally {
+            fis.close();
+        }
+    }
+
+    // Parses the trailing Archive-length field, e.g. 79022 in "http://www.example.com/somepath 192.168.10.12 20111129020924 text/html 79022"
+    private static long getDelta(String line) {
+        final String[] tokens = line.split(" "); // yields zero tokens when the line holds nothing but spaces
+        try {
+            return Long.parseLong(tokens.length == 0 ? "" : tokens[tokens.length - 1]);
+        } catch (NumberFormatException e) {
+            throw new IllegalArgumentException("Unable to extract delta from line\n" + line, e); // keep the cause for diagnosis
+        }
+    }
+}
diff --git a/src/test/resources/org/archive/format/arc/137542-153-20111129020925-00316-kb-prod-har-003.kb.dk_truncated.arc b/src/test/resources/org/archive/format/arc/137542-153-20111129020925-00316-kb-prod-har-003.kb.dk_truncated.arc
new file mode 100644
index 00000000..9a705995
--- /dev/null
+++ b/src/test/resources/org/archive/format/arc/137542-153-20111129020925-00316-kb-prod-har-003.kb.dk_truncated.arc
@@ -0,0 +1,83 @@
+filedesc://137542-153-20111129020925-00316-kb-prod-har-003.kb.dk.arc.open 0.0.0.0 20111129020925 text/plain 1287
+1 1 InternetArchive
+URL IP-address Archive-date Content-type Archive-length
+
+
+Heritrix 1.14.4 http://crawler.archive.org
+kb-prod-har-003.kb.dk
+130.226.228.74
+default_orderxml
+Default Profile
+Admin
+2008-01-18T11:12:17+00:00
+Mozilla/5.0 (compatible; heritrix/1.12.1b +http://netarkivet.dk/website/info.html)
+netarkivet-svar@netarkivet.dk
+ignore
+ARC file version 1.1
+http://www.archive.org/web/researcher/ArcFileFormat.php
+
+
+http://www.deerhunter.dk////Default.aspx?ID=361&ProductComp=2634 80.63.58.81 20111129020924 text/html 546
+HTTP/1.1 200 OK
+Connection: close
+Date: Tue, 29 Nov 2011 02:09:25 GMT
+Server: Microsoft-IIS/6.0
+X-Powered-By: ASP.NET
+X-AspNet-Version: 2.0.50727
+Cache-Control: private
+Content-Type: text/html; charset=utf-8
+Content-Length: 307
+
+
+
+
+[truncated by hand]
+
+
+
+
+
+http://www.def.dk/sitecore/service/notfound.aspx?item=%2farbejdsforhold%2farbejdsmiljoe%2fsitecore%2fservice%2fnotfound&user=extranet%5cAnonymous&site=website 217.145.53.21 20111129021529 text/html 702
+HTTP/1.1 404 Item not found: /arbejdsforhold/arbejdsmiljoe/sitecore/service/notfound
+Connection: close
+Date: Tue, 29 Nov 2011 02:15:29 GMT
+Server: Microsoft-IIS/6.0
+X-Powered-By: ASP.NET; Sitecore CMS
+X-Powered-By: ASP.NET
+X-AspNet-Version: 2.0.50727
+Cache-Control: no-cache, no-store
+Pragma: no-cache
+Expires: -1
+Content-Type: text/html; charset=utf-8
+Content-Length: 315
+
+
+
+
+
+ Document Not Found
+[truncated by hand]
+pageTracker._trackPageview();
+} catch(err) {}
+
+
+http://www.dccenergi.dk/privat/fyringsolie/bestil-olie/prev/privat/fyringsolie/privat/node/privat/fyringsolie/privat/privat/fyringsolie/automatisk-olielevering 195.225.91.18 20111129021529 text/html 721
+HTTP/1.1 200 OK
+Date: Tue, 29 Nov 2011 02:15:29 GMT
+Server: Apache/2.2.3 (Red Hat) mod_ssl/2.2.3 OpenSSL/0.9.8e-fips-rhel5 DAV/2 PHP/5.2.17
+X-Powered-By: PHP/5.2.17
+Expires: Sun, 19 Nov 1978 05:00:00 GMT
+Last-Modified: Tue, 29 Nov 2011 02:15:29 GMT
+Cache-Control: store, no-cache, must-revalidate
+Cache-Control: post-check=0, pre-check=0
+Connection: close
+Content-Type: text/html; charset=utf-8
+
+
+
+
+
+[truncated by hand]
+
+
+