From 9af2eb661559859271500ed3802aeedf9d062f4f Mon Sep 17 00:00:00 2001 From: Toke Eskildsen Date: Tue, 17 Feb 2015 09:18:45 +0100 Subject: [PATCH 1/7] Made the independent ARC file reader return URLs --- .../org/archive/io/arc/ARCTestHelper.java | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 src/test/java/org/archive/io/arc/ARCTestHelper.java diff --git a/src/test/java/org/archive/io/arc/ARCTestHelper.java b/src/test/java/org/archive/io/arc/ARCTestHelper.java new file mode 100644 index 00000000..96809d2c --- /dev/null +++ b/src/test/java/org/archive/io/arc/ARCTestHelper.java @@ -0,0 +1,136 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.archive.io.arc; + +import java.io.*; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Lists the URLs from an ARC file. + */ +public class ARCTestHelper { + + public static List getURLs(File arc) throws IOException { + List urls = new ArrayList(); + if (!arc.exists()) { + throw new IOException("The file '" + arc + "' does not exist"); + } + LineInputStream in = new LineInputStream(arc); + + String line; + long oldOffset = 0; + + // Skip the ARC header + majorheader: + while ((line = in.readLine()) != null) { + if (!line.contains("")) { + continue; + } + while ((line = in.readLine()) != null) { + if (!line.isEmpty()) { + break majorheader; + } + } + } + if (line == null) { + // No recognized records + return urls; + } + + final Pattern URL_EXTRACT = Pattern.compile("^(.+) [0-9]{14} .*"); + // Iterate the records + while (line != null) { + //System.out.println(line + " (absolute offset: " + oldOffset + ")"); + Matcher matcher = URL_EXTRACT.matcher(line); + if (!matcher.find()) { + throw new IllegalArgumentException("Unable to extract URL from '" + line + "'"); + } + urls.add(matcher.group()); + final long delta = getDelta(line); + if (in.skip(delta) != delta) { + System.err.println("Could not skip " + delta + " bytes"); + } + // Skip the newline after content + if (in.read() == -1) { + break; + } + oldOffset = in.getOffset(); + line = in.readLine(); + //noinspection StatementWithEmptyBody + //while ((line = in.readLine()) != null && line.isEmpty()); + } + in.close(); + return urls; + } + + public static class LineInputStream extends FileInputStream { + private long offset = 0; + public LineInputStream(File file) throws FileNotFoundException { + super(file); + } + public String readLine() throws IOException { + ByteArrayOutputStream by = new ByteArrayOutputStream(); + int b; + while ((b = read()) != '\n' && b != -1) { + by.write(b); + } + return by.size() == 0 && b == -1 ? null : by.toString("utf-8"); + } + public long getOffset() { + return offset; + } + + @Override + public int read() throws IOException { + offset++; + return super.read(); + } + + @Override + public int read(byte[] b) throws IOException { + int read = super.read(b); + offset += read; + return read; + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + int read = super.read(b, off, len); + offset += read; + return read; + } + + @Override + public long skip(long n) throws IOException { + long read = super.skip(n); + offset += read; + return read; + } + } + + /// http://www.example.com/somepath 192.168.10.12 20111129020924 text/html 79022 + private static long getDelta(String line) { + String tokens[] = line.split(" "); + try { + return Long.parseLong(tokens[tokens.length-1]); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Unable to extract delta from line\n" + line); + } + } +} From 414095ee7beaebe8144fff5201e9b556201f6d9e Mon Sep 17 00:00:00 2001 From: Toke Eskildsen Date: Tue, 17 Feb 2015 09:19:13 +0100 Subject: [PATCH 2/7] Added test for ARC that does not parse --- .../archive/io/arc/ARCReaderFactoryTest.java | 53 ++++++++++++++++--- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java b/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java index 0721f795..46e7d735 100644 --- a/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java +++ b/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java @@ -1,10 +1,9 @@ package org.archive.io.arc; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.InputStream; -import java.io.RandomAccessFile; +import java.io.*; +import java.net.URL; +import java.util.Iterator; +import java.util.List; import org.archive.io.ArchiveReader; import org.archive.io.ArchiveRecord; @@ -21,6 +20,9 @@ public class ARCReaderFactoryTest extends TestCase { private File testfile1 = new File("src/test/resources/org/archive/format/arc/IAH-20080430204825-00000-blackbook-truncated.arc"); + //private File testfile_nl = new File("src/test/resources/org/archive/format/arc/137542-153-20111129020925-00316-kb-prod-har-003.kb.dk_truncated.arc"); + private File testfile_nl = getResource( + "org/archive/format/arc/137542-153-20111129020925-00316-kb-prod-har-003.kb.dk_truncated.arc"); /** * Test reading uncompressed arcfile for issue @@ -53,5 +55,44 @@ private void offsetResourceTest( File testfile, long offset, String uri ) throws if( raf != null ) raf.close(); } - + + public void testBaseSampleARC() throws IOException { + testIteration(testfile1); + } + // Independent of the ARCReader code + public void testBaseSampleIntegrity() throws IOException { + List urls = ARCTestHelper.getURLs(testfile1); + assertEquals("The correct number of URLs should be extracted", 8, urls.size()); + } + + // Independent of the ARCReader code + public void testVerifyNewlinedSampleIntegrity() throws IOException { + List urls = ARCTestHelper.getURLs(testfile_nl); + assertEquals("The correct number of URLs should be extracted", 3, urls.size()); + } + + /* + This fails, but the independent {@link ARCTestHelper} is able to process it. + Logically one of the implementations is faulty. + */ + public void testNewlinedSampleARC() throws IOException { + testIteration(testfile_nl); + } + + private void testIteration(File arc) throws IOException { + ARCReader reader = ARCReaderFactory.get(arc); + Iterator ir = reader.iterator(); + while (ir.hasNext()) { + System.out.println(ir.next().getHeader().getHeaderValue("subject-uri")); + } + reader.close(); + } + + private static File getResource(String resource) { + URL url = Thread.currentThread().getContextClassLoader().getResource(resource); + if (url == null) { + throw new RuntimeException("The resource '" + resource + "' could not be located in the class path"); + } + return new File(url.getFile()); + } } From a19195a49716dac112eaf2f77776d59f579c6f88 Mon Sep 17 00:00:00 2001 From: Toke Eskildsen Date: Tue, 17 Feb 2015 09:20:15 +0100 Subject: [PATCH 3/7] Adjusted problematic sample to hopefully be technically correct --- ...-00316-kb-prod-har-003.kb.dk_truncated.arc | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 src/test/resources/org/archive/format/arc/137542-153-20111129020925-00316-kb-prod-har-003.kb.dk_truncated.arc diff --git a/src/test/resources/org/archive/format/arc/137542-153-20111129020925-00316-kb-prod-har-003.kb.dk_truncated.arc b/src/test/resources/org/archive/format/arc/137542-153-20111129020925-00316-kb-prod-har-003.kb.dk_truncated.arc new file mode 100644 index 00000000..25d9979e --- /dev/null +++ b/src/test/resources/org/archive/format/arc/137542-153-20111129020925-00316-kb-prod-har-003.kb.dk_truncated.arc @@ -0,0 +1,83 @@ +filedesc://137542-153-20111129020925-00316-kb-prod-har-003.kb.dk.arc.open 0.0.0.0 20111129020925 text/plain 1287 +1 1 InternetArchive +URL IP-address Archive-date Content-type Archive-length + + +Heritrix 1.14.4 http://crawler.archive.org +kb-prod-har-003.kb.dk +130.226.228.74 +default_orderxml +Default Profile +Admin +2008-01-18T11:12:17+00:00 +Mozilla/5.0 (compatible; heritrix/1.12.1b +http://netarkivet.dk/website/info.html) +netarkivet-svar@netarkivet.dk +ignore +ARC file version 1.1 +http://www.archive.org/web/researcher/ArcFileFormat.php + + +http://www.deerhunter.dk////Default.aspx?ID=361&ProductComp=2634 80.63.58.81 20111129020924 text/html 548 +HTTP/1.1 200 OK +Connection: close +Date: Tue, 29 Nov 2011 02:09:25 GMT +Server: Microsoft-IIS/6.0 +X-Powered-By: ASP.NET +X-AspNet-Version: 2.0.50727 +Cache-Control: private +Content-Type: text/html; charset=utf-8 +Content-Length: 78781 + + + + +[truncated by hand] + + + + + +http://www.def.dk/sitecore/service/notfound.aspx?item=%2farbejdsforhold%2farbejdsmiljoe%2fsitecore%2fservice%2fnotfound&user=extranet%5cAnonymous&site=website 217.145.53.21 20111129021529 text/html 703 +HTTP/1.1 404 Item not found: /arbejdsforhold/arbejdsmiljoe/sitecore/service/notfound +Connection: close +Date: Tue, 29 Nov 2011 02:15:29 GMT +Server: Microsoft-IIS/6.0 +X-Powered-By: ASP.NET; Sitecore CMS +X-Powered-By: ASP.NET +X-AspNet-Version: 2.0.50727 +Cache-Control: no-cache, no-store +Pragma: no-cache +Expires: -1 +Content-Type: text/html; charset=utf-8 +Content-Length: 4802 + + + + + + Document Not Found +[truncated by hand] +pageTracker._trackPageview(); +} catch(err) {} + + +http://www.dccenergi.dk/privat/fyringsolie/bestil-olie/prev/privat/fyringsolie/privat/node/privat/fyringsolie/privat/privat/fyringsolie/automatisk-olielevering 195.225.91.18 20111129021529 text/html 721 +HTTP/1.1 200 OK +Date: Tue, 29 Nov 2011 02:15:29 GMT +Server: Apache/2.2.3 (Red Hat) mod_ssl/2.2.3 OpenSSL/0.9.8e-fips-rhel5 DAV/2 PHP/5.2.17 +X-Powered-By: PHP/5.2.17 +Expires: Sun, 19 Nov 1978 05:00:00 GMT +Last-Modified: Tue, 29 Nov 2011 02:15:29 GMT +Cache-Control: store, no-cache, must-revalidate +Cache-Control: post-check=0, pre-check=0 +Connection: close +Content-Type: text/html; charset=utf-8 + + + + + +[truncated by hand] + + + From 7b74f92af1b9cadb746b7769aeb95584bdc21eda Mon Sep 17 00:00:00 2001 From: Toke Eskildsen Date: Tue, 17 Feb 2015 11:30:44 +0100 Subject: [PATCH 4/7] Simplified dummy ARC parser and unit test --- .../java/org/archive/io/arc/ARCReaderFactoryTest.java | 4 ++-- src/test/java/org/archive/io/arc/ARCTestHelper.java | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java b/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java index 46e7d735..509c6f60 100644 --- a/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java +++ b/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java @@ -62,13 +62,13 @@ public void testBaseSampleARC() throws IOException { // Independent of the ARCReader code public void testBaseSampleIntegrity() throws IOException { List urls = ARCTestHelper.getURLs(testfile1); - assertEquals("The correct number of URLs should be extracted", 8, urls.size()); + assertEquals("The correct number of URLs should be extracted", 9, urls.size()); } // Independent of the ARCReader code public void testVerifyNewlinedSampleIntegrity() throws IOException { List urls = ARCTestHelper.getURLs(testfile_nl); - assertEquals("The correct number of URLs should be extracted", 3, urls.size()); + assertEquals("The correct number of URLs should be extracted", 4, urls.size()); } /* diff --git a/src/test/java/org/archive/io/arc/ARCTestHelper.java b/src/test/java/org/archive/io/arc/ARCTestHelper.java index 96809d2c..de01fb38 100644 --- a/src/test/java/org/archive/io/arc/ARCTestHelper.java +++ b/src/test/java/org/archive/io/arc/ARCTestHelper.java @@ -32,9 +32,8 @@ public static List getURLs(File arc) throws IOException { throw new IOException("The file '" + arc + "' does not exist"); } LineInputStream in = new LineInputStream(arc); - - String line; - long oldOffset = 0; + String line = in.readLine(); +/* long oldOffset = 0; // Skip the ARC header majorheader: @@ -52,7 +51,7 @@ public static List getURLs(File arc) throws IOException { // No recognized records return urls; } - + */ final Pattern URL_EXTRACT = Pattern.compile("^(.+) [0-9]{14} .*"); // Iterate the records while (line != null) { @@ -70,7 +69,7 @@ public static List getURLs(File arc) throws IOException { if (in.read() == -1) { break; } - oldOffset = in.getOffset(); + //oldOffset = in.getOffset(); line = in.readLine(); //noinspection StatementWithEmptyBody //while ((line = in.readLine()) != null && line.isEmpty()); From 405a32d9177ea450672f38a4fa29124f70ca9f82 Mon Sep 17 00:00:00 2001 From: Toke Eskildsen Date: Tue, 17 Feb 2015 14:57:39 +0100 Subject: [PATCH 5/7] Corrected Content-Length --- ...111129020925-00316-kb-prod-har-003.kb.dk_truncated.arc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/resources/org/archive/format/arc/137542-153-20111129020925-00316-kb-prod-har-003.kb.dk_truncated.arc b/src/test/resources/org/archive/format/arc/137542-153-20111129020925-00316-kb-prod-har-003.kb.dk_truncated.arc index 25d9979e..9a705995 100644 --- a/src/test/resources/org/archive/format/arc/137542-153-20111129020925-00316-kb-prod-har-003.kb.dk_truncated.arc +++ b/src/test/resources/org/archive/format/arc/137542-153-20111129020925-00316-kb-prod-har-003.kb.dk_truncated.arc @@ -17,7 +17,7 @@ URL IP-address Archive-date Content-type Archive-length http://www.archive.org/web/researcher/ArcFileFormat.php -http://www.deerhunter.dk////Default.aspx?ID=361&ProductComp=2634 80.63.58.81 20111129020924 text/html 548 +http://www.deerhunter.dk////Default.aspx?ID=361&ProductComp=2634 80.63.58.81 20111129020924 text/html 546 HTTP/1.1 200 OK Connection: close Date: Tue, 29 Nov 2011 02:09:25 GMT @@ -26,7 +26,7 @@ X-Powered-By: ASP.NET X-AspNet-Version: 2.0.50727 Cache-Control: private Content-Type: text/html; charset=utf-8 -Content-Length: 78781 +Content-Length: 307 @@ -37,7 +37,7 @@ Content-Length: 78781 -http://www.def.dk/sitecore/service/notfound.aspx?item=%2farbejdsforhold%2farbejdsmiljoe%2fsitecore%2fservice%2fnotfound&user=extranet%5cAnonymous&site=website 217.145.53.21 20111129021529 text/html 703 +http://www.def.dk/sitecore/service/notfound.aspx?item=%2farbejdsforhold%2farbejdsmiljoe%2fsitecore%2fservice%2fnotfound&user=extranet%5cAnonymous&site=website 217.145.53.21 20111129021529 text/html 702 HTTP/1.1 404 Item not found: /arbejdsforhold/arbejdsmiljoe/sitecore/service/notfound Connection: close Date: Tue, 29 Nov 2011 02:15:29 GMT @@ -49,7 +49,7 @@ Cache-Control: no-cache, no-store Pragma: no-cache Expires: -1 Content-Type: text/html; charset=utf-8 -Content-Length: 4802 +Content-Length: 315 From 8186cc974ab1cfd9a2b69e95e821ed3479a0ca38 Mon Sep 17 00:00:00 2001 From: Toke Eskildsen Date: Wed, 18 Feb 2015 14:08:48 +0100 Subject: [PATCH 6/7] Bugfix: If the status header could not be detected, the http header parser could skip into next record, invalidating further iteration of the ARC stream --- .../java/org/archive/io/arc/ARCReader.java | 2 +- .../java/org/archive/io/arc/ARCRecord.java | 17 +- .../java/org/archive/io/SubInputStream.java | 222 ++++++++++++++++++ .../archive/io/arc/ARCReaderFactoryTest.java | 42 ++-- .../org/archive/io/arc/ARCTestHelper.java | 101 ++++---- 5 files changed, 303 insertions(+), 81 deletions(-) create mode 100644 src/test/java/org/archive/io/SubInputStream.java diff --git a/src/main/java/org/archive/io/arc/ARCReader.java b/src/main/java/org/archive/io/arc/ARCReader.java index 7f85cc2a..2809d5d4 100644 --- a/src/main/java/org/archive/io/arc/ARCReader.java +++ b/src/main/java/org/archive/io/arc/ARCReader.java @@ -164,7 +164,7 @@ protected ARCRecord createArchiveRecord(InputStream is, long offset) } return (ARCRecord)getCurrentRecord(); } - + /** * Returns version of this ARC file. Usually read from first record of ARC. * If we're reading without having first read the first record -- e.g. diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java index 21bea07c..2214f070 100644 --- a/src/main/java/org/archive/io/arc/ARCRecord.java +++ b/src/main/java/org/archive/io/arc/ARCRecord.java @@ -137,7 +137,7 @@ public class ARCRecord extends ArchiveRecord implements ARCConstants { public String getHeaderString() { return this.headerString; } - + /** * Constructor. * @@ -233,7 +233,7 @@ public ARCRecord(InputStream in, final String identifier, this(in, identifier, offset, digest, strict, parseHttpHeaders, false, null); } - + private ArchiveRecordHeader parseHeaders(final InputStream in, final String identifier, final long offset, final boolean strict, final boolean isAlignedOnFirstRecord, String version) @@ -241,7 +241,7 @@ private ArchiveRecordHeader parseHeaders(final InputStream in, ArrayList firstLineValues = new ArrayList(20); getTokenizedHeaderLine(in, firstLineValues); - + int bodyOffset = 0; if (offset == 0 && isAlignedOnFirstRecord) { // If offset is zero and we were aligned at first record on @@ -343,7 +343,7 @@ private int getTokenizedHeaderLine(final InputStream stream, // save verbatim header String this.headerString = StringUtils.join(list," "); - + return read; } @@ -569,7 +569,6 @@ private InputStream readHttpHeader() throws IOException { getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) { return null; } - String statusLine; byte[] statusBytes; int eolCharCount = 0; @@ -600,11 +599,15 @@ private InputStream readHttpHeader() throws IOException { if (!statusLine.contains(":") && StatusLine.startsWithHTTP(statusLine)) { break; } - + + if (statusLine.replace("\r", "").isEmpty()) { // No more headerlines + break; + } + // Add bytes read to error "offset" to add to position errOffset += statusBytes.length; } - + if (errOffset > 0) { this.incrementPosition(errOffset); } diff --git a/src/test/java/org/archive/io/SubInputStream.java b/src/test/java/org/archive/io/SubInputStream.java new file mode 100644 index 00000000..4dcb15f7 --- /dev/null +++ b/src/test/java/org/archive/io/SubInputStream.java @@ -0,0 +1,222 @@ +/* + * $Header: $ + * $Revision$ + * $Date$ + * + * ==================================================================== + * + * Copyright 1999-2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + * + */ +/* + * + */ +package org.archive.io; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; + +/** + * Encapsulates another stream, keeping track of local as well as global offsets. + * The SubStream has a max size and close() ensures that the encapsulated Stream + * is fast-forwarded to that point. + *

+ * Note: Calling close on the SubInputStream does not close the wrapped stream. + * Note 2: This implementation relies on the wrapped InputStream not to return 0 + * in {@link java.io.InputStream#available()} until that stream has been depleted. + */ +public class SubInputStream extends InputStream { + private final InputStream inner; + private final long globalPosOrigo; + + private long length; + private long mark = -1; + private long pos = 0; + + /** + * Wraps the inner Stream with no max on bytes read. Reduces functionality to tracking position. + *

+ * Note: The length can be specified later with {@link #setLength(long)}. + * @param inner data source. + */ + public SubInputStream(InputStream inner) { + this(inner, Long.MAX_VALUE, 0); + } + /** + * Wraps the inner InputStream with a max on bytes read. + * @param inner data source. + * @param length the number of bytes that at a maximum can be read from inner. + */ + public SubInputStream(InputStream inner, long length) { + this(inner, length, 0); + } + /** + * Wraps the inner InputStream with a max on bytes read. + * @param inner data source. + * @param length the number of bytes that at a maximum can be read from inner. + * @param globalPosition the position in the inner stream. + */ + public SubInputStream(InputStream inner, long length, long globalPosition) { + this.inner = inner; + globalPosOrigo = globalPosition; + this.length = length; + } + + /** + * @return the position from the virtual stream. + */ + public long getPosition() { + return pos; + } + + /** + * @return the position in the wrapped stream, if the starting point was stated during construction. + */ + public long getGlobalPosition() { + return globalPosOrigo + pos; + } + + public void setLength(long length) { + if (length <= pos) { + throw new IllegalStateException( + "The position is " + pos + " which is past the allowed virtual length " + length); + } + this.length = length; + } + + public long getLength() { + return length; + } + + /** + * Reads to the next '\n' and returns the line as an UTF-8 string, excluding trailing + * carriage returns {@code '\r'} and newlines {@code '\n'}. + * @return the next line or null if EOF. + * @throws IOException if there was a problem reading bytes from inner. + */ + public String readLine() throws IOException { + byte[] bytes = readLineBytes(); + if (bytes == null) { + return null; + } + int length = bytes.length; + while (length > 0 && (bytes[length-1] == '\n' || bytes[length-1] == '\r')) { + length--; + } + return new String(bytes, 0, length, "utf-8"); + } + /** + * Reads to the next '\n' and returns the line as raw bytes, including the delimiting '\n'. + * @return the next line. + * @throws IOException if there was a problem reading bytes from inner. + */ + public byte[] readLineBytes() throws IOException { + ByteArrayOutputStream by = new ByteArrayOutputStream(); + int b; + while ((b = read()) != -1) { + by.write(b); + if (b == '\n') { + break; + } + } + return by.size() == 0 && b == -1 ? null : by.toByteArray(); + } + + /* Delegates from inner InputStream */ + + @Override + public int read() throws IOException { + if (available() == 0) { + return -1; + } + int c = inner.read(); + if (c != -1) { + pos++; + } + return c; + } + + @Override + public int read(byte[] b) throws IOException { + return read(b, 0, b.length); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + if (len == 0) { + return 0; + } + if (available() == 0) { + return -1; + } + len = Math.min(len, available()); + int r = inner.read(b, off, len); + if (r != -1) { // EOF + pos += r; + } + return r; + } + + @Override + public long skip(long n) throws IOException { + n = Math.min(n, available()); + if (n <= 0) { + return 0; + } + long s = inner.skip(n); + pos += s; + return s; + } + + @Override + public int available() throws IOException { + return (int) Math.min(inner.available(), length - pos); + } + + @SuppressWarnings("ResultOfMethodCallIgnored") + @Override + public void close() throws IOException { + if (pos < length) { + skip(length - pos); + } + } + + @Override + public void mark(int readlimit) { + mark = pos; + inner.mark(readlimit); + } + + @Override + public void reset() throws IOException { + if (mark == -1) { + throw new IOException("A mark must be set before reset is called"); + } + inner.reset(); + pos = mark; + } + + @Override + public boolean markSupported() { + return inner.markSupported(); + } +} diff --git a/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java b/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java index 509c6f60..03ba0341 100644 --- a/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java +++ b/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java @@ -2,13 +2,13 @@ import java.io.*; import java.net.URL; -import java.util.Iterator; import java.util.List; import org.archive.io.ArchiveReader; import org.archive.io.ArchiveRecord; import junit.framework.TestCase; +import org.archive.io.SubInputStream; /** * @@ -57,35 +57,49 @@ private void offsetResourceTest( File testfile, long offset, String uri ) throws } public void testBaseSampleARC() throws IOException { - testIteration(testfile1); + testARCReaderIteration(testfile1, 9); } + /* + This failed with the old http-header parsing code in {@code ARCRecord#readHttpHeader}. + */ + public void testNewlinedSampleARC() throws IOException { + testARCReaderIteration(testfile_nl, 4); + } + // Independent of the ARCReader code public void testBaseSampleIntegrity() throws IOException { List urls = ARCTestHelper.getURLs(testfile1); assertEquals("The correct number of URLs should be extracted", 9, urls.size()); } - - // Independent of the ARCReader code public void testVerifyNewlinedSampleIntegrity() throws IOException { List urls = ARCTestHelper.getURLs(testfile_nl); assertEquals("The correct number of URLs should be extracted", 4, urls.size()); } - /* - This fails, but the independent {@link ARCTestHelper} is able to process it. - Logically one of the implementations is faulty. - */ - public void testNewlinedSampleARC() throws IOException { - testIteration(testfile_nl); + public void testNewlinedSampleARCContentLength() throws IOException { + ARCTestHelper.testARCContentLength(testfile_nl); + } + public void testBaseSampleARCContentLength() throws IOException { + ARCTestHelper.testARCContentLength(testfile1); } +// public void testLocalSampleARCContentLength() throws IOException { +// ARCTestHelper.testARCContentLength( +// new File("/home/te/tmp/warc/137542-153-20111129020925-00316-kb-prod-har-003.kb.dk.arc")); +// } - private void testIteration(File arc) throws IOException { + // Uncomment println for manual inspection of first content line + private void testARCReaderIteration(File arc, int expectedRecords) throws IOException { ARCReader reader = ARCReaderFactory.get(arc); - Iterator ir = reader.iterator(); - while (ir.hasNext()) { - System.out.println(ir.next().getHeader().getHeaderValue("subject-uri")); + int recordCount = 0; + for (ArchiveRecord record : reader) { + SubInputStream sub = new SubInputStream(record); + sub.skip(record.getHeader().getContentBegin()); + //System.out.println(record.getPosition() + "> " + sub.readLine()); + sub.close(); + recordCount++; } reader.close(); + assertEquals("There should be the right number of records in " + arc, expectedRecords, recordCount); } private static File getResource(String resource) { diff --git a/src/test/java/org/archive/io/arc/ARCTestHelper.java b/src/test/java/org/archive/io/arc/ARCTestHelper.java index de01fb38..9cedbfe4 100644 --- a/src/test/java/org/archive/io/arc/ARCTestHelper.java +++ b/src/test/java/org/archive/io/arc/ARCTestHelper.java @@ -14,8 +14,9 @@ */ package org.archive.io.arc; +import org.archive.io.SubInputStream; + import java.io.*; -import java.net.URL; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; @@ -26,32 +27,15 @@ */ public class ARCTestHelper { + // Extracts the header URLs from an ARC file public static List getURLs(File arc) throws IOException { List urls = new ArrayList(); if (!arc.exists()) { throw new IOException("The file '" + arc + "' does not exist"); } - LineInputStream in = new LineInputStream(arc); + InputStream fis = new FileInputStream(arc); + SubInputStream in = new SubInputStream(fis); String line = in.readLine(); -/* long oldOffset = 0; - - // Skip the ARC header - majorheader: - while ((line = in.readLine()) != null) { - if (!line.contains("")) { - continue; - } - while ((line = in.readLine()) != null) { - if (!line.isEmpty()) { - break majorheader; - } - } - } - if (line == null) { - // No recognized records - return urls; - } - */ final Pattern URL_EXTRACT = Pattern.compile("^(.+) [0-9]{14} .*"); // Iterate the records while (line != null) { @@ -75,52 +59,51 @@ public static List getURLs(File arc) throws IOException { //while ((line = in.readLine()) != null && line.isEmpty()); } in.close(); + fis.close(); return urls; } - public static class LineInputStream extends FileInputStream { - private long offset = 0; - public LineInputStream(File file) throws FileNotFoundException { - super(file); - } - public String readLine() throws IOException { - ByteArrayOutputStream by = new ByteArrayOutputStream(); - int b; - while ((b = read()) != '\n' && b != -1) { - by.write(b); - } - return by.size() == 0 && b == -1 ? null : by.toString("utf-8"); - } - public long getOffset() { - return offset; - } - - @Override - public int read() throws IOException { - offset++; - return super.read(); + // Checks that the two content lengths (ARC and server-issued) for each record matches + public static void testARCContentLength(File arc) throws IOException { + if (!arc.exists()) { + throw new IOException("The file '" + arc + "' does not exist"); } + InputStream fis = new FileInputStream(arc); + SubInputStream out = new SubInputStream(fis); - @Override - public int read(byte[] b) throws IOException { - int read = super.read(b); - offset += read; - return read; - } + final Pattern CONTENT_LENGTH = Pattern.compile("Content-Length: ([0-9]+)[^0-9]*"); + String outline; + while ((outline = out.readLine()) != null) { + if (outline.isEmpty()) { + throw new IllegalStateException("Got unexpected empty line. Next line is\n" + out.readLine()); - @Override - public int read(byte[] b, int off, int len) throws IOException { - int read = super.read(b, off, len); - offset += read; - return read; - } + } + final long delta = getDelta(outline); + SubInputStream sub = new SubInputStream(out, delta, out.getPosition()); + long contentLength = -1; + String inline; + while ((inline = sub.readLine()) != null) { + Matcher clMatcher = CONTENT_LENGTH.matcher(inline); + if (clMatcher.matches()) { + contentLength = Long.parseLong(clMatcher.group(1)); + } + if (inline.isEmpty() || "\r".equals(inline)) { + break; + } + } + if (contentLength != -1 && contentLength != sub.available()) { + throw new IllegalStateException(String.format( + "sub_pos=%6d, sub_length=%6d, sub_available=%6d, Content-Length=%6d, header=%s", + sub.getPosition(), sub.getLength(), sub.available(), contentLength, outline)); + } + sub.close(); - @Override - public long skip(long n) throws IOException { - long read = super.skip(n); - offset += read; - return read; + // Newline delimiter + if (out.read() == -1) { + break; + } } + fis.close(); } /// http://www.example.com/somepath 192.168.10.12 20111129020924 text/html 79022 From 6f5c60734d499cafa079c484ca60f56b6caf0a93 Mon Sep 17 00:00:00 2001 From: Toke Eskildsen Date: Wed, 18 Feb 2015 14:45:33 +0100 Subject: [PATCH 7/7] Bugfix: The lax parser for HTTP-status did not accept ":" in the line, but ":" is legal according to RFC2616 Test: Extended unit test to check for extracted status --- src/main/java/org/archive/io/arc/ARCRecord.java | 5 +++-- .../java/org/archive/io/arc/ARCReaderFactoryTest.java | 11 ++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java index 2214f070..9aec90f5 100644 --- a/src/main/java/org/archive/io/arc/ARCRecord.java +++ b/src/main/java/org/archive/io/arc/ARCRecord.java @@ -596,11 +596,12 @@ private InputStream readHttpHeader() throws IOException { // If it's actually the status line, break, otherwise continue skipping any // previous header values - if (!statusLine.contains(":") && StatusLine.startsWithHTTP(statusLine)) { + // Old code contained {@code !statusLine.contains(":")}, which conflicts with RFC2616-sec6 + if (StatusLine.startsWithHTTP(statusLine)) { break; } - if (statusLine.replace("\r", "").isEmpty()) { // No more headerlines + if (statusLine.replace("\r", "").isEmpty()) { // No more header lines break; } diff --git a/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java b/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java index 03ba0341..71c444b9 100644 --- a/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java +++ b/src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java @@ -57,13 +57,13 @@ private void offsetResourceTest( File testfile, long offset, String uri ) throws } public void testBaseSampleARC() throws IOException { - testARCReaderIteration(testfile1, 9); + testARCReaderIteration(testfile1, 9, 7); } /* This failed with the old http-header parsing code in {@code ARCRecord#readHttpHeader}. */ public void testNewlinedSampleARC() throws IOException { - testARCReaderIteration(testfile_nl, 4); + testARCReaderIteration(testfile_nl, 4, 3); // Status has 2*200 & 1*404 } // Independent of the ARCReader code @@ -88,10 +88,14 @@ public void testBaseSampleARCContentLength() throws IOException { // } // Uncomment println for manual inspection of first content line - private void testARCReaderIteration(File arc, int expectedRecords) throws IOException { + private void testARCReaderIteration(File arc, int expectedRecords, int hasStatus) throws IOException { ARCReader reader = ARCReaderFactory.get(arc); int recordCount = 0; + int okCount = 0; for (ArchiveRecord record : reader) { + if (((ARCRecord)record).getStatusCode() != -1) { + okCount++; + } SubInputStream sub = new SubInputStream(record); sub.skip(record.getHeader().getContentBegin()); //System.out.println(record.getPosition() + "> " + sub.readLine()); @@ -100,6 +104,7 @@ private void testARCReaderIteration(File arc, int expectedRecords) throws IOExce } reader.close(); assertEquals("There should be the right number of records in " + arc, expectedRecords, recordCount); + assertEquals("There should be the right number of status 200 records in " + arc, hasStatus, okCount); } private static File getResource(String resource) {