From 11579c2baab0db08f14341f70b848353eed17269 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 22 Feb 2017 13:11:13 +0100
Subject: [PATCH 001/189] Improve HTML link extraction - add extractors for
more elements which can take URLs as attribute values, add missing
attributes - generalize extraction of "global" attributes (`background`) -
add custom data attributes frequently used for linking (`data-href`,
`data-uri`) - add unit test to cover link extraction
---
.../html/ExtractingParseObserver.java | 79 ++++-
.../html/ExtractingParseObserverTest.java | 161 +++++++++
.../resource/html/link-extraction-test.warc | 320 ++++++++++++++++++
3 files changed, 551 insertions(+), 9 deletions(-)
create mode 100644 src/test/resources/org/archive/resource/html/link-extraction-test.warc
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index deb8c8c0..826851e0 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -2,12 +2,17 @@
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Locale;
import java.util.Map;
+import java.util.Set;
import java.util.Stack;
+import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.archive.format.text.html.ParseObserver;
+import org.htmlparser.Attribute;
import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
@@ -36,11 +41,10 @@ public class ExtractingParseObserver implements ParseObserver {
private final static int MAX_TEXT_LEN = 100;
-// private static String GLOBAL_ATTR[] = {"background"};
-
private static final String PATH = "path";
private static final String PATH_SEPARATOR = "@/";
- private final static Map extractors;
+ private static final Map extractors;
+ private static final Set globalHrefAttributes;
static {
extractors = new HashMap();
extractors.put("A", new AnchorTagExtractor());
@@ -57,6 +61,22 @@ public class ExtractingParseObserver implements ParseObserver {
extractors.put("META", new MetaTagExtractor());
extractors.put("OBJECT", new ObjectTagExtractor());
extractors.put("SCRIPT", new ScriptTagExtractor());
+ extractors.put("Q", new QuotationLinkTagExtractor());
+ extractors.put("BLOCKQUOTE", new QuotationLinkTagExtractor());
+ extractors.put("DEL", new QuotationLinkTagExtractor());
+ extractors.put("INS", new QuotationLinkTagExtractor());
+ // HTML5:
+ extractors.put("BUTTON", new ButtonTagExtractor());
+ extractors.put("MENUITEM", new MenuitemTagExtractor());
+ extractors.put("VIDEO", new EmbedVideoTagExtractor());
+ extractors.put("AUDIO", new EmbedTagExtractor());
+ extractors.put("TRACK", new EmbedTagExtractor());
+ extractors.put("SOURCE", new EmbedTagExtractor());
+
+ globalHrefAttributes = new HashSet();
+ globalHrefAttributes.add("background");
+ globalHrefAttributes.add("data-href");
+ globalHrefAttributes.add("data-uri");
}
@@ -84,11 +104,19 @@ public void handleTagOpen(TagNode tag) {
inTitle = !tag.isEmptyXmlTag();
return;
}
+
// first the global attributes:
- // background
- String v = tag.getAttribute("background");
- if(v != null) {
- data.addHref(PATH,makePath(name,"background"),"url",v);
+ Vector attributes = tag.getAttributesEx();
+ for (Attribute a : attributes) {
+ String attrName = a.getName();
+ String attrValue = a.getValue();
+ if (attrName == null || attrValue == null) {
+ continue;
+ }
+ attrName = attrName.toLowerCase(Locale.ROOT);
+ if (globalHrefAttributes.contains(attrName)) {
+ data.addHref(PATH,makePath(name,attrName),"url",attrValue);
+ }
}
// TODO: style attribute, BASE(href) tag, Resolve URLs
@@ -296,12 +324,24 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
+ /**
+ * Extractor registered for BUTTON tags in the static extractors map:
+ * passes the tag's "formaction" attribute to addBasicHrefs.
+ */
+ private static class ButtonTagExtractor implements TagExtractor {
+ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+ addBasicHrefs(data,node,"formaction");
+ }
+ }
+
private static class EmbedTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src");
}
}
+ /**
+ * Extractor registered for VIDEO tags in the static extractors map:
+ * passes both the "src" and the "poster" attribute to addBasicHrefs.
+ */
+ private static class EmbedVideoTagExtractor implements TagExtractor {
+ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+ addBasicHrefs(data,node,"src","poster");
+ }
+ }
+
private static class FormTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = new ArrayList();
@@ -329,21 +369,26 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
addBasicHrefs(data,node,"src");
}
}
+
private static class IFrameTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src");
}
}
+
private static class ImgTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addHrefWithAttrs(data,node,"src","alt","title");
+ addBasicHrefs(data,node,"longdesc");
}
}
+
private static class InputTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
- addBasicHrefs(data,node,"src");
+ addBasicHrefs(data,node,"src","formaction");
}
}
+
private static class LinkTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = getAttrListUrl(node,"href","rel","type");
@@ -352,6 +397,13 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
}
+
+ /**
+ * Extractor registered for MENUITEM tags in the static extractors map:
+ * passes the tag's "icon" attribute to addBasicHrefs.
+ */
+ private static class MenuitemTagExtractor implements TagExtractor {
+ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+ addBasicHrefs(data,node,"icon");
+ }
+ }
+
private static class MetaTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = getAttrList(node,"name","rel","content","http-equiv");
@@ -360,11 +412,19 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
}
+
private static class ObjectTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
- addBasicHrefs(data,node,"codebase","cdata");
+ addBasicHrefs(data,node,"codebase","cdata","data");
}
}
+
+ /**
+ * Extractor registered for Q, BLOCKQUOTE, DEL and INS tags in the static
+ * extractors map: passes the tag's "cite" attribute to addBasicHrefs.
+ */
+ private static class QuotationLinkTagExtractor implements TagExtractor {
+ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+ addBasicHrefs(data,node,"cite");
+ }
+ }
+
private static class ScriptTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = getAttrListUrl(node,"src","type");
@@ -373,6 +433,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
}
+
private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) {
Matcher m = pattern.matcher(content);
int idx = 0;
diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
index bfbd6f02..8f690a06 100644
--- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
+++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -1,15 +1,33 @@
package org.archive.resource.html;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import org.archive.extract.ExtractingResourceFactoryMapper;
+import org.archive.extract.ExtractingResourceProducer;
+import org.archive.extract.ProducerUtils;
+import org.archive.extract.ResourceFactoryMapper;
import org.archive.resource.MetaData;
+import org.archive.resource.Resource;
+import org.archive.resource.ResourceParseException;
+import org.archive.resource.ResourceProducer;
import org.htmlparser.nodes.TextNode;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.Multimap;
+
import junit.framework.TestCase;
public class ExtractingParseObserverTest extends TestCase {
+ private static final Logger LOG =
+ Logger.getLogger(ExtractingParseObserverTest.class.getName());
+
public void testHandleStyleNodeExceptions() throws Exception {
String[] tests = {
"some css",
@@ -103,5 +121,148 @@ private void checkExtract(String[] data) throws JSONException {
}
}
+    /**
+     * Asserts that {@code links} holds at least one entry for {@code url} and
+     * that {@code path} is among the paths recorded for that URL.
+     *
+     * @param links multimap from extracted URL to the paths it was found under
+     * @param url   URL expected to have been extracted
+     * @param path  path expected to be recorded for {@code url}
+     */
+    private void checkLink(Multimap<String, String> links, String url, String path) {
+        assertTrue("Link with URL " + url + " not found", links.containsKey(url));
+        assertTrue("Wrong path " + path + " for " + url, links.get(url).contains(path));
+    }
+
+    /**
+     * Gathers every URL recorded in the metadata of {@code resource} into a
+     * url-to-path multimap and compares it with {@code expectedLinks}.
+     * <p>
+     * Besides the entries of the "Links" and "Head"/"Link" arrays, the
+     * "Head"/"Base" value is recorded under the pseudo path {@code __base__}
+     * and the target of a {@code http-equiv="Refresh"} entry in "Head"/"Metas"
+     * under {@code __meta_refresh__}.
+     *
+     * @param resource      parsed record; must be an {@link HTMLResource}
+     * @param expectedLinks rows whose first two columns are URL and path;
+     *                      any further columns are ignored
+     */
+    private void checkLinks(Resource resource, String[][] expectedLinks) {
+        assertNotNull(resource);
+        assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
+        MetaData md = resource.getMetaData();
+        LOG.info(md.toString());
+        Multimap<String, String> links = ArrayListMultimap.create();
+        JSONObject head = md.optJSONObject("Head");
+        JSONArray headLinks = null;
+        if (head != null) {
+            // <base href="...">
+            String baseUrl = (String) head.opt("Base");
+            if (baseUrl != null) {
+                links.put(baseUrl, "__base__");
+            }
+            // <meta http-equiv="refresh" content="N; url=...">
+            JSONArray metas = head.optJSONArray("Metas");
+            if (metas != null) {
+                for (int i = 0; i < metas.length(); i++) {
+                    JSONObject meta = metas.optJSONObject(i);
+                    if (meta == null) {
+                        // not a JSON object, hence cannot describe a refresh
+                        continue;
+                    }
+                    // optString() yields "" (never null) for a missing key
+                    if ("Refresh".equalsIgnoreCase(meta.optString("http-equiv"))) {
+                        String metaRefreshTarget = meta.optString("content")
+                                .replaceFirst("(?i)(?:^\\d+\\s*;)?\\s*url=", "");
+                        links.put(metaRefreshTarget, "__meta_refresh__");
+                    }
+                }
+            }
+            // <link> elements of the document head, if any were recorded
+            headLinks = head.optJSONArray("Link");
+        }
+        // extract outlinks
+        List<JSONArray> linkArrays = new ArrayList<JSONArray>();
+        JSONArray bodyLinks = md.optJSONArray("Links");
+        if (bodyLinks != null) {
+            linkArrays.add(bodyLinks);
+        }
+        if (headLinks != null) {
+            linkArrays.add(headLinks);
+        }
+        for (JSONArray ldata : linkArrays) {
+            for (int i = 0; i < ldata.length(); i++) {
+                JSONObject o = ldata.optJSONObject(i);
+                if (o == null) {
+                    fail("Link entry " + i + " is not a JSON object");
+                }
+                try {
+                    String url = o.getString("url");
+                    String path = o.getString("path");
+                    links.put(url, path);
+                    LOG.info(" found link: " + url + " " + path);
+                } catch (JSONException e) {
+                    fail("Failed to extract URL from link: " + e.getMessage());
+                }
+            }
+        }
+        assertEquals("Unexpected number of links", expectedLinks.length, links.size());
+        for (String[] l : expectedLinks) {
+            checkLink(links, l[0], l[1]);
+        }
+    }
+
+    /**
+     * Runs the extracting producer over {@code link-extraction-test.warc} and
+     * compares the links found in each response record, in file order, with
+     * the tables below.
+     * <p>
+     * Rows are {url, path} with an optional third column (the link relation);
+     * {@code checkLinks} only reads the first two columns.
+     */
+    public void testLinkExtraction() throws ResourceParseException, IOException {
+        String testFileName = "link-extraction-test.warc";
+        // fail with a readable message instead of a NullPointerException
+        java.net.URL testFile = getClass().getResource(testFileName);
+        assertNotNull("Test resource " + testFileName + " not found on classpath", testFile);
+        ResourceProducer producer = ProducerUtils.getProducer(testFile.getPath());
+        ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
+        ExtractingResourceProducer extractor =
+                new ExtractingResourceProducer(producer, mapper);
+        extractor.getNext(); // skip warcinfo record
+
+        // record: http://www.example.com/html4.html
+        String[][] html4links = {
+                {"http://www.example.com/", "__base__"},
+                {"http://www.example.com/redirected.html", "__meta_refresh__"},
+                {"background.jpg", "BODY@/background"},
+                {"http://www.example.com/a-href.html", "A@/href"},
+                {"#anchor", "A@/href"},
+                {"image.png", "IMG@/src"},
+                {"image.gif", "IMG@/src"},
+                {"http://example.com/image-description.html#image.gif", "IMG@/longdesc"},
+                {"helloworld.swf", "OBJECT@/data"},
+                {"http://www.example.com/shakespeare.html", "Q@/cite"},
+                {"http://www.example.com/shakespeare-long.html", "BLOCKQUOTE@/cite"}
+        };
+        checkLinks(extractor.getNext(), html4links);
+
+        // record: http://www.example.com/link-extraction-test-html5-video.html
+        String[][] html5links = {
+                {"http:///www.example.com/video.html", "LINK@/href", "canonical"},
+                {"video.rss", "LINK@/href", "alternate"},
+                {"https://archive.org/download/WebmVp8Vorbis/webmvp8.gif", "VIDEO@/poster"},
+                {"https://archive.org/download/WebmVp8Vorbis/webmvp8.webm", "SOURCE@/src"},
+                {"https://archive.org/download/WebmVp8Vorbis/webmvp8_512kb.mp4", "SOURCE@/src"},
+                {"https://archive.org/download/WebmVp8Vorbis/webmvp8.ogv", "SOURCE@/src"}
+        };
+        checkLinks(extractor.getNext(), html5links);
+
+        // record: http://www.example.com/poor_html5.html
+        String[][] html5links2 = {
+                {"http://www.example.com/", "A@/href"},
+        };
+        checkLinks(extractor.getNext(), html5links2);
+
+        // record: http://www.example.com/fb-video.html
+        String[][] fbVideoLinks = {
+                {"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
+                {"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"},
+                {"https://www.facebook.com/facebook/", "A@/href"},
+                {"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"}
+        };
+        checkLinks(extractor.getNext(), fbVideoLinks);
+
+        // record: http://www.example.com/data-href.examples.html
+        String[][] dataHrefLinks = {
+                {"standard.css", "LINK@/href", "stylesheet"},
+                {"https://www.facebook.com/elegantthemes/videos/10153760379211923/", "DIV@/data-href"},
+                {"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"},
+                {"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
+                {"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"},
+                {"https://www.facebook.com/facebook/", "A@/href"},
+                {"//edge.flowplayer.org/bauhaus.webm", "SOURCE@/src"},
+                {"//edge.flowplayer.org/bauhaus.mp4", "SOURCE@/src"},
+                {"//edge.flowplayer.org/functional.webm", "BUTTON@/data-href"},
+                {"/content-page", "ARTICLE@/data-href"},
+                {"/content-page", "A@/href"},
+                {"/tags/content", "A@/href"},
+                {"/tags/headlines", "A@/href"},
+                {"http://grabaperch.com", "DIV@/data-href"},
+                {"green.css", "LINK@/data-href"},
+                {"blue.css", "LINK@/data-href"},
+                {"http://codecanyon.net/user/CodingJack", "A@/data-href"},
+                {"jackbox/img/thumbs/4.jpg", "IMG@/src"},
+                {"//venobox-destination", "A@/data-href"},
+                {"#", "A@/href"},
+                {"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0&autoplay=1", "DIV@/data-href"},
+                {"#", "A@/href"},
+                {"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0", "IFRAME@/src"}
+        };
+        checkLinks(extractor.getNext(), dataHrefLinks);
+
+        // record: http://www.example.com/fb-social-plugins.html
+        String[][] fbSocialLinks = {
+                {"http://www.your-domain.com/your-page.html", "DIV@/data-uri"},
+                {"https://developers.facebook.com/docs/plugins/comments#configurator", "DIV@/data-href"},
+                {"https://www.facebook.com/zuck/posts/10102735452532991?comment_id=1070233703036185", "DIV@/data-href"},
+                {"https://www.facebook.com/zuck", "DIV@/data-href"},
+                {"https://developers.facebook.com/docs/plugins/", "DIV@/data-href"},
+                {"https://www.facebook.com/facebook", "DIV@/data-href"},
+                {"https://www.facebook.com/facebook", "BLOCKQUOTE@/cite"},
+                {"https://www.facebook.com/facebook", "A@/href"},
+                {"http://www.your-domain.com/your-page.html", "DIV@/data-href"}
+        };
+        checkLinks(extractor.getNext(), fbSocialLinks);
+    }
}
diff --git a/src/test/resources/org/archive/resource/html/link-extraction-test.warc b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
new file mode 100644
index 00000000..ab0e54c8
--- /dev/null
+++ b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
@@ -0,0 +1,320 @@
+WARC/1.0
+WARC-Type: warcinfo
+Content-Type: application/warc-fields
+WARC-Date: 2017-02-20T14:00:56Z
+Content-Length: 128
+
+format: WARC File Format 1.0
+conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf
+robots: classic
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2017-02-20T14:00:56Z
+WARC-Target-URI: http://www.example.com/html4.html
+Content-Type: application/http; msgtype=response
+Content-Length: 1243
+
+HTTP/1.1 200 OK
+Date: Mon, 20 Feb 2017 14:00:56 GMT
+Content-Length: 1125
+Content-Type: application/xhtml+xml
+
+
+
+
+
+
+
+Test XHTML Link Extraction
+
+
+A@/href
+
+ anchor only
+
+
+
+
+
+ To be or not to be.
+
+
+To be, or not to be, that is the question:
+Whether 'tis nobler in the mind to suffer
+The slings and arrows of outrageous fortune, …
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Target-URI: http://www.example.com/link-extraction-test-html5-video.html
+WARC-Date: 2017-02-20T21:35:03Z
+Content-Type: application/http; msgtype=response
+Content-Length: 890
+
+HTTP/1.1 200 OK
+Date: Mon, 20 Feb 2017 21:35:03 GMT
+Content-Length: 789
+Content-Type: text/html
+
+
+
+
+Test HTML5 Video Tag
+
+
+
+
+
+
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Target-URI: http://www.example.com/poor_html5.html
+WARC-Date: 2017-02-21T15:50:40Z
+Content-Type: application/http; msgtype=response
+Content-Length: 594
+
+HTTP/1.1 200 OK
+Date: Tue, 21 Feb 2017 15:50:40 GMT
+Content-Length: 486
+Content-Type: text/html
+
+
+Testing poor HTML5
+
+
+
+
+
+This is valid HTML5!
+
+
+
+
+
+headline
+
+paragraph one with link.
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Target-URI: http://www.example.com/fb-video.html
+WARC-Date: 2017-02-20T16:58:50Z
+Content-Type: application/http; msgtype=response
+Content-Length: 1330
+
+HTTP/1.1 200 OK
+Date: Mon, 20 Feb 2017 16:58:50 GMT
+Content-Length: 1194
+Content-Type: text/html
+
+
+
+
+ fb-video - Embedded Videos - Social Plugins
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Target-URI: http://www.example.com/data-href.examples.html
+WARC-Date: 2017-02-21T21:05:10Z
+Content-Type: application/http; msgtype=response
+Content-Length: 3160
+
+HTTP/1.1 200 OK
+Date: Tue, 21 Feb 2017 21:05:10 GMT
+Content-Length: 3057
+Content-Type: text/html
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ And here goes a bit of copy about the content of the article.
+ Tags: content, headlines
+
+
+
+
+
+
+
+
+
+
+
+venobox
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Target-URI: http://www.example.com/fb-social-plugins.html
+WARC-Date: 2017-02-22T09:33:02Z
+Content-Type: application/http; msgtype=response
+Content-Length: 1870
+
+HTTP/1.1 200 OK
+Date: Wed, 22 Feb 2017 09:33:02 GMT
+Content-Length: 1767
+Content-Type: text/html
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
From 6aa43f83a2cbc2acd0feb7f2c81d66f4ef1b13c5 Mon Sep 17 00:00:00 2001
From: Mohamed Elsayed
Date: Thu, 2 Mar 2017 15:28:16 +0200
Subject: [PATCH 002/189] Fix #25: move missing unit tests over from Heritrix3
---
.../archive/io/ArchiveReaderFactoryTest.java | 94 +++
.../io/BufferedSeekInputStreamTest.java | 67 ++
.../archive/io/HeaderedArchiveRecordTest.java | 209 ++++++
.../archive/io/RecordingInputStreamTest.java | 132 ++++
.../archive/io/ReplayCharSequenceTest.java | 391 ++++++++++
.../io/RepositionableInputStreamTest.java | 70 ++
.../org/archive/io/arc/ARCWriterPoolTest.java | 122 +++
.../org/archive/io/arc/ARCWriterTest.java | 699 ++++++++++++++++++
.../org/archive/io/warc/WARCWriterTest.java | 512 +++++++++++++
.../org/archive/uid/UUIDGeneratorTest.java | 44 ++
.../java/org/archive/util/FileUtilsTest.java | 271 +++++++
.../org/archive/util/MimetypeUtilsTest.java | 63 ++
.../org/archive/util/PropertyUtilsTest.java | 45 ++
.../org/archive/util/anvl/ANVLRecordTest.java | 128 ++++
14 files changed, 2847 insertions(+)
create mode 100644 src/test/java/org/archive/io/ArchiveReaderFactoryTest.java
create mode 100644 src/test/java/org/archive/io/BufferedSeekInputStreamTest.java
create mode 100644 src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
create mode 100644 src/test/java/org/archive/io/RecordingInputStreamTest.java
create mode 100644 src/test/java/org/archive/io/ReplayCharSequenceTest.java
create mode 100644 src/test/java/org/archive/io/RepositionableInputStreamTest.java
create mode 100644 src/test/java/org/archive/io/arc/ARCWriterPoolTest.java
create mode 100644 src/test/java/org/archive/io/arc/ARCWriterTest.java
create mode 100644 src/test/java/org/archive/io/warc/WARCWriterTest.java
create mode 100644 src/test/java/org/archive/uid/UUIDGeneratorTest.java
create mode 100644 src/test/java/org/archive/util/FileUtilsTest.java
create mode 100644 src/test/java/org/archive/util/MimetypeUtilsTest.java
create mode 100644 src/test/java/org/archive/util/PropertyUtilsTest.java
create mode 100644 src/test/java/org/archive/util/anvl/ANVLRecordTest.java
diff --git a/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java b/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java
new file mode 100644
index 00000000..2313868c
--- /dev/null
+++ b/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java
@@ -0,0 +1,94 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Iterator;
+
+import org.apache.commons.lang.StringUtils;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.arc.ARCWriterTest;
+import org.archive.util.TmpDirTestCase;
+
+/**
+ * Checks that ArchiveReaderFactory opens a local ARC given as URL, File or path.
+ */
+public class ArchiveReaderFactoryTest extends TmpDirTestCase {
+    /**
+     * Reads a freshly written ARC addressed by a {@code file:} URL.
+     *
+     * @throws IOException
+     */
+    public void testGetFileURL() throws IOException {
+        File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
+        URL location = new URL("file:////" + arc.getAbsolutePath());
+        verifyMimetypesAndClose(ArchiveReaderFactory.get(location));
+    }
+
+    /**
+     * Reads a freshly written ARC addressed by a {@link File}.
+     *
+     * @throws IOException
+     */
+    public void testGetFile() throws IOException {
+        File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
+        File location = arc.getAbsoluteFile();
+        verifyMimetypesAndClose(ArchiveReaderFactory.get(location));
+    }
+
+    /**
+     * Reads a freshly written ARC addressed by its absolute path string.
+     *
+     * @throws IOException
+     */
+    public void testGetPath() throws IOException {
+        File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
+        String location = arc.getAbsoluteFile().getAbsolutePath();
+        verifyMimetypesAndClose(ArchiveReaderFactory.get(location));
+    }
+
+    /**
+     * Shared verification used by every test in this class.
+     *
+     * Iterates over all records served by {@code reader} and asserts that
+     * the header of each one reports a non-blank mimetype. The reader is
+     * closed when iteration ends, whether or not an assertion failed.
+     *
+     * @param reader reader obtained from the factory call under test
+     * @throws IOException propagated from the reader
+     */
+    private void verifyMimetypesAndClose(ArchiveReader reader) throws IOException {
+        try {
+            Iterator i = reader.iterator();
+            while (i.hasNext()) {
+                ArchiveRecord r = (ArchiveRecord)i.next();
+                String mimetype = r.getHeader().getMimetype();
+                assertTrue("mime unread",StringUtils.isNotBlank(mimetype));
+            }
+        } finally {
+            if (reader != null) {
+                reader.close();
+            }
+        }
+    }
+}
diff --git a/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java b/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java
new file mode 100644
index 00000000..270e45e0
--- /dev/null
+++ b/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java
@@ -0,0 +1,67 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.util.Random;
+
+import junit.framework.TestCase;
+
+
+/**
+ * Unit test for BufferedSeekInputStream. The tests do some random
+ * repositioning in the stream to make sure the buffer is always valid.
+ *
+ * @author pjack
+ */
+public class BufferedSeekInputStreamTest extends TestCase {
+    /** ASCII bytes of the sample text; never modified after creation. */
+    private static final byte[] TEST_DATA = makeTestData();
+
+    public void testPosition() throws Exception {
+        // Keep the seed so that a failing run can be reproduced.
+        long seed = System.nanoTime();
+        Random random = new Random(seed);
+        ArraySeekInputStream asis = new ArraySeekInputStream(TEST_DATA);
+        BufferedSeekInputStream bsis = new BufferedSeekInputStream(asis, 11);
+        for (int i = 0; i < TEST_DATA.length; i++) {
+            byte b = (byte)bsis.read();
+            assertEquals("sequential read at " + i, TEST_DATA[i], b);
+        }
+        for (int i = 0; i < 1000; i++) {
+            int index = random.nextInt(TEST_DATA.length);
+            bsis.position(index);
+            int expected = TEST_DATA[index] & 0xFF;
+            int read = bsis.read() & 0xFF;
+            assertEquals("seed " + seed + ", position " + index, expected, read);
+        }
+    }
+
+    /** Builds the fixture: one byte per character of an all-ASCII text. */
+    private static byte[] makeTestData() {
+        String s = "If the dull substance of my flesh were thought\n"
+            + "Injurious distance could not stop my way\n"
+            + "For then, despite of space, I would be brought\n"
+            + "From limits far remote where thou dost stay.\n";
+        byte[] r = new byte[s.length()];
+        for (int i = 0; i < r.length; i++) {
+            r[i] = (byte)s.charAt(i);
+        }
+        return r;
+    }
+}
diff --git a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
new file mode 100644
index 00000000..9f7e2a15
--- /dev/null
+++ b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
@@ -0,0 +1,209 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+import org.apache.commons.httpclient.Header;
+import org.archive.io.arc.ARCRecord;
+import org.archive.io.warc.WARCRecord;
+
+public class HeaderedArchiveRecordTest extends TestCase {
+    private static final String HTTPHEADER = "HTTP/1.1 200 OK\r\n"
+        + "Last-Modified: Sun, 28 Aug 2005 14:10:55 GMT\r\n"
+        + "Content-Length: 108\r\n" + "Connection: close\r\n"
+        + "Content-Type: text/html\r\n" + "\r\n";
+    private static final String BODY = "\r\n" + " \r\n"
+        + " Neue Seite 1\r\n" + " \r\n"
+        + " \r\n" + " \r\n" + "";
+
+    public void testParseHttpHeadersInWARC() throws IOException {
+        final String url = "http://foo.maths.uq.edu.au/index.html";
+        final String warcHeader = "WARC/0.12\r\n"
+            + "MIME-Version: 1.0\r\n"
+            + "WARC-Record-Type: response\r\n"
+            + "WARC-Target-URI: http://foo.maths.uq.edu.au/index.html\r\n"
+            + "WARC-Date: 2006-09-19T17:20:24Z\r\n"
+            + "WARC-Digest: sha1:IT6YEX5WHKK57GOEHV2YHTTXEP5KPM6A\r\n"
+            + "WARC-IP-Address: 80.150.6.184\r\n"
+            + "Content-ID: \r\n"
+            + "Content-Type: application/http; msgtype=response\r\n"
+            + "Content-Length: " + (HTTPHEADER.length() + BODY.length()) + "\r\n"
+            + "\r\n";
+
+        final String hdr = warcHeader + HTTPHEADER + BODY;
+        WARCRecord r = new WARCRecord(asStream(hdr),
+                "READER_IDENTIFIER", 0, false, true);
+        HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
+
+        assertBodyAndHeaders(har);
+        assertEquals("failed to retrieve Url from metadata", url,
+                har.getHeader().getUrl());
+    }
+
+    public void testParseHttpHeadersInARC() throws IOException {
+        final int len = HTTPHEADER.length() + BODY.length();
+        final int contentLength = BODY.length();
+        final String url = "http://www.ly.gov.tw:80/accpart.htm";
+        final String hdr = HTTPHEADER + BODY;
+        // Interesting difference between ARCRecord and WARCRecord is that the
+        // stream passed the ARCRecord is supposed to be just past the
+        // ARCRecord metadata line where as stream passed WARCRecord is at
+        // record start. TODO: Add to ARCRecord constructor that doesn't
+        // take an ArchiveRecordHeader but rather parses it from the stream.
+        ArchiveRecordHeader arh = new ArchiveRecordHeader() {
+            public int getContentBegin() {
+                // TODO: In ARCs, this is where http headers end and
+                // the content begins. Need to reconcile for generic
+                // HeaderedArchiveRecord processing. In this context, it
+                // makes sense setting it to zero -- HeaderedArchiveRecord
+                // will then figure it out.
+                return 0;
+            }
+
+            public String getDate() {
+                return null;
+            }
+
+            public String getDigest() {
+                return null;
+            }
+
+            public Set getHeaderFieldKeys() {
+                return null;
+            }
+
+            public Map getHeaderFields() {
+                return null;
+            }
+
+            public Object getHeaderValue(String key) {
+                return null;
+            }
+
+            public long getLength() {
+                return len;
+            }
+
+            public long getContentLength() {
+                return contentLength;
+            }
+
+            public String getMimetype() {
+                return null;
+            }
+
+            public long getOffset() {
+                return 0;
+            }
+
+            public String getReaderIdentifier() {
+                return null;
+            }
+
+            public String getRecordIdentifier() {
+                return null;
+            }
+
+            public String getUrl() {
+                return url;
+            }
+
+            public String getVersion() {
+                return null;
+            }
+
+        };
+        ARCRecord r = new ARCRecord(asStream(hdr),
+                arh, 0, false, true, false);
+
+        HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
+        assertBodyAndHeaders(har);
+    }
+
+    public void testEasierParseHttpHeadersInARC() throws IOException {
+        final String url = "http://www.archive.org/index.htm";
+        final String arcHeader = url
+            + " 192.168.0.1 20070515111004 text/html 167568\n";
+        final String hdr = arcHeader + HTTPHEADER + BODY;
+
+        ARCRecord r = new ARCRecord(asStream(hdr),
+                "READER_IDENTIFIER", 0, false, true, false);
+
+        HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
+        assertBodyAndHeaders(har);
+        assertEquals("failed to retrieve Url from metadata", url,
+                har.getHeader().getUrl());
+    }
+
+    /** Wraps {@code s}, encoded as US-ASCII (one byte per char), in a stream. */
+    private static ByteArrayInputStream asStream(String s) throws IOException {
+        return new ByteArrayInputStream(s.getBytes("US-ASCII"));
+    }
+
+    /**
+     * Skips the HTTP header of {@code har}, then checks that the remaining
+     * payload equals BODY and that the parsed content headers match
+     * the entries of HTTPHEADER.
+     */
+    private void assertBodyAndHeaders(HeaderedArchiveRecord har) throws IOException {
+        har.skipHttpHeader();
+        byte[] b = new byte[BODY.length()];
+        // A single read() may return fewer bytes than requested: loop.
+        int filled = 0;
+        while (filled < b.length) {
+            int n = har.read(b, filled, b.length - filled);
+            if (n < 0) {
+                break;
+            }
+            filled += n;
+        }
+        assertEquals(BODY, new String(b, 0, filled, "US-ASCII"));
+        assertHeaderCorrectlyParsed(har.getContentHeaders());
+    }
+
+    private void assertHeaderCorrectlyParsed(Header[] headers) {
+        final List<String> orgHeaders = Arrays.asList(HTTPHEADER.split("\r\n"));
+        assertEquals("not all HTTP header entries have been retrieved",
+                orgHeaders.size(), headers.length + 1);
+
+        for (Header header : headers) {
+            assertTrue(orgHeaders.contains(header.getName() + ": "
+                    + header.getValue()));
+        }
+    }
+
+    public void testNoheaderWARC() throws IOException {
+        String b = "hello world";
+        String c = "WARC/0.12\r\nContent-Type: text/plain\r\n"
+            + "Content-Length: " + b.length() + "\r\n\r\n" + b;
+        WARCRecord r = new WARCRecord(asStream(c),
+                "READER_IDENTIFIER", 0, false, true);
+        HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
+        assertTrue(har.isStrict());
+    }
+}
diff --git a/src/test/java/org/archive/io/RecordingInputStreamTest.java b/src/test/java/org/archive/io/RecordingInputStreamTest.java
new file mode 100644
index 00000000..20a8b8b3
--- /dev/null
+++ b/src/test/java/org/archive/io/RecordingInputStreamTest.java
@@ -0,0 +1,132 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.PipedInputStream;
+import java.io.PipedOutputStream;
+
+import org.archive.util.TmpDirTestCase;
+
+
+/**
+ * Test cases for RecordingInputStream.
+ *
+ * @author gojomo
+ */
+public class RecordingInputStreamTest extends TmpDirTestCase
+{
+    /** Hands off to the superclass set-up; this test adds nothing of its own. */
+    protected void setUp() throws Exception {
+        super.setUp();
+    }
+
+    /**
+     * Exercises readFullyOrUntil with a soft length cutoff (no exception),
+     * a hard length cutoff (exception), a timeout, and rate throttling.
+     *
+     * @throws IOException
+     * @throws InterruptedException
+     * @throws RecorderTimeoutException
+     */
+    public void testReadFullyOrUntil()
+            throws RecorderTimeoutException, IOException, InterruptedException {
+        final String backingPath =
+                new File(getTmpDir(), "testReadFullyOrUntil").getAbsolutePath();
+        RecordingInputStream recorder = new RecordingInputStream(16384, backingPath);
+        ByteArrayInputStream source = new ByteArrayInputStream(
+                "abcdefghijklmnopqrstuvwxyz".getBytes());
+
+        // Soft max: readFullyOrUntil(7) stops after seven bytes, no exception.
+        recorder.open(source);
+        recorder.setLimits(10, 0, 0);
+        recorder.readFullyOrUntil(7);
+        recorder.close();
+        ReplayInputStream replay = recorder.getReplayInputStream();
+        ByteArrayOutputStream captured = new ByteArrayOutputStream();
+        replay.readFullyTo(captured);
+        assertEquals("soft max cutoff", "abcdefg",
+                new String(captured.toByteArray()));
+
+        // Hard max: asking for 13 bytes under setLimits(10, 0, 0) must throw.
+        source.reset();
+        captured.reset();
+        recorder.open(source);
+        boolean thrown = false;
+        try {
+            recorder.setLimits(10, 0, 0);
+            recorder.readFullyOrUntil(13);
+        } catch (RecorderLengthExceededException expected) {
+            thrown = true;
+        }
+        assertTrue("hard max exception", thrown);
+        recorder.close();
+        replay = recorder.getReplayInputStream();
+        replay.readFullyTo(captured);
+        assertEquals("hard max cutoff", "abcdefghijk",
+                new String(captured.toByteArray()));
+
+        // Timeout: bytes trickle in slowly, so setLimits(0, 5000, 0) must trip.
+        PipedInputStream pipeIn = new PipedInputStream();
+        PipedOutputStream pipeOut = new PipedOutputStream(pipeIn);
+        recorder.open(pipeIn);
+        thrown = false;
+        trickle("abcdefghijklmnopqrstuvwxyz".getBytes(), pipeOut);
+        try {
+            recorder.setLimits(0, 5000, 0);
+            recorder.readFullyOrUntil(0);
+        } catch (RecorderTimeoutException expected) {
+            thrown = true;
+        }
+        assertTrue("timeout exception", thrown);
+        recorder.close();
+
+        // Rate limit: 10240 bytes with setLimits(0, 0, 2) must take >= 5000 ms.
+        source = new ByteArrayInputStream(new byte[1024 * 2 * 5]);
+        recorder.open(source);
+        final long begin = System.currentTimeMillis();
+        recorder.setLimits(0, 0, 2);
+        recorder.readFullyOrUntil(0);
+        final long elapsed = System.currentTimeMillis() - begin;
+        assertTrue("read too fast: " + elapsed, elapsed >= 5000);
+        recorder.close();
+    }
+
+    /** Writes bytes to pout from a new thread, pausing 1000 ms before each, then closes it. */
+    protected void trickle(final byte[] bytes, final PipedOutputStream pout) {
+        new Thread() {
+            public void run() {
+                try {
+                    for (byte next : bytes) {
+                        Thread.sleep(1000);
+                        pout.write(next);
+                    }
+                    pout.close();
+                } catch (IOException ignored) {
+                    // do nothing
+                } catch (Exception e) {
+                    System.err.print(e);
+                }
+            }
+        }.start();
+    }
+}
diff --git a/src/test/java/org/archive/io/ReplayCharSequenceTest.java b/src/test/java/org/archive/io/ReplayCharSequenceTest.java
new file mode 100644
index 00000000..9208594a
--- /dev/null
+++ b/src/test/java/org/archive/io/ReplayCharSequenceTest.java
@@ -0,0 +1,391 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.text.NumberFormat;
+import java.util.Date;
+import java.util.Random;
+import java.util.logging.Logger;
+
+import org.archive.util.FileUtils;
+import org.archive.util.TmpDirTestCase;
+
+import com.google.common.base.Charsets;
+
+/**
+ * Test ReplayCharSequences.
+ *
+ * @author stack, gojomo
+ * @version $Revision$, $Date$
+ */
+public class ReplayCharSequenceTest extends TmpDirTestCase
+{
+    /**
+     * Logger; note that its name differs from this class's name.
+     */
+    private static final Logger logger =
+        Logger.getLogger("org.archive.io.ReplayCharSequenceFactoryTest");
+
+
+    private static final int SEQUENCE_LENGTH = 127;
+    private static final int MULTIPLIER = 3;
+    private static final int BUFFER_SIZE = SEQUENCE_LENGTH * MULTIPLIER;
+    private static final int INCREMENT = 1;
+
+    /**
+     * Buffer of regular content, refilled by setUp() before each test.
+     */
+    private byte [] regularBuffer = null;
+
+    /*
+     * @see TestCase#setUp()
+     */
+    protected void setUp() throws Exception
+    {
+        super.setUp();
+        this.regularBuffer =
+            fillBufferWithRegularContent(new byte [BUFFER_SIZE]);
+    }
+
+    public void testShiftjis() throws IOException {
+
+        // Here are the bytes for the JIS encoding of the Japanese form of Nihongo
+        byte[] bytes_nihongo = {
+            (byte) 0x1B, (byte) 0x24, (byte) 0x42, (byte) 0x46,
+            (byte) 0x7C, (byte) 0x4B, (byte) 0x5C, (byte) 0x38,
+            (byte) 0x6C, (byte) 0x1B, (byte) 0x28, (byte) 0x42,
+            (byte) 0x1B, (byte) 0x28, (byte) 0x42 };
+        final String ENCODING = "SJIS";
+        // Here is nihongo converted to JVM encoding.
+        String nihongo = new String(bytes_nihongo, ENCODING);
+
+        RecordingOutputStream ros = writeTestStream(
+            bytes_nihongo,MULTIPLIER,
+            "testShiftjis",MULTIPLIER);
+        // TODO: check for existence of overflow file?
+        ReplayCharSequence rcs = getReplayCharSequence(ros,Charset.forName(ENCODING));
+
+        // Now check that start of the rcs comes back in as nihongo string.
+        String rcsStr = rcs.subSequence(0, nihongo.length()).toString();
+        assertTrue("Nihongo " + nihongo + " does not equal converted string" +
+            " from rcs " + rcsStr,
+            nihongo.equals(rcsStr));
+        // And assert next string is also properly nihongo.
+        if (rcs.length() >= (nihongo.length() * 2)) {
+            rcsStr = rcs.subSequence(nihongo.length(),
+                nihongo.length() + nihongo.length()).toString();
+            assertTrue("Nihongo " + nihongo + " does not equal converted " +
+                " string from rcs (2nd time)" + rcsStr,
+                nihongo.equals(rcsStr));
+        }
+    }
+
+    public void testGetReplayCharSequenceByteZeroOffset() throws IOException {
+
+        RecordingOutputStream ros = writeTestStream(
+            regularBuffer,MULTIPLIER,
+            "testGetReplayCharSequenceByteZeroOffset",MULTIPLIER);
+        ReplayCharSequence rcs = getReplayCharSequence(ros);
+
+        for (int i = 0; i < MULTIPLIER; i++) {
+            accessingCharacters(rcs);
+        }
+    }
+
+    private ReplayCharSequence getReplayCharSequence(RecordingOutputStream ros) throws IOException {
+        return getReplayCharSequence(ros,null);
+    }
+    /** Wraps what ros recorded in a GenericReplayCharSequence decoded with charset. */
+    private ReplayCharSequence getReplayCharSequence(RecordingOutputStream ros, Charset charset) throws IOException {
+        return new GenericReplayCharSequence(ros.getReplayInputStream(),
+            ros.getBufferLength()/2, ros.backingFilename, charset);
+    }
+
+
+    public void testGetReplayCharSequenceMultiByteZeroOffset()
+        throws IOException {
+
+        RecordingOutputStream ros = writeTestStream(
+            regularBuffer,MULTIPLIER,
+            "testGetReplayCharSequenceMultiByteZeroOffset",MULTIPLIER);
+        ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.UTF_8);
+
+        for (int i = 0; i < MULTIPLIER; i++) {
+            accessingCharacters(rcs);
+        }
+    }
+
+    public void testReplayCharSequenceByteToString() throws IOException {
+        String fileContent = "Some file content";
+        byte [] buffer = fileContent.getBytes();
+        RecordingOutputStream ros = writeTestStream(
+            buffer,1,
+            "testReplayCharSequenceByteToString.txt",0);
+        ReplayCharSequence rcs = getReplayCharSequence(ros);
+        String result = rcs.toString();
+        assertEquals("Strings don't match",fileContent,result);
+    }
+
+    /** Renders every char of str as hex, e.g. "{ 61, 62 }"; yields "null" for null. */
+    private String toHexString(String str) {
+        if (str == null) {
+            return "null";
+        }
+        StringBuilder buf = new StringBuilder("{ ");
+        for (int i = 0; i < str.length(); i++) {
+            if (i > 0) {
+                buf.append(", ");
+            }
+            buf.append(Integer.toString(str.charAt(i), 16));
+        }
+        buf.append(" }");
+        return buf.toString();
+    }
+
+    public void testSingleByteEncodings() throws IOException {
+        byte[] bytes = {
+            (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64,
+            (byte) 0x7d, (byte) 0x7e, (byte) 0x7f, (byte) 0x80,
+            (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84,
+            (byte) 0xfc, (byte) 0xfd, (byte) 0xfe, (byte) 0xff };
+
+        String latin1String = new String(bytes, "latin1");
+        RecordingOutputStream ros = writeTestStream(
+            bytes, 1, "testSingleByteEncodings-latin1.txt", 0);
+        ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.ISO_8859_1);
+        String result = rcs.toString();
+        logger.fine("latin1[0] " + toHexString(latin1String));
+        logger.fine("latin1[1] " + toHexString(result));
+        assertEquals("latin1 strings don't match", latin1String, result);
+
+        String w1252String = new String(bytes, "windows-1252");
+        ros = writeTestStream(
+            bytes, 1, "testSingleByteEncodings-windows-1252.txt", 0);
+        rcs = getReplayCharSequence(ros,Charset.forName("windows-1252"));
+        result = rcs.toString();
+        logger.fine("windows-1252[0] " + toHexString(w1252String));
+        logger.fine("windows-1252[1] " + toHexString(result));
+        assertEquals("windows-1252 strings don't match", w1252String, result);
+
+        String asciiString = new String(bytes, "ascii");
+        ros = writeTestStream(
+            bytes, 1, "testSingleByteEncodings-ascii.txt", 0);
+        rcs = getReplayCharSequence(ros,Charset.forName("ascii"));
+        result = rcs.toString();
+        logger.fine("ascii[0] " + toHexString(asciiString));
+        logger.fine("ascii[1] " + toHexString(result));
+        assertEquals("ascii strings don't match", asciiString, result);
+    }
+
+    public void testReplayCharSequenceByteToStringOverflow() throws IOException {
+        String fileContent = "Some file content. "; // ascii
+        byte [] buffer = fileContent.getBytes();
+        RecordingOutputStream ros = writeTestStream(
+            buffer,1,
+            "testReplayCharSequenceByteToStringOverflow.txt",1);
+        String expectedContent = fileContent+fileContent;
+
+        // The string is ascii which is a subset of both these encodings. Use
+        // both encodings because they exercise different code paths. UTF-8 is
+        // decoded to UTF-16 while windows-1252 is memory mapped directly. See
+        // GenericReplayCharSequence
+        ReplayCharSequence rcsUtf8 = getReplayCharSequence(ros,Charsets.UTF_8);
+        ReplayCharSequence rcs1252 = getReplayCharSequence(ros,Charset.forName("windows-1252"));
+
+        String result = rcsUtf8.toString();
+        assertEquals("Strings don't match", expectedContent, result);
+
+        result = rcs1252.toString();
+        assertEquals("Strings don't match", expectedContent, result);
+    }
+
+    public void testReplayCharSequenceByteToStringMulti() throws IOException {
+        String fileContent = "Some file content";
+        byte [] buffer = fileContent.getBytes("UTF-8");
+        final int MULTIPLICAND = 10;
+        StringBuilder sb =
+            new StringBuilder(MULTIPLICAND * fileContent.length());
+        for (int i = 0; i < MULTIPLICAND; i++) {
+            sb.append(fileContent);
+        }
+        String expectedResult = sb.toString();
+        RecordingOutputStream ros = writeTestStream(
+            buffer,1,
+            "testReplayCharSequenceByteToStringMulti.txt",MULTIPLICAND-1);
+        for (int i = 0; i < 3; i++) {
+            ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.UTF_8);
+            String result = rcs.toString();
+            assertEquals("Strings don't match", expectedResult, result);
+            rcs.close();
+            System.gc();
+            System.runFinalization();
+        }
+    }
+
+    public void xestHugeReplayCharSequence() throws IOException {
+        String fileContent = "01234567890123456789";
+        String characterEncoding = "ascii";
+        byte[] buffer = fileContent.getBytes(characterEncoding);
+        // NOTE(review): presumably disabled on purpose -- the "xest" prefix hides it from JUnit 3; confirm
+        long reps = (long) Integer.MAX_VALUE / (long) buffer.length + 1000000L;
+
+        logger.info("writing " + (reps * buffer.length)
+                + " bytes to testHugeReplayCharSequence.txt");
+        RecordingOutputStream ros = writeTestStream(buffer, 0,
+                "testHugeReplayCharSequence.txt", reps);
+        ReplayCharSequence rcs = getReplayCharSequence(ros,Charset.forName(characterEncoding));
+
+        if (reps * fileContent.length() > (long) Integer.MAX_VALUE) {
+            assertTrue("ReplayCharSequence has wrong length (length()="
+                    + rcs.length() + ") (should be " + Integer.MAX_VALUE + ")",
+                    rcs.length() == Integer.MAX_VALUE);
+        } else {
+            assertEquals("ReplayCharSequence has wrong length (length()="
+                    + rcs.length() + ") (should be "
+                    + (reps * fileContent.length()) + ")",
+                    reps * (long) fileContent.length(), (long) rcs.length());
+        }
+
+        // boundary cases or something
+        for (int index : new int[] { 0, rcs.length() / 4, rcs.length() / 2,
+                rcs.length() - 1, rcs.length() / 4 }) {
+            // logger.info("testing char at index=" +
+            // NumberFormat.getInstance().format(index));
+            assertEquals("Characters don't match (index="
+                    + NumberFormat.getInstance().format(index) + ")",
+                    fileContent.charAt(index % fileContent.length()), rcs
+                            .charAt(index));
+        }
+
+        // check that out of bounds indices throw exception
+        for (int n : new int[] { -1, Integer.MIN_VALUE, rcs.length() + 1 }) {
+            try {
+                String message = "rcs.charAt(" + n + ")=" + rcs.charAt(n)
+                        + " ?!? -- expected IndexOutOfBoundsException";
+                logger.severe(message);
+                fail(message);
+            } catch (IndexOutOfBoundsException e) {
+                logger.info("got expected exception: " + e);
+            }
+        }
+
+        // check some characters at random spots & kinda stress test the
+        // system's memory mapping facility
+        Random rand = new Random(0); // seed so we get the same ones each time
+        for (int i = 0; i < 5000; i++) {
+            int index = rand.nextInt(rcs.length());
+            // logger.info(i + ". testing char at index=" +
+            // NumberFormat.getInstance().format(index));
+            assertEquals("Characters don't match (index="
+                    + NumberFormat.getInstance().format(index) + ")",
+                    fileContent.charAt(index % fileContent.length()), rcs
+                            .charAt(index));
+        }
+    }
+
+    /**
+     * Accessing characters test.
+     *
+     * Checks that characters in the rcs are in sequence.
+     *
+     * @param rcs The ReplayCharSequence to try out.
+     */
+    private void accessingCharacters(CharSequence rcs) {
+        long timestamp = (new Date()).getTime();
+        int seeks = 0;
+        for (int i = (INCREMENT * 2); (i + INCREMENT) < rcs.length();
+                i += INCREMENT) {
+            checkCharacter(rcs, i);
+            seeks++;
+            for (int j = i - INCREMENT; j < i; j++) {
+                checkCharacter(rcs, j);
+                seeks++;
+            }
+        }
+        // Note that printing out below breaks cruisecontrols drawing
+        // of the xml unit test results because it outputs disallowed
+        // xml characters.
+        logger.fine(rcs + " seeks count " + seeks + " in " +
+            ((new Date().getTime()) - timestamp) + " milliseconds.");
+    }
+
+    /**
+     * Check the character read.
+     *
+     * Throws assertion if not expected result.
+     *
+     * @param rcs ReplayCharSequence to read from.
+     * @param i Character offset.
+     */
+    private void checkCharacter(CharSequence rcs, int i) {
+        int c = rcs.charAt(i);
+        assertTrue("Character " + Integer.toString(c) + " at offset " + i +
+            " unexpected.", (c % SEQUENCE_LENGTH) == (i % SEQUENCE_LENGTH));
+    }
+
+    /**
+     * Writes content (memReps + fileReps) times to a new RecordingOutputStream
+     * constructed with content.length * memReps and a tmp-dir file for baseName.
+     * @return the RecordingOutputStream, already closed
+     */
+    private RecordingOutputStream writeTestStream(byte[] content,
+            int memReps, String baseName, long fileReps) throws IOException {
+        String backingFilename = FileUtils.maybeRelative(getTmpDir(),baseName).getAbsolutePath();
+        RecordingOutputStream ros = new RecordingOutputStream(
+                content.length * memReps,
+                backingFilename);
+        ros.open();
+        ros.markMessageBodyBegin();
+        for(long i = 0; i < (memReps+fileReps); i++) {
+            // fill buffer (repeat MULTIPLIER times) and
+            // overflow to disk (also MULTIPLIER times)
+            ros.write(content);
+        }
+        ros.close();
+        return ros;
+    }
+
+
+    /**
+     * Fill a buffer w/ regular progression of single-byte
+     * (and <= 127) characters.
+     * @param buffer Buffer to fill.
+     * @return The buffer we filled.
+     */
+    private byte [] fillBufferWithRegularContent(byte [] buffer) {
+        int index = 0;
+        for (int i = 0; i < buffer.length; i++) {
+            buffer[i] = (byte) (index & 0x00ff);
+            index++;
+            if (index >= SEQUENCE_LENGTH) {
+                // Reset the index.
+                index = 0;
+            }
+        }
+        return buffer;
+    }
+
+    public void testCheckParameters()
+    {
+        // TODO.
+    }
+}
diff --git a/src/test/java/org/archive/io/RepositionableInputStreamTest.java b/src/test/java/org/archive/io/RepositionableInputStreamTest.java
new file mode 100644
index 00000000..1c7cc74c
--- /dev/null
+++ b/src/test/java/org/archive/io/RepositionableInputStreamTest.java
@@ -0,0 +1,70 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.PrintWriter;
+
+import org.archive.util.TmpDirTestCase;
+
+public class RepositionableInputStreamTest extends TmpDirTestCase {
+    private File testFile;
+    private static final String LINE = "0123456789abcdefghijklmnopqrstuv";
+    /** Creates a test file in the tmp dir holding LINE repeated 100 times. */
+    protected void setUp() throws Exception {
+        super.setUp();
+        this.testFile = new File(getTmpDir(), this.getClass().getName());
+        PrintWriter pw = new PrintWriter(new FileOutputStream(testFile));
+        for (int i = 0; i < 100; i++) {
+            pw.print(LINE);
+        }
+        pw.close();
+    }
+    protected void tearDown() throws Exception {
+        super.tearDown();
+    }
+    /** Checks that read() and position() agree while seeking back and forth. */
+    public void testname() throws Exception {
+        // Make buffer awkward size so we run into buffers spanning issues.
+        RepositionableInputStream ris = new RepositionableInputStream(
+            new FileInputStream(this.testFile), 57);
+        int c = ris.read();
+        assertEquals(1, ris.position());
+        ris.read();
+        ris.position(0);
+        assertEquals(0, ris.position());
+        int c1 = ris.read();
+        assertEquals(c, c1);
+        ris.position(0);
+        byte [] bytes = new byte[LINE.length()];
+        for (int i = 0; i < 10; i++) {
+            ris.read(bytes, 0, LINE.length());
+            assertEquals(LINE, new String(bytes));
+            assertEquals((long) (i + 1) * LINE.length(), ris.position());
+        }
+        long p = ris.position();
+        ris.position(p - LINE.length());
+        assertEquals(p - LINE.length(), ris.position());
+        c = ris.read();
+        assertEquals(c1, c);
+        ris.close(); // release the handle on the test file
+    }
+}
diff --git a/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java b/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java
new file mode 100644
index 00000000..f0be6506
--- /dev/null
+++ b/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java
@@ -0,0 +1,122 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.arc;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.util.Arrays;
+
+import org.archive.io.WriterPool;
+import org.archive.io.WriterPoolMember;
+import org.archive.io.WriterPoolSettings;
+import org.archive.util.TmpDirTestCase;
+
+
+/**
+ * Test ARCWriterPool
+ */
+@SuppressWarnings("deprecation")
+public class ARCWriterPoolTest extends TmpDirTestCase {
+    private static final String PREFIX = "TEST";
+    private static final String CONTENT = "Any old content";
+    private static final int MAX_ACTIVE = 3;
+    private static final int MAX_WAIT_MILLISECONDS = 100;
+
+    /** Borrows every writer, writes with each, then returns them in reverse order. */
+    public void testARCWriterPool() throws Exception {
+        cleanUpOldFiles(PREFIX);
+        WriterPool pool = new ARCWriterPool(getSettings(true),
+            MAX_ACTIVE, MAX_WAIT_MILLISECONDS);
+        WriterPoolMember [] writers = new WriterPoolMember[MAX_ACTIVE];
+        ByteArrayOutputStream baos = newContentStream();
+        borrowAllAndWrite(pool, writers, baos);
+
+        // Pool is maxed out. New behavior is that additional requests
+        // block as long as necessary -- so no longer testing for timeout/
+        // exception
+
+        returnAllAndCheckCounts(pool, writers);
+        pool.close();
+    }
+
+    /** Invalidates the last borrowed writer, then borrows and drains the pool again. */
+    public void testInvalidate() throws Exception {
+        cleanUpOldFiles(PREFIX);
+        WriterPool pool = new ARCWriterPool(getSettings(true),
+            MAX_ACTIVE, MAX_WAIT_MILLISECONDS);
+        WriterPoolMember [] writers = new WriterPoolMember[MAX_ACTIVE];
+        ByteArrayOutputStream baos = newContentStream();
+        borrowAllAndWrite(pool, writers, baos);
+
+        WriterPoolMember writer2Invalidate = writers[pool.getNumActive() - 1];
+        writers[pool.getNumActive() - 1] = null;
+        pool.invalidateFile(writer2Invalidate);
+        for (int i = 0; i < (MAX_ACTIVE - 1); i++) {
+            if (writers[i] != null) {
+                pool.returnFile(writers[i]);
+            }
+        }
+
+        borrowAllAndWrite(pool, writers, baos);
+        returnAllAndCheckCounts(pool, writers);
+        pool.close();
+    }
+
+    /** Returns a stream pre-filled with the bytes of CONTENT. */
+    private ByteArrayOutputStream newContentStream() throws Exception {
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        baos.write(CONTENT.getBytes());
+        return baos;
+    }
+
+    /** Fills writers from the pool, asserting the active count and writing once per borrow. */
+    private void borrowAllAndWrite(WriterPool pool, WriterPoolMember[] writers,
+            ByteArrayOutputStream baos) throws Exception {
+        for (int i = 0; i < writers.length; i++) {
+            writers[i] = pool.borrowFile();
+            assertEquals("Number active", i + 1, pool.getNumActive());
+            ((ARCWriter) writers[i]).write("http://one.two.three", "no-type",
+                "0.0.0.0", 1234567890, CONTENT.length(), baos);
+        }
+    }
+
+    /** Returns writers last-to-first, asserting active and idle counts each time. */
+    private void returnAllAndCheckCounts(WriterPool pool,
+            WriterPoolMember[] writers) throws Exception {
+        for (int i = writers.length - 1; i >= 0; i--) {
+            pool.returnFile(writers[i]);
+            assertEquals("Number active", i, pool.getNumActive());
+            assertEquals("Number idle", writers.length - pool.getNumActive(),
+                pool.getNumIdle());
+        }
+    }
+
+    /** Builds pool settings that use PREFIX, the tmp dir and the given compression flag. */
+    private WriterPoolSettings getSettings(final boolean isCompressed) {
+        File [] files = {getTmpDir()};
+        return new WriterPoolSettingsData(
+            PREFIX,
+            "${prefix}-${timestamp17}-${serialno}-${heritrix.hostname}",
+            ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE,
+            isCompressed,
+            Arrays.asList(files),
+            null);
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/org/archive/io/arc/ARCWriterTest.java b/src/test/java/org/archive/io/arc/ARCWriterTest.java
new file mode 100644
index 00000000..f6e2bf6a
--- /dev/null
+++ b/src/test/java/org/archive/io/arc/ARCWriterTest.java
@@ -0,0 +1,699 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.arc;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.NullInputStream;
+import org.apache.commons.io.output.NullOutputStream;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.ArchiveRecordHeader;
+import org.archive.io.ReplayInputStream;
+import org.archive.io.WriterPoolMember;
+import org.archive.io.WriterPoolSettings;
+import org.archive.util.ArchiveUtils;
+import org.archive.util.TmpDirTestCase;
+
+import com.google.common.io.Closeables;
+
+
+/**
+ * Test ARCWriter class.
+ *
+ * This code exercises ARCWriter AND ARCReader. First it writes ARCs w/
+ * ARCWriter. Then it validates what was written w/ ARCReader.
+ *
+ * @author stack
+ */
+public class ARCWriterTest
+extends TmpDirTestCase implements ARCConstants {
+ /**
+  * ARCWriter for writing bad ARCs: emits configurable trailing junk bytes.
+  */
+ public class CorruptibleARCWriter extends ARCWriter {
+     byte[] endJunk = null;
+     public CorruptibleARCWriter(AtomicInteger serial_no, WriterPoolSettings settings) {
+         super(serial_no, settings);
+     }
+     /** Writes the junk bytes, when set, before the regular post-record tasks. */
+     @Override
+     protected void postWriteRecordTasks() throws IOException {
+         final byte[] junk = this.endJunk;
+         if (junk != null) {
+             this.write(junk);
+         }
+         super.postWriteRecordTasks();
+     }
+     /** Sets the bytes written by postWriteRecordTasks(); null disables the junk. */
+     public void setEndJunk(byte[] b) throws IOException {
+         this.endJunk = b;
+     }
+ }
+
+ /**
+ * Suffix appended to the "${prefix}-" name template of ARC files made by JUNIT.
+ */
+ private static final String SUFFIX = "JUNIT";
+
+ private static final String SOME_URL = "http://www.archive.org/test/"; // prefix iterateRecords() expects record URLs to start with
+
+
+ private static final AtomicInteger SERIAL_NO = new AtomicInteger(); // shared counter passed to every ARCWriter built in this test
+
+ /*
+ * @see TestCase#setUp() -- no fixture of its own, only the superclass set-up runs
+ */
+ protected void setUp() throws Exception {
+ super.setUp();
+ }
+
+ /*
+ * @see TestCase#tearDown() -- nothing to release here, only the superclass tear-down runs
+ */
+ protected void tearDown() throws Exception {
+ super.tearDown();
+ }
+
+ protected static String getContent() {
+ return getContent(null); // null index selects the generic "Some Page" variant
+ }
+
+ protected static String getContent(String indexStr) {
+ String page = (indexStr != null)? "Page #" + indexStr: "Some Page"; // page label; generic when no index is given
+ return "HTTP/1.1 200 OK\r\n" + // status line
+ "Content-Type: text/html\r\n\r\n" + // sole header, then the blank line that ends the header block
+ "" + page +
+ "" + // NOTE(review): these empty literals look like leftovers of stripped markup -- confirm against upstream
+ "" + page +
+ "";
+ }
+
+ @SuppressWarnings("deprecation")
+ protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index)
+         throws IOException {
+     final String id = Integer.toString(index);
+     final ByteArrayOutputStream body = new ByteArrayOutputStream();
+     // Timestamp for the record, taken as a 14-digit date string.
+     final String timestamp = ArchiveUtils.get14DigitDate();
+     final byte[] content = getContent(id).getBytes();
+     body.write(content);
+     // Add the newline between records back in.
+     body.write("\n".getBytes());
+     // The reported length covers the content plus that trailing newline.
+     final int recordLength = content.length + 1;
+     final String url = "http://www.one.net/id=" + id;
+     arcWriter.write(url, "text/html", "0.1.2.3",
+             Long.parseLong(timestamp), recordLength, body);
+     return recordLength;
+ }
+
+ /**
+  * Writes recordCount generated HTTP records through a new ARCWriter
+  * that targets the tmp dir, after removing old files for baseName.
+  * @return the writer's file, asserted to exist once the writer is closed
+  */
+ private File writeRecords(String baseName, boolean compress,
+         long maxSize, int recordCount) throws IOException {
+     cleanUpOldFiles(baseName);
+     final File[] outputDirs = {getTmpDir()};
+     final WriterPoolSettings settings = new WriterPoolSettingsData(
+             baseName, "${prefix}-" + SUFFIX, maxSize, compress,
+             Arrays.asList(outputDirs), null);
+     final ARCWriter arcWriter = new ARCWriter(SERIAL_NO, settings);
+     assertNotNull(arcWriter);
+     int written = 0;
+     while (written < recordCount) {
+         writeRandomHTTPRecord(arcWriter, written);
+         written++;
+     }
+     arcWriter.close();
+     final File arcFile = arcWriter.getFile();
+     assertTrue("Doesn't exist: " + arcFile.getAbsolutePath(),
+             arcFile.exists());
+     return arcFile;
+ }
+
+ /**
+  * Validates the ARC, then re-reads every record by offset, last to first.
+  * @param recordCount expected number of records; -1 validates without a count
+  */
+ private void validate(File arcFile, int recordCount)
+         throws FileNotFoundException, IOException {
+     ARCReader reader = ARCReaderFactory.get(arcFile);
+     assertNotNull(reader);
+     final List metaDatas = (recordCount == -1)
+             ? reader.validate() : reader.validate(recordCount);
+     reader.close();
+     // Reopen the arc for each absolute get so that no reader state is
+     // shared with the validation pass above or between records.
+     for (int i = metaDatas.size() - 1; i >= 0; i--) {
+         reader = ARCReaderFactory.get(arcFile);
+         ARCRecordMetaData meta = (ARCRecordMetaData) metaDatas.get(i);
+         String mimeType = reader.get(meta.getOffset()).getHeader().getMimetype();
+         assertTrue("Record is bogus",
+                 mimeType != null && mimeType.length() > 0);
+         reader.close();
+     }
+     if (recordCount != -1) {
+         assertEquals("Metadata count not as expected", recordCount,
+                 metaDatas.size());
+     }
+     for (Object metaData : metaDatas) {
+         assertTrue("Record is empty",
+                 ((ARCRecordMetaData) metaData).getLength() > 0);
+     }
+ }
+
+ public void testCheckARCFileSize()
+ throws IOException {
+ runCheckARCFileSizeTest("checkARCFileSize", false); // uncompressed variant
+ }
+
+ public void testCheckARCFileSizeCompressed()
+ throws IOException {
+ runCheckARCFileSizeTest("checkARCFileSize", true); // compressed variant; same base name as the uncompressed test
+ }
+
+ public void testWriteRecord() throws IOException {
+     final int written = 2;
+     final File arc = writeRecords("writeRecord", false,
+             DEFAULT_MAX_ARC_FILE_SIZE, written);
+     validate(arc, 1 + written); // one extra for the ARC header record
+ }
+
+ /**
+  * Reads the second record directly by its offset and checks that an
+  * iterator opened at that offset sees one record fewer than a full scan
+  * (the scan counts the leading record that the offset skips).
+  */
+ public void testRandomAccess() throws IOException {
+     final int recordCount = 3;
+     File arcFile = writeRecords("writeRecord", true,
+             DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
+     ARCReader reader = ARCReaderFactory.get(arcFile);
+     // Scan all records, remembering URL and offset of the second one.
+     String url = null;
+     long offset = -1;
+     long totalRecords = 0;
+     for (final Iterator i = reader.iterator(); i.hasNext(); totalRecords++) {
+         ARCRecord ar = (ARCRecord) i.next();
+         if (totalRecords == 1) {
+             url = ar.getMetaData().getUrl();
+             offset = ar.getMetaData().getOffset();
+         }
+     }
+     reader.close();
+
+     // Random access: a reader positioned at the offset yields that record.
+     reader = ARCReaderFactory.get(arcFile, offset);
+     ArchiveRecord ar = reader.get();
+     assertEquals(url, ar.getHeader().getUrl());
+     ar.close();
+     reader.close();
+
+     // Get reader again. See how iterator works with offset
+     reader = ARCReaderFactory.get(arcFile, offset);
+     int count = 0;
+     for (final Iterator i = reader.iterator(); i.hasNext(); i.next()) {
+         count++;
+     }
+     reader.close();
+     // Everything from the second record on: one fewer than the full scan.
+     assertEquals(totalRecords - 1, count);
+ }
+
+ public void testWriteRecordCompressed() throws IOException {
+     final int written = 2;
+     final File arc = writeRecords("writeRecordCompressed", true,
+             DEFAULT_MAX_ARC_FILE_SIZE, written);
+     validate(arc, 1 + written); // one extra for the ARC header record
+ }
+
+    public void testWriteGiantRecord() throws IOException {
+        // Target stream is a NullOutputStream wrapped in a PrintStream.
+        PrintStream sink = new PrintStream(new NullOutputStream());
+        File placeholder = new File("dummy");
+        WriterPoolSettingsData settings = new WriterPoolSettingsData(
+                "",
+                "",
+                -1,
+                false,
+                null,
+                null);
+        ARCWriter giantWriter = new ARCWriter(SERIAL_NO, sink, placeholder, settings);
+        assertNotNull(giantWriter);
+
+        // Declare a 3 GB record and back it with a NullInputStream of the
+        // same length, stamped with the current time.
+        long stampMillis = System.currentTimeMillis();
+        long declaredLength = org.apache.commons.io.FileUtils.ONE_GB * 3;
+
+        giantWriter.write("dummy:uri", "application/octet-stream",
+            "0.1.2.3", stampMillis, declaredLength,
+            new NullInputStream(declaredLength));
+        giantWriter.close();
+    }
+
+    private void runCheckARCFileSizeTest(String baseName, boolean compress)
+            throws FileNotFoundException, IOException {
+        final int written = 15; // validate() is asked for one more record than written
+        validate(writeRecords(baseName, compress, 1024, written), written + 1);
+    }
+
+    protected CorruptibleARCWriter createARCWriter(String name, boolean compress) {
+        // The temp dir is the only output directory handed to the settings.
+        File[] outputDirs = new File[] {getTmpDir()};
+        WriterPoolSettingsData settings = new WriterPoolSettingsData(
+                name,
+                "${prefix}-" + SUFFIX,
+                DEFAULT_MAX_ARC_FILE_SIZE,
+                compress,
+                Arrays.asList(outputDirs),
+                null);
+        return new CorruptibleARCWriter(SERIAL_NO, settings);
+    }
+
+    protected static ByteArrayInputStream getBais(String str)
+    throws IOException {
+        return new ByteArrayInputStream(str.getBytes("UTF-8")); // fixed charset, not the platform default
+    }
+
+ /**
+ * Writes a record, suppressing normal length-checks (so that
+ * intentionally malformed records may be written).
+ */
+    protected static void writeRecord(ARCWriter writer, String url,
+            String type, int len, ByteArrayInputStream bais)
+            throws IOException {
+        final long fetchTime = new Date().getTime(); // record is stamped with "now"
+        writer.write(url, type, "192.168.1.1", fetchTime, len, bais, false);
+    }
+
+    protected int iterateRecords(ARCReader r) throws IOException {
+        int seen = 0;
+        Iterator records = r.iterator();
+        while (records.hasNext()) {
+            ARCRecord current = (ARCRecord) records.next();
+            current.close();
+            if (seen != 0) { // the first record is exempt from the URL check
+                String recordUrl = current.getMetaData().getUrl();
+                assertTrue("Unexpected URL " + recordUrl, recordUrl.startsWith(SOME_URL));
+            }
+            seen++;
+        }
+        return seen;
+    }
+
+    protected CorruptibleARCWriter createArcWithOneRecord(String name,
+            boolean compressed) throws IOException {
+        // The writer is returned without being closed here.
+        CorruptibleARCWriter oneRecordWriter = createARCWriter(name, compressed);
+        String body = getContent();
+        ByteArrayInputStream bodyStream = getBais(body);
+        writeRecord(oneRecordWriter, SOME_URL, "text/html", body.length(), bodyStream);
+        return oneRecordWriter;
+    }
+
+    public void testSpaceInURL() {
+        String eMessage = null; // stays null when no IOException is thrown
+        try {
+            holeyUrl("testSpaceInURL", false, " ");
+        } catch (IOException e) {
+            eMessage = e.getMessage();
+        }
+        assertTrue("Didn't get expected exception: " + eMessage, eMessage != null
+            && eMessage.startsWith("Metadata line doesn't match"));
+    }
+
+    public void testTabInURL() {
+        String eMessage = null; // stays null when no IOException is thrown
+        try {
+            holeyUrl("testTabInURL", false, "\t");
+        } catch (IOException e) {
+            eMessage = e.getMessage();
+        }
+        assertTrue("Didn't get expected exception: " + eMessage, eMessage != null
+            && eMessage.startsWith("Metadata line doesn't match"));
+    }
+
+    protected void holeyUrl(String name, boolean compress, String urlInsert)
+            throws IOException {
+        ARCWriter arc = null;
+        try {
+            arc = createArcWithOneRecord(name, compress);
+            // Second record: its URL has urlInsert spliced into the middle.
+            String body = getContent();
+            String brokenUrl = SOME_URL + urlInsert + "/index.html";
+            writeRecord(arc, brokenUrl, "text/html", body.length(), getBais(body));
+        } finally {
+            Closeables.close(arc, true);
+        }
+    }
+
+// If uncompressed, length has to be right or parse will fail.
+//
+// public void testLengthTooShort() throws IOException {
+// lengthTooShort("testLengthTooShort-" + PREFIX, false);
+// }
+
+    public void testLengthTooShortCompressed() throws IOException {
+        lengthTooShort("testLengthTooShortCompressed", true, false); // compress=true, strict=false
+    }
+
+    public void testLengthTooShortCompressedStrict()
+    throws IOException {
+        String eMessage = null; // stays null when no RuntimeException is thrown
+        try {
+            lengthTooShort("testLengthTooShortCompressedStrict",
+                true, true);
+        } catch (RuntimeException e) {
+            eMessage = e.getMessage();
+        }
+        assertTrue("Didn't get expected exception: " + eMessage, eMessage != null
+            && eMessage.startsWith("java.io.IOException: Record STARTING at"));
+    }
+
+    protected void lengthTooShort(String name, boolean compress, boolean strict)
+    throws IOException {
+        CorruptibleARCWriter writer = null;
+        try {
+            writer = createArcWithOneRecord(name, compress);
+            // Add some bytes on the end to mess up the record.
+            String content = getContent();
+            ByteArrayInputStream bais = getBais(content+"SOME TRAILING BYTES");
+            writeRecord(writer, SOME_URL, "text/html",
+                content.length(), bais); // declared length leaves out the trailing bytes
+            writer.setEndJunk("SOME TRAILING BYTES".getBytes());
+            writeRecord(writer, SOME_URL, "text/html",
+                content.length(), getBais(content));
+        } finally {
+            Closeables.close(writer, true);
+        }
+        // The ARC is now written and closed; read it back below.
+        // Catch System.err into a byte stream.
+        ByteArrayOutputStream os = new ByteArrayOutputStream();
+        PrintStream origErr = System.err;
+        ARCReader r = null;
+        try {
+            System.setErr(new PrintStream(os));
+
+            r = ARCReaderFactory.get(writer.getFile());
+            r.setStrict(strict);
+            int count = iterateRecords(r);
+            assertTrue("Count wrong " + count, count == 4); // three written records, presumably plus the ARC header record
+
+            // Make sure we get the warning string which complains about the
+            // trailing bytes.
+            String err = os.toString();
+            assertTrue("No message " + err, err.startsWith("WARNING") &&
+                (err.indexOf("Record STARTING at") > 0));
+            r.close();
+        } finally {
+            Closeables.close(r, true); // also reached after the r.close() above on the success path
+            System.setErr(origErr); // always restore the real System.err
+        }
+    }
+
+// If uncompressed, length has to be right or parse will fail.
+//
+// public void testLengthTooLong()
+// throws IOException {
+// lengthTooLong("testLengthTooLongCompressed-" + PREFIX,
+// false, false);
+// }
+
+    public void testLengthTooLongCompressed() throws IOException {
+        final boolean compress = true;
+        final boolean strict = false;
+        lengthTooLong("testLengthTooLongCompressed", compress, strict);
+    }
+
+    public void testLengthTooLongCompressedStrict() {
+        String eMessage = null; // stays null when no IOException is thrown
+        try {
+            lengthTooLong("testLengthTooLongCompressed",
+                true, true);
+        } catch (IOException e) {
+            eMessage = e.getMessage();
+        }
+        assertTrue("Didn't get expected exception: " + eMessage, eMessage != null
+            && eMessage.startsWith("Premature EOF before end-of-record"));
+    }
+
+    protected void lengthTooLong(String name, boolean compress,
+            boolean strict)
+    throws IOException {
+        ARCWriter writer = createArcWithOneRecord(name, compress);
+        // Add a record with a length that is too long.
+        String content = getContent();
+        writeRecord(writer, SOME_URL+"2", "text/html",
+            content.length() + 10, getBais(content)); // declares 10 bytes more than the stream holds
+        writeRecord(writer, SOME_URL+"3", "text/html",
+            content.length(), getBais(content)); // well-formed record after the bad one
+        writer.close();
+
+        // Catch System.err.
+        ByteArrayOutputStream os = new ByteArrayOutputStream();
+        
+        PrintStream origErr = System.err;
+        ARCReader r = null;
+        try {
+            System.setErr(new PrintStream(os));
+
+            r = ARCReaderFactory.get(writer.getFile());
+            r.setStrict(strict);
+            int count = iterateRecords(r);
+            assertTrue("Count wrong " + count, count == 4); // three written records, presumably plus the ARC header record
+
+            // Make sure we get the warning string which complains about the
+            // premature end of the over-long record.
+            String err = os.toString();
+            assertTrue("No message " + err, 
+                err.startsWith("WARNING Premature EOF before end-of-record"));
+        } finally {
+            Closeables.close(r, true);
+            System.setErr(origErr); // always restore the real System.err
+        }
+    }
+
+    public void testGapError() throws IOException {
+        ARCWriter writer = createArcWithOneRecord("testGapError", true);
+        String content = getContent();
+        // Misbehaving RIS: reports -1 remaining whenever the real answer is 0.
+        ReplayInputStream ris = new ReplayInputStream(content.getBytes(),
+                content.length(), null) {
+            public long remaining() {
+                long actual = super.remaining();
+                return (actual == 0) ? -1 : actual;
+            }
+        };
+        String gapMessage = null;
+        try {
+            long now = new Date().getTime();
+            writer.write(SOME_URL, "text/html", "192.168.1.1", now, content.length(), ris);
+        } catch (IOException e) {
+            gapMessage = e.getMessage();
+        } finally {
+            IOUtils.closeQuietly(ris);
+        }
+        writer.close();
+        boolean sawGap = gapMessage != null
+            && gapMessage.indexOf("Gap between expected and actual") >= 0;
+        assertTrue("No gap when should be", sawGap);
+    }
+
+ /**
+ * Write an arc file for other tests to use.
+ * @param arcdir Directory to write to.
+ * @param compress True if file should be compressed.
+ * @return ARC written.
+ * @throws IOException
+ */
+    public static File createARCFile(File arcdir, boolean compress)
+            throws IOException {
+        // arcdir is the sole output directory given to the writer settings.
+        WriterPoolSettingsData settings = new WriterPoolSettingsData(
+                "",
+                "test",
+                DEFAULT_MAX_ARC_FILE_SIZE,
+                compress,
+                Arrays.asList(new File[] {arcdir}),
+                null);
+        ARCWriter arc = new ARCWriter(SERIAL_NO, settings);
+        String body = getContent();
+        writeRecord(arc, SOME_URL, "text/html", body.length(), getBais(body));
+        // Close before handing the file back.
+        arc.close();
+        return arc.getFile();
+    }
+
+// public void testSpeed() throws IOException {
+// ARCWriter writer = createArcWithOneRecord("speed", true);
+// // Add a record with a length that is too long.
+// String content = getContent();
+// final int count = 100000;
+// logger.info("Starting speed write of " + count + " records.");
+// for (int i = 0; i < count; i++) {
+// writeRecord(writer, SOME_URL, "text/html", content.length(),
+// getBaos(content));
+// }
+// writer.close();
+// logger.info("Finished speed write test.");
+// }
+
+
+    public void testValidateMetaLine() throws Exception {
+        final String line = "http://www.aandw.net/images/walden2.png " +
+            "128.197.34.86 20060111174224 image/png 2160"; // five space-separated fields
+        ARCWriter w = createARCWriter("testValidateMetaLine", true);
+        try {
+            w.validateMetaLine(line); // bare line, no terminator; the test passes if no call throws
+            w.validateMetaLine(line + LINE_SEPARATOR);
+            w.validateMetaLine(line + "\\r\\n"); // NOTE(review): appends literal backslash characters, not CR LF -- confirm intended
+        } finally {
+            w.close();
+        }
+    }
+
+    public void testArcRecordOffsetReads() throws Exception {
+        ARCReader reader = getSingleRecordReader("testArcRecordInBufferStream");
+        ARCRecord record = getSingleRecord(reader);
+        // Regression check: read a handful of bytes into the middle of a
+        // buffer (offset 13 of 17). This used to fail because of bad math
+        // when locating the read position in the buffer.
+        final byte[] target = new byte[17];
+        final int wanted = 4;
+        int got = 0;
+        while (got < wanted) {
+            int justRead = record.read(target, 13 + got, wanted - got);
+            got += justRead;
+            assertTrue(got > 0);
+        }
+        reader.close();
+    }
+
+ // available should always be >= 0; extra read()s should all give EOF
+    public void testArchiveRecordAvailableConsistent() throws Exception {
+        ARCReader reader = getSingleRecordReader("testArchiveRecordAvailableConsistent");
+        ARCRecord record = getSingleRecord(reader);
+        // Drain the record one byte at a time through the no-arg read().
+        while (record.read() >= 0) {
+            // keep reading until end of record
+        }
+        // Past end of record: available() must never go negative and every
+        // further read() must keep returning -1.
+        for (int attempt = 0; attempt < 5; attempt++) {
+            assertTrue("available negative:"+record.available(), record.available()>=0);
+            assertEquals(-1, record.read());
+        }
+        reader.close();
+    }
+
+ // should always give -1 on repeated reads past EOR
+    public void testArchiveRecordEORConsistent() throws Exception {
+        ARCReader reader = getSingleRecordReader("testArchiveRecordEORConsistent");
+        ARCRecord record = getSingleRecord(reader);
+        readToEOS(record);
+        // Five more array reads past end of record; each must report -1.
+        for (int attempt = 1; attempt <= 5; attempt++) {
+            assertEquals(-1, record.read(new byte[1]));
+        }
+        reader.close();
+    }
+
+ // should not throw premature EOF when wrapped with BufferedInputStream
+ // [HER-1450] showed this was the case using Apache Tika
+    public void testArchiveRecordMarkSupport() throws Exception {
+        ARCReader reader = getSingleRecordReader("testArchiveRecordMarkSupport");
+        ARCRecord strictRecord = getSingleRecord(reader);
+        strictRecord.setStrict(true);
+        // Wrap in a BufferedInputStream to get mark/reset on top of the record.
+        InputStream buffered = new BufferedInputStream(strictRecord);
+        if (buffered.markSupported()) {
+            for (int pass = 0; pass < 3; pass++) {
+                readToEOS(buffered);
+                buffered.mark(buffered.available());
+                buffered.reset();
+            }
+            buffered.close();
+        }
+        reader.close();
+    }
+
+ /**
+ * Test a particular style of using the reader iterator. (Should
+ * possibly be on a reader-centric test class, but the best setup
+ * functionality is here.)
+ *
+ * @throws IOException
+ */
+    public void testReadIterator() throws IOException {
+        final int recordCount = 3;
+        File arcFile = writeRecords("writeRecord", true,
+            DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
+        ARCReader reader = ARCReaderFactory.get(arcFile);
+        Iterator<ArchiveRecord> it = reader.iterator(); // typed: a raw Iterator would need a cast on next()
+        while (it.hasNext()) {
+            ArchiveRecord next = it.next();
+            next.close();
+        }
+        reader.close();
+    }
+
+    protected void readToEOS(InputStream in) throws Exception {
+        // Read 1 KB chunks until read() returns a negative count; bytes are discarded.
+        final byte[] scratch = new byte[1024];
+        int got;
+        do {
+            got = in.read(scratch);
+        } while (got >= 0);
+    }
+
+    protected ARCReader getSingleRecordReader(String name) throws Exception {
+        // Write a compressed ARC holding one record and close its writer...
+        WriterPoolMember oneRecordArc = createArcWithOneRecord(name, true);
+        oneRecordArc.close();
+        // ...then open a reader on the file that was just written.
+        File arcFile = oneRecordArc.getFile();
+        return ARCReaderFactory.get(arcFile);
+    }
+
+    protected ARCRecord getSingleRecord(ARCReader r) {
+        final Iterator records = r.iterator();
+        records.next(); // discard the leading ARC meta record
+        // hasNext() is called here with its result unused, before the second next().
+        records.hasNext();
+        // What follows is the first and only real record in the ARC.
+        Object only = records.next();
+        return (ARCRecord) only;
+    }
+}
diff --git a/src/test/java/org/archive/io/warc/WARCWriterTest.java b/src/test/java/org/archive/io/warc/WARCWriterTest.java
new file mode 100644
index 00000000..35c68714
--- /dev/null
+++ b/src/test/java/org/archive/io/warc/WARCWriterTest.java
@@ -0,0 +1,512 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.warc;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.net.URI;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.archive.io.ArchiveRecord;
+import org.archive.io.ArchiveRecordHeader;
+import org.archive.io.UTF8Bytes;
+import org.archive.io.WriterPoolMember;
+import org.archive.uid.RecordIDGenerator;
+import org.archive.uid.UUIDGenerator;
+import org.archive.util.ArchiveUtils;
+import org.archive.util.TmpDirTestCase;
+import org.archive.util.anvl.ANVLRecord;
+
+/**
+ * Test Writer and Reader.
+ * @author stack
+ * @version $Date: 2006-08-29 19:35:48 -0700 (Tue, 29 Aug 2006) $ $Version$
+ */
+public class WARCWriterTest
+extends TmpDirTestCase implements WARCConstants {
+
+ private static final AtomicInteger SERIAL_NO = new AtomicInteger();
+
+ RecordIDGenerator generator = new UUIDGenerator();
+
+ /**
+     * Suffix to use for WARC files made by JUNIT.
+ */
+ private static final String SUFFIX = "JUNIT";
+
+ private static final String SOME_URL = "http://www.archive.org/test/";
+
+    @SuppressWarnings("unchecked")
+    public void testCheckHeaderLineValue() throws Exception {
+        WARCWriterPoolSettingsData settings = new WARCWriterPoolSettingsData(
+                "", "test", 1, false, Collections.EMPTY_LIST, Collections.EMPTY_LIST, generator);
+        WARCWriter writer = new WARCWriter(SERIAL_NO, settings);
+        writer.checkHeaderValue("one");
+        // A value containing a space has to be rejected.
+        IllegalArgumentException spaceRejection = null;
+        try {
+            writer.checkHeaderValue("with space");
+        } catch (IllegalArgumentException e) {
+            spaceRejection = e;
+        }
+        assertNotNull(spaceRejection);
+        IllegalArgumentException controlRejection = null;
+        try {
+            writer.checkHeaderValue("with\0x0000controlcharacter"); // "\0" is a NUL character
+        } catch (IllegalArgumentException e) {
+            controlRejection = e;
+        }
+        writer.close();
+        assertNotNull(controlRejection);
+    }
+
+    @SuppressWarnings("unchecked")
+    public void testMimetypes() throws IOException {
+        WARCWriter writer = new WARCWriter(SERIAL_NO,
+                new WARCWriterPoolSettingsData(
+                        "m","testM",1,false,Collections.EMPTY_LIST,Collections.EMPTY_LIST,generator));
+        writer.checkHeaderLineMimetypeParameter("text/xml");
+        writer.checkHeaderLineMimetypeParameter("text/xml+rdf");
+        assertEquals("text/plain; charset=SHIFT-JIS", // JUnit order: expected, actual
+            writer.checkHeaderLineMimetypeParameter("text/plain; charset=SHIFT-JIS"));
+        assertEquals("multipart/mixed; boundary=\"simple boundary\"", // folded line break collapsed
+            writer.checkHeaderLineMimetypeParameter("multipart/mixed; \r\n boundary=\"simple boundary\""));
+        writer.close(); // release the writer, as testCheckHeaderLineValue does
+    }
+
+    public void testWriteRecord() throws IOException {
+        List<File> outputDirs = Arrays.asList(new File[] {getTmpDir()});
+        String className = getClass().getName();
+
+        // First pass: uncompressed output.
+        WARCWriter plain = new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData(
+                className, "templateWR1", -1, false, outputDirs, null, generator));
+        writeFile(plain);
+        plain.close();
+
+        // Second pass: compressed output.
+        WARCWriter gzipped = new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData(
+                className, "templateWR2", -1, true, outputDirs, null, generator));
+        writeFile(gzipped);
+        // writeFile() has already closed the writer; this close() mirrors
+        // the one after the first pass.
+        gzipped.close();
+    }
+
+    private void writeFile(final WARCWriter writer) throws IOException {
+        try {
+            // One warcinfo record first, then the batch of metadata records.
+            writeWarcinfoRecord(writer);
+            writeBasicRecords(writer);
+        } finally {
+            writer.close();
+            writer.getFile().delete(); // the written file is not kept
+        }
+    }
+
+    private void writeWarcinfoRecord(WARCWriter writer)
+    throws IOException {
+        // Payload: a tiny ANVL document with two label/value pairs.
+        ANVLRecord payload = new ANVLRecord();
+        payload.addLabelValue("size", "1G");
+        payload.addLabelValue("operator", "igor");
+        byte[] payloadBytes = payload.getUTF8Bytes();
+
+        WARCRecordInfo info = new WARCRecordInfo();
+        info.setType(WARCRecordType.warcinfo);
+        info.setUrl(null);
+        info.setCreate14DigitDate(ArchiveUtils.getLog14Date());
+        info.setMimetype(ANVLRecord.MIMETYPE);
+        info.setExtraHeaders(null);
+        info.setEnforceLength(true);
+        info.setContentStream(new ByteArrayInputStream(payloadBytes));
+        info.setContentLength((long) payloadBytes.length);
+        info.setRecordId(
+            writer.generateRecordId(WARCWriter.TYPE, WARCRecordType.warcinfo.toString()));
+
+        writer.writeRecord(info);
+    }
+
+    protected void writeBasicRecords(final WARCWriter writer)
+    throws IOException {
+        // Extra header fields shared by every record written below.
+        ANVLRecord extraHeaders = new ANVLRecord();
+        extraHeaders.addLabelValue("x", "y");
+        extraHeaders.addLabelValue("a", "b");
+
+        // One WARCRecordInfo is reused for all ten records, so they all carry
+        // the same record id; only the content stream and length change.
+        WARCRecordInfo info = new WARCRecordInfo();
+        info.setType(WARCRecordType.metadata);
+        info.setUrl("http://www.archive.org/");
+        info.setCreate14DigitDate(ArchiveUtils.get14DigitDate());
+        info.setMimetype("no/type");
+        info.setEnforceLength(true);
+        info.setExtraHeaders(extraHeaders);
+        info.setRecordId(
+            new UUIDGenerator().getQualifiedRecordID(TYPE, WARCRecordType.metadata.toString()));
+
+        for (int index = 0; index < 10; index++) {
+            byte[] bodyBytes = (index + ". " + "Any old content.").getBytes(UTF8Bytes.UTF8);
+            info.setContentStream(new ByteArrayInputStream(bodyBytes));
+            info.setContentLength((long) bodyBytes.length);
+            writer.writeRecord(info);
+        }
+    }
+
+ /**
+ * @return Generic HTML Content.
+ */
+    protected static String getContent() {
+        return getContent(null); // null index selects the generic "Some Page" text
+    }
+
+ /**
+ * @return Generic HTML Content with mention of passed indexStr
+ * in title and body.
+ */
+    protected static String getContent(String indexStr) {
+        String page = (indexStr != null)? "Page #" + indexStr: "Some Page";
+        return "HTTP/1.1 200 OK\r\n" +                 // status line
+        "Content-Type: text/html\r\n\r\n" +             // header, then the blank line ending the head
+        "<html><head><title>" + page +                  // page text in the title...
+        "</title></head>" +
+        "<body>" + page +                               // ...and again in the body
+        "</body></html>";
+    }
+
+ /**
+ * Write random HTML Record.
+ * @param w Where to write.
+ * @param index An index to put into content.
+ * @return Length of record written.
+ * @throws IOException
+ */
+    protected int writeRandomHTTPRecord(WARCWriter w, int index)
+    throws IOException {
+        WARCRecordInfo recordInfo = new WARCRecordInfo();
+        recordInfo.setType(WARCRecordType.resource);
+        recordInfo.setCreate14DigitDate(ArchiveUtils.get14DigitDate());
+        recordInfo.setMimetype("text/html; charset=UTF-8");
+        recordInfo.setRecordId(w.generateRecordId(null));
+        recordInfo.setEnforceLength(true);
+
+        String indexStr = Integer.toString(index);
+        recordInfo.setUrl("http://www.one.net/id=" + indexStr);
+
+        // Encode explicitly as UTF-8, matching both the declared mimetype
+        // charset and writeBasicRecords(), instead of the platform default.
+        byte[] record = getContent(indexStr).getBytes(UTF8Bytes.UTF8);
+        recordInfo.setContentLength((long) record.length);
+
+        // The array backs the stream directly; no intermediate copy is needed.
+        recordInfo.setContentStream(new ByteArrayInputStream(record));
+
+        // Add the IP as a named field.
+        recordInfo.addExtraHeader(NAMED_FIELD_IP_LABEL, "127.0.0.1");
+
+        w.writeRecord(recordInfo);
+        return record.length;
+    }
+
+ /**
+ * Fill a WARC with HTML Records.
+ * @param baseName WARC basename.
+ * @param compress Whether to compress or not.
+ * @param maxSize Maximum WARC size.
+ * @param recordCount How many records.
+ * @return The written file.
+ * @throws IOException
+ */
+    private File writeRecords(String baseName, boolean compress,
+            int maxSize, int recordCount)
+            throws IOException {
+        cleanUpOldFiles(baseName);
+        List<File> outputDirs = Arrays.asList(new File[] {getTmpDir()});
+        WARCWriterPoolSettingsData settings = new WARCWriterPoolSettingsData(
+                baseName + '-' + SUFFIX, "${prefix}", maxSize, compress, outputDirs, null, generator);
+        WARCWriter w = new WARCWriter(SERIAL_NO, settings);
+        assertNotNull(w);
+        for (int written = 0; written < recordCount; written++) {
+            writeRandomHTTPRecord(w, written);
+        }
+        w.close();
+        File result = w.getFile();
+        assertTrue("Doesn't exist: " +  result.getAbsolutePath(), result.exists());
+        return result;
+    }
+
+ /**
+ * Run validation of passed file.
+ * @param f File to validate.
+ * @param recordCount Expected count of records.
+ * @throws FileNotFoundException
+ * @throws IOException
+ */
+    private void validate(File f, int recordCount)
+    throws FileNotFoundException, IOException {
+        WARCReader reader = WARCReaderFactory.get(f);
+        assertNotNull(reader);
+        List headers = null;
+        if (recordCount == -1) {
+            headers = reader.validate();
+        } else {
+            headers = reader.validate(recordCount);
+        }
+        reader.close();
+
+        // Now, run through each of the records doing absolute get going from
+        // the end to start.  Reopen the WARC so no context between this test
+        // and the previous.
+
+        for (int i = headers.size() - 1; i >= 0; i--) {
+            reader = WARCReaderFactory.get(f);
+            ArchiveRecordHeader h = (ArchiveRecordHeader)headers.get(i);
+            ArchiveRecord r = reader.get(h.getOffset());
+            String mimeType = r.getHeader().getMimetype();
+            assertTrue("Record is bogus",
+                mimeType != null && mimeType.length() > 0);
+            reader.close();
+        }
+        // A recordCount of -1 means no expected count was given, so the size check is skipped.
+        assertTrue("Metadatas not equal", recordCount == -1 || headers.size() == recordCount);
+        for (Iterator i = headers.iterator(); i.hasNext();) {
+            ArchiveRecordHeader r = (ArchiveRecordHeader)i.next();
+            assertTrue("Record is empty", r.getLength() > 0);
+        }
+    }
+
+    public void testWriteRecords() throws IOException {
+        final int payloadRecords = 2;
+        final int expectedRecords = payloadRecords + 1; // plus the header record
+        File written = writeRecords("writeRecords", false, DEFAULT_MAX_WARC_FILE_SIZE, payloadRecords);
+        validate(written, expectedRecords);
+    }
+
+    public void testRandomAccess() throws IOException {
+        final int recordCount = 3;
+        File f = writeRecords("randomAccess", true, DEFAULT_MAX_WARC_FILE_SIZE,
+            recordCount);
+        WARCReader reader = WARCReaderFactory.get(f);
+        // Walk every record, remembering URL and offset of the second one.
+        boolean readFirst = false;
+        String url = null;
+        long offset = -1;
+        long totalRecords = 0;
+        boolean readSecond = false;
+        for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();
+                totalRecords++) {
+            WARCRecord ar = (WARCRecord)i.next();
+            if (!readFirst) {
+                readFirst = true;
+                continue;
+            }
+            if (!readSecond) {
+                url = ar.getHeader().getUrl();
+                offset = ar.getHeader().getOffset();
+                readSecond = true;
+            }
+        }
+        reader.close();
+        // Re-open at the remembered offset: the record there must have the same URL.
+        reader = WARCReaderFactory.get(f, offset);
+        ArchiveRecord ar = reader.get();
+        assertEquals(url, ar.getHeader().getUrl()); // JUnit order: expected, actual
+        ar.close();
+        reader.close();
+
+        // Get reader again.  See how iterator works with offset
+        reader = WARCReaderFactory.get(f, offset);
+        int count = 0;
+        for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext(); i.next()) {
+            count++;
+        }
+        reader.close();
+        assertEquals(totalRecords - 1, count);
+    }
+
+    public void testWriteRecordCompressed() throws IOException {
+        final int payloadRecords = 2;
+        final int expectedRecords = payloadRecords + 1; // plus the header record
+        File written = writeRecords("writeRecordCompressed", true, DEFAULT_MAX_WARC_FILE_SIZE, payloadRecords);
+        validate(written, expectedRecords);
+    }
+
+    protected WARCWriter createWARCWriter(String name, boolean compress) {
+        // The temp dir is the only output directory handed to the settings.
+        List<File> outputDirs = Arrays.asList(new File[] {getTmpDir()});
+        WARCWriterPoolSettingsData settings = new WARCWriterPoolSettingsData(
+                name,
+                "${prefix}-" + SUFFIX,
+                DEFAULT_MAX_WARC_FILE_SIZE,
+                compress,
+                outputDirs,
+                null,
+                generator);
+        return new WARCWriter(SERIAL_NO, settings);
+    }
+
+    protected static ByteArrayOutputStream getBaos(String str)
+    throws IOException {
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        baos.write(str.getBytes("UTF-8")); // fixed charset rather than the platform default
+        return baos;
+    }
+
+    protected static void writeRecord(WARCWriter w, String url,
+            String mimetype, int len, ByteArrayOutputStream baos)
+            throws IOException {
+        // Resource record: body is a snapshot of baos, declared length is len.
+        WARCRecordInfo info = new WARCRecordInfo();
+        info.setType(WARCRecordType.resource);
+        info.setUrl(url);
+        info.setCreate14DigitDate(ArchiveUtils.get14DigitDate());
+        info.setMimetype(mimetype);
+        info.setRecordId(w.generateRecordId(null));
+        info.setExtraHeaders(null);
+        info.setContentStream(new ByteArrayInputStream(baos.toByteArray()));
+        info.setContentLength((long) len);
+        info.setEnforceLength(true);
+        w.writeRecord(info);
+    }
+
+    protected int iterateRecords(WARCReader r)
+    throws IOException {
+        int count = 0;
+        for (Iterator<ArchiveRecord> i = r.iterator(); i.hasNext();) { // typed, so next() needs no cast
+            ArchiveRecord ar = i.next();
+            ar.close();
+            if (count != 0) { // the first record is not URL-checked
+                assertTrue("Unexpected URL " + ar.getHeader().getUrl(),
+                    ar.getHeader().getUrl().equals(SOME_URL));
+            }
+            count++;
+        }
+        return count;
+    }
+
+    protected WARCWriter createWithOneRecord(String name,
+            boolean compressed) throws IOException {
+        // The writer is returned without being closed here.
+        WARCWriter oneRecordWriter = createWARCWriter(name, compressed);
+        String body = getContent();
+        ByteArrayOutputStream bodyBytes = getBaos(body);
+        writeRecord(oneRecordWriter, SOME_URL, "text/html", body.length(), bodyBytes);
+        return oneRecordWriter;
+    }
+
+    public void testSpaceInURL() throws IOException {
+        // holeyUrl() returns how far the writer position moved for the bad-URL record.
+        assertEquals("Unexpected successful writing occurred", 0, holeyUrl("testSpaceInURL", false, " "));
+    }
+
+    public void testTabInURL() throws IOException {
+        // holeyUrl() returns how far the writer position moved for the bad-URL record.
+        assertEquals("Unexpected successful writing occurred", 0, holeyUrl("testTabInURL", false, "\t"));
+    }
+
+    protected long holeyUrl(String name, boolean compress, String urlInsert)
+            throws IOException {
+        WARCWriter warc = createWithOneRecord(name, compress);
+        // Measure how far the writer position moves while writing a second
+        // record whose URL has urlInsert spliced into the middle.
+        final long before = warc.getPosition();
+        String body = getContent();
+        String brokenUrl = SOME_URL + urlInsert + "/index.html";
+        writeRecord(warc, brokenUrl, "text/html", body.length(), getBaos(body));
+        final long after = warc.getPosition();
+        warc.close();
+        return after - before;
+    }
+
+ /**
+     * Write a WARC file for other tests to use.
+     * @param arcdir Directory to write to.
+     * @param compress True if file should be compressed.
+     * @return WARC written.
+ * @throws IOException
+ */
+    public static File createWARCFile(File arcdir, boolean compress)
+            throws IOException {
+        // arcdir is the sole output directory; a fresh UUIDGenerator supplies ids.
+        List<File> outputDirs = Arrays.asList(new File[] {arcdir});
+        WARCWriterPoolSettingsData settings = new WARCWriterPoolSettingsData(
+                "",
+                "test",
+                DEFAULT_MAX_WARC_FILE_SIZE,
+                compress,
+                outputDirs,
+                null,
+                new UUIDGenerator());
+        WARCWriter warc = new WARCWriter(SERIAL_NO, settings);
+        String body = getContent();
+        writeRecord(warc, SOME_URL, "text/html", body.length(), getBaos(body));
+        // Close before handing the file back.
+        warc.close();
+        return warc.getFile();
+    }
+
+// public void testSpeed() throws IOException {
+// ARCWriter writer = createArcWithOneRecord("speed", true);
+// // Add a record with a length that is too long.
+// String content = getContent();
+// final int count = 100000;
+// logger.info("Starting speed write of " + count + " records.");
+// for (int i = 0; i < count; i++) {
+// writeRecord(writer, SOME_URL, "text/html", content.length(),
+// getBaos(content));
+// }
+// writer.close();
+// logger.info("Finished speed write test.");
+// }
+
+    public void testArcRecordOffsetReads() throws Exception {
+        // Get a WARC with one record.
+        WriterPoolMember w =
+            createWithOneRecord("testArcRecordInBufferStream", true);
+        w.close();
+        // Get reader on said WARC.
+        WARCReader r = WARCReaderFactory.get(w.getFile());
+        final Iterator<ArchiveRecord> i = r.iterator(); // typed, so next() compiles without a cast
+        // Skip the first (meta) record.
+        ArchiveRecord ar = i.next();
+        i.hasNext();
+        // Now we're at the record under test.
+        ar = (WARCRecord) i.next();
+        // Now try getting some random set of bytes out of it
+        // at an odd offset (used to fail because we were
+        // doing bad math to find where in buffer to read).
+        final byte[] buffer = new byte[17];
+        final int maxRead = 4;
+        int totalRead = 0;
+        while (totalRead < maxRead) {
+            totalRead += ar.read(buffer, 13 + totalRead, maxRead - totalRead);
+            assertTrue(totalRead > 0);
+        }
+        r.close(); // release the reader, as the ARC version of this test does
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/org/archive/uid/UUIDGeneratorTest.java b/src/test/java/org/archive/uid/UUIDGeneratorTest.java
new file mode 100644
index 00000000..79e98fb6
--- /dev/null
+++ b/src/test/java/org/archive/uid/UUIDGeneratorTest.java
@@ -0,0 +1,44 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.uid;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.HashMap;
+import java.util.Map;
+
+import junit.framework.TestCase;
+
+/**
+ * @author stack
+ * @version $Revision$ $Date$
+ */
+public class UUIDGeneratorTest extends TestCase {
+    public void testQualifyRecordID() throws URISyntaxException {
+        RecordIDGenerator g = new UUIDGenerator();
+        URI uri = g.getRecordID();
+        Map<String, String> qualifiers = new HashMap<String, String>(); // typed instead of raw
+        qualifiers.put("a", "b");
+        URI nuURI = g.qualifyRecordID(uri, qualifiers);
+        assertNotSame(uri, nuURI); // identity check only: a different URI object must come back
+        qualifiers.put("c", "d");
+        nuURI = g.qualifyRecordID(nuURI, qualifiers);
+        assertNotSame(uri, nuURI);
+    }
+}
diff --git a/src/test/java/org/archive/util/FileUtilsTest.java b/src/test/java/org/archive/util/FileUtilsTest.java
new file mode 100644
index 00000000..19271435
--- /dev/null
+++ b/src/test/java/org/archive/util/FileUtilsTest.java
@@ -0,0 +1,271 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.util;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.math.LongRange;
+
+
+/**
+ * FileUtils tests.
+ *
+ * @contributor stack
+ * @contributor gojomo
+ * @version $Date$, $Revision$
+ */
+public class FileUtilsTest extends TmpDirTestCase {
+ private String srcDirName = FileUtilsTest.class.getName() + ".srcdir";
+ private File srcDirFile = null;
+ private String tgtDirName = FileUtilsTest.class.getName() + ".tgtdir";
+ private File tgtDirFile = null;
+
+ protected File zeroLengthLinesUnix;
+ protected File zeroLengthLinesWindows;
+
+ protected File smallLinesUnix;
+ protected File smallLinesWindows;
+ protected File largeLinesUnix;
+ protected File largeLinesWindows;
+ protected File nakedLastLineUnix;
+ protected File nakedLastLineWindows;
+
+
+    /**
+     * Creates the scratch source and target directories under the test tmp
+     * dir, seeds the source directory with temp files, and generates the
+     * line fixture files used by the head/tail paging tests.
+     *
+     * @throws Exception if the superclass set-up or any file creation fails
+     */
+    protected void setUp() throws Exception {
+        super.setUp();
+        this.srcDirFile = new File(getTmpDir(), srcDirName);
+        FileUtils.ensureWriteableDirectory(srcDirFile);
+        this.tgtDirFile = new File(getTmpDir(), tgtDirName);
+        FileUtils.ensureWriteableDirectory(tgtDirFile);
+        addFiles();
+
+        zeroLengthLinesUnix = setUpLinesFile("zeroLengthLinesUnix",0,0,400,IOUtils.LINE_SEPARATOR_UNIX);
+        // The temp-file prefix shows up in assertion messages through
+        // file.getName(), so the Windows fixture must carry its own name
+        // rather than the Unix one.
+        zeroLengthLinesWindows = setUpLinesFile("zeroLengthLinesWindows",0,0,400,IOUtils.LINE_SEPARATOR_WINDOWS);
+
+        smallLinesUnix = setUpLinesFile("smallLinesUnix", 0, 25, 400, IOUtils.LINE_SEPARATOR_UNIX);
+        smallLinesWindows = setUpLinesFile("smallLinesWindows", 0, 25, 400, IOUtils.LINE_SEPARATOR_WINDOWS);
+        largeLinesUnix = setUpLinesFile("largeLinesUnix", 128, 256, 5, IOUtils.LINE_SEPARATOR_UNIX);
+        largeLinesWindows = setUpLinesFile("largeLinesWindows", 128, 256, 4096, IOUtils.LINE_SEPARATOR_WINDOWS);
+
+        // NOTE(review): commons-io writeStringToFile(File, String) is
+        // documented as replacing the file's contents, so each "naked last
+        // line" fixture presumably ends up holding only "a" with no line
+        // terminator -- confirm that discarding the 401 generated lines is
+        // intended.
+        nakedLastLineUnix = setUpLinesFile("nakedLastLineUnix", 0, 50, 401, IOUtils.LINE_SEPARATOR_UNIX);
+        org.apache.commons.io.FileUtils.writeStringToFile(nakedLastLineUnix,"a");
+        nakedLastLineWindows = setUpLinesFile("nakedLastLineWindows", 0, 50, 401, IOUtils.LINE_SEPARATOR_WINDOWS);
+        org.apache.commons.io.FileUtils.writeStringToFile(nakedLastLineWindows,"a");
+    }
+
+    /**
+     * Seeds the source directory with three temp files whose names are
+     * prefixed with this test's {@code getName()} value.
+     */
+    private void addFiles() throws IOException {
+        final String prefix = this.getName();
+        addFiles(3, prefix);
+    }
+
+    /**
+     * Creates {@code howMany} temp files in the source directory.
+     *
+     * @param howMany number of files to create; nothing happens when it is
+     *        zero or negative
+     * @param baseName prefix handed to {@link File#createTempFile}
+     * @throws IOException if a temp file cannot be created
+     */
+    private void addFiles(final int howMany, final String baseName)
+    throws IOException {
+        int remaining = howMany;
+        while (remaining > 0) {
+            File.createTempFile(baseName, null, this.srcDirFile);
+            remaining--;
+        }
+    }
+
+    /**
+     * Writes a temp file holding {@code lineCount} lines made of '-'
+     * characters, handing {@code lineEnding} to commons-io
+     * {@code writeLines}.
+     *
+     * NOTE(review): the loop headers and the line-length computation were
+     * not recoverable from the damaged text of this method. The rule used
+     * here (cycling through minLineSize..maxLineSize by line index) is a
+     * reconstruction -- confirm against the upstream test before relying
+     * on exact line lengths.
+     *
+     * @param name prefix for the temp file name
+     * @param minLineSize shortest line length to generate
+     * @param maxLineSize longest line length to generate
+     * @param lineCount number of lines to write
+     * @param lineEnding line separator passed to {@code writeLines}
+     * @return the newly created file
+     * @throws IOException if the file cannot be created or written
+     */
+    private File setUpLinesFile(String name, int minLineSize, int maxLineSize, int lineCount, String lineEnding) throws IOException {
+        List<String> lines = new LinkedList<String>();
+        StringBuilder sb = new StringBuilder(maxLineSize);
+        // Zero when min == max, so every line then has exactly minLineSize chars.
+        int spread = maxLineSize - minLineSize;
+        for (int i = 0; i < lineCount; i++) {
+            sb.setLength(0);
+            int lineSize = minLineSize + (spread > 0 ? i % (spread + 1) : 0);
+            for (int j = 0; j < lineSize; j++) {
+                sb.append("-");
+            }
+            lines.add(sb.toString());
+        }
+        File file = File.createTempFile(name, null);
+        org.apache.commons.io.FileUtils.writeLines(file, lines, lineEnding);
+        return file;
+    }
+
+    /**
+     * Runs the superclass tear-down, then quietly deletes both scratch
+     * directories and every line fixture file created in set-up.
+     */
+    protected void tearDown() throws Exception {
+        super.tearDown();
+        final File[] leftovers = {
+            this.srcDirFile, this.tgtDirFile,
+            zeroLengthLinesUnix, zeroLengthLinesWindows,
+            smallLinesUnix, smallLinesWindows,
+            largeLinesUnix, largeLinesWindows,
+            nakedLastLineUnix, nakedLastLineWindows
+        };
+        for (File leftover : leftovers) {
+            org.apache.commons.io.FileUtils.deleteQuietly(leftover);
+        }
+    }
+
+    /**
+     * Deletes one of the seeded source files and checks that copying the
+     * now-missing file raises an IOException.
+     */
+    public void testCopyFile() {
+        final File[] seeded = this.srcDirFile.listFiles();
+        final File missing = seeded[0];
+        missing.delete();
+        boolean sawIOException = false;
+        try {
+            FileUtils.copyFile(missing,
+                new File(this.tgtDirFile, missing.getName()));
+        } catch (IOException expected) {
+            // This is the outcome under test; remember that it happened.
+            sawIOException = true;
+        }
+        assertTrue("Didn't get expected IOE", sawIOException);
+    }
+
+ /** Tail paging over the zero-length-lines fixture written with the Unix separator. */
+ public void testTailLinesZeroLengthUnix() throws IOException {
+ verifyTailLines(zeroLengthLinesUnix);
+ }
+
+ /** Tail paging over the zero-length-lines fixture written with the Windows separator. */
+ public void testTailLinesZeroLengthWindows() throws IOException {
+ verifyTailLines(zeroLengthLinesWindows);
+ }
+
+ /** Tail paging over the small-lines fixture written with the Unix separator. */
+ public void testTailLinesSmallUnix() throws IOException {
+ verifyTailLines(smallLinesUnix);
+ }
+
+ /** Tail paging over the large-lines fixture written with the Unix separator. */
+ public void testTailLinesLargeUnix() throws IOException {
+ verifyTailLines(largeLinesUnix);
+ }
+
+ /** Tail paging over the small-lines fixture written with the Windows separator. */
+ public void testTailLinesSmallWindows() throws IOException {
+ verifyTailLines(smallLinesWindows);
+ }
+
+ /** Tail paging over the large-lines fixture written with the Windows separator. */
+ public void testTailLinesLargeWindows() throws IOException {
+ verifyTailLines(largeLinesWindows);
+ }
+
+ /** Tail paging over the Unix "naked last line" fixture. */
+ public void testTailLinesNakedUnix() throws IOException {
+ verifyTailLines(nakedLastLineUnix);
+ }
+
+ /** Tail paging over the Windows "naked last line" fixture. */
+ public void testTailLinesNakedWindows() throws IOException {
+ verifyTailLines(nakedLastLineWindows);
+ }
+
+    /**
+     * Reads the whole file as the reference line list, then checks tail
+     * paging against it for counts 1, 5, 10, 20 and 100, first with an
+     * estimate of 80 and then with an estimate of 1.
+     */
+    @SuppressWarnings("unchecked")
+    private void verifyTailLines(File file) throws IOException {
+        List lines = org.apache.commons.io.FileUtils.readLines(file);
+        final int[] estimates = {80, 1};
+        final int[] counts = {1, 5, 10, 20, 100};
+        for (int estimate : estimates) {
+            for (int count : counts) {
+                verifyTailLines(file, lines, count, estimate);
+            }
+        }
+    }
+
+
+    /**
+     * Pages through the file from the tail with the given count and
+     * estimate and asserts that the collected lines match {@code lines}
+     * in both size and content.
+     */
+    private void verifyTailLines(File file, List lines, int count, int estimate) throws IOException {
+        final List paged = getTestTailLines(file,count,estimate);
+        final String context = file.getName()+" "+count+" "+estimate;
+        assertEquals("line counts not equal:"+context,lines.size(),paged.size());
+        assertEquals("lines not equal: "+context,lines,paged);
+    }
+
+ /**
+ * Collects the file's lines by repeatedly calling FileUtils.pagedLines
+ * with a negated count, starting at position -1 and continuing from one
+ * before the minimum of each returned range until that position drops
+ * below zero.
+ * NOTE(review): presumably a negative position/count means "page
+ * backwards from the end of the file" -- confirm against
+ * FileUtils.pagedLines.
+ */
+ private List getTestTailLines(File file, int count, int estimate) throws IOException {
+ long pos = -1;
+ List testLines = new LinkedList();
+ do {
+ List returnedLines = new LinkedList();
+ LongRange range = FileUtils.pagedLines(file,pos,-count,returnedLines,estimate);
+ // Each page is reversed before it is appended; the accumulated list
+ // is reversed once more after the loop.
+ Collections.reverse(returnedLines);
+ testLines.addAll(returnedLines);
+ // Next request starts just before the range this page covered.
+ pos = range.getMinimumLong()-1;
+ } while (pos>=0);
+ Collections.reverse(testLines);
+ return testLines;
+ }
+
+ /** Head paging over the zero-length-lines fixture written with the Unix separator. */
+ public void testHeadLinesZeroLengthUnix() throws IOException {
+ verifyHeadLines(zeroLengthLinesUnix);
+ }
+
+ /** Head paging over the zero-length-lines fixture written with the Windows separator. */
+ public void testHeadLinesZeroLengthWindows() throws IOException {
+ verifyHeadLines(zeroLengthLinesWindows);
+ }
+
+ /** Head paging over the small-lines fixture written with the Unix separator. */
+ public void testHeadLinesSmallUnix() throws IOException {
+ verifyHeadLines(smallLinesUnix);
+ }
+
+ /** Head paging over the large-lines fixture written with the Unix separator. */
+ public void testHeadLinesLargeUnix() throws IOException {
+ verifyHeadLines(largeLinesUnix);
+ }
+
+ /** Head paging over the small-lines fixture written with the Windows separator. */
+ public void testHeadLinesSmallWindows() throws IOException {
+ verifyHeadLines(smallLinesWindows);
+ }
+
+ /** Head paging over the large-lines fixture written with the Windows separator. */
+ public void testHeadLinesLargeWindows() throws IOException {
+ verifyHeadLines(largeLinesWindows);
+ }
+
+ /** Head paging over the Unix "naked last line" fixture. */
+ public void testHeadLinesNakedUnix() throws IOException {
+ verifyHeadLines(nakedLastLineUnix);
+ }
+
+ /** Head paging over the Windows "naked last line" fixture. */
+ public void testHeadLinesNakedWindows() throws IOException {
+ verifyHeadLines(nakedLastLineWindows);
+ }
+
+
+    /**
+     * Reads the whole file as the reference line list, then checks head
+     * paging against it for counts 1, 5, 10, 20 and 100, first with an
+     * estimate of 80 and then with an estimate of 1.
+     */
+    @SuppressWarnings("unchecked")
+    private void verifyHeadLines(File file) throws IOException {
+        List lines = org.apache.commons.io.FileUtils.readLines(file);
+        final int[] estimates = {80, 1};
+        final int[] counts = {1, 5, 10, 20, 100};
+        for (int estimate : estimates) {
+            for (int count : counts) {
+                verifyHeadLines(file, lines, count, estimate);
+            }
+        }
+    }
+
+
+    /**
+     * Pages through the file from the head with the given count and
+     * estimate and asserts that the collected lines match {@code lines}
+     * in both size and content.
+     */
+    private void verifyHeadLines(File file, List lines, int count, int estimate) throws IOException {
+        final List paged = getTestHeadLines(file,count,estimate);
+        final String context = file.getName()+" "+count+" "+estimate;
+        assertEquals("line counts not equal:"+context,lines.size(),paged.size());
+        assertEquals("lines not equal: "+context,lines,paged);
+    }
+
+ private List getTestHeadLines(File file, int count, int estimate) throws IOException {
+ long pos = 0;
+ List testLines = new LinkedList();
+ do {
+ LongRange range = FileUtils.pagedLines(file,pos,count,testLines,estimate);
+ pos = range.getMaximumLong();
+ } while (pos m = am.asMap();
+ logger.fine(m.toString());
+ }
+
+    /**
+     * Checks that the empty ANVL record encodes to exactly two bytes:
+     * a carriage return followed by a line feed.
+     */
+    public void testEmptyRecord() throws Exception {
+        byte [] b = ANVLRecord.EMPTY_ANVL_RECORD.getUTF8Bytes();
+        // JUnit's assertEquals takes (expected, actual); with the arguments
+        // in that order a failure message reports the right value as
+        // "expected".
+        assertEquals(2, b.length);
+        assertEquals('\r', b[0]);
+        assertEquals('\n', b[1]);
+    }
+
+    /**
+     * Checks that a label containing a newline is rejected with an
+     * IllegalArgumentException, then adds a label whose value contains a
+     * newline, which must not throw.
+     */
+    public void testFolding() throws Exception {
+        ANVLRecord am = new ANVLRecord();
+        boolean rejected = false;
+        try {
+            am.addLabel("Label with \n in it");
+        } catch (IllegalArgumentException expected) {
+            // This is the outcome under test; remember that it happened.
+            rejected = true;
+        }
+        assertTrue(rejected);
+        am.addLabelValue("label", "value with \n in it");
+    }
+
+    /**
+     * Loads three hand-written ANVL byte sequences and checks the string
+     * form of the first element of the first and third records.
+     *
+     * @throws UnsupportedEncodingException if ISO-8859-1 is unavailable
+     * @throws IOException if loading a record fails
+     */
+    public void testParse() throws UnsupportedEncodingException, IOException {
+        String record = "   a: b\r\n#c#\r\nc:d\r\n \t\t\r\t\n\te" +
+                "\r\nx:\r\n  # z\r\n\r\n";
+        ANVLRecord r = ANVLRecord.load(new ByteArrayInputStream(
+            record.getBytes("ISO-8859-1")));
+        logger.fine(r.toString());
+        // JUnit's assertEquals takes (expected, actual); keeping that order
+        // makes failure messages label the values correctly.
+        assertEquals("a: b", r.get(0).toString());
+        record = "   a: b\r\n\r\nsdfsdsdfds";
+        r = ANVLRecord.load(new ByteArrayInputStream(
+            record.getBytes("ISO-8859-1")));
+        logger.fine(r.toString());
+        record = "x:\r\n  # z\r\ny:\r\n\r\n";
+        r = ANVLRecord.load(new ByteArrayInputStream(
+            record.getBytes("ISO-8859-1")));
+        logger.fine(r.toString());
+        assertEquals("x:", r.get(0).toString());
+    }
+
+    /**
+     * Loads a multi-line sample record from ISO-8859-1 bytes and logs its
+     * string form; passes as long as loading does not throw.
+     */
+    public void testExampleParse()
+    throws UnsupportedEncodingException, IOException {
+        final String sample = "entry:\t\t\r\n# first ###draft\r\n" +
+            "who:\tGilbert, W.S. | Sullivan, Arthur\r\n" +
+            "what:\tThe Yeoman of\r\n" +
+            "\t\tthe Guard\r\n" +
+            "when/created:\t 1888\r\n\r\n";
+        final byte[] encoded = sample.getBytes("ISO-8859-1");
+        final ANVLRecord parsed = ANVLRecord.load(new ByteArrayInputStream(encoded));
+        logger.fine(parsed.toString());
+    }
+
+    /**
+     * Loads a sample whose first label contains a '#' character and logs
+     * the parsed record; passes as long as loading does not throw.
+     */
+    public void testPoundLabel()
+    throws UnsupportedEncodingException, IOException {
+        final String sample = "ent#ry:\t\t\r\n# first ###draft\r\n" +
+            "who:\tGilbert, W.S. | Sullivan, Arthur\r\n" +
+            "what:\tThe Yeoman of\r\n" +
+            "\t\tthe Guard\r\n" +
+            "when/created:\t 1888\r\n\r\n";
+        logger.fine(ANVLRecord.load(sample).toString());
+    }
+
+    /**
+     * Checks that loading a sample whose first label contains a bare
+     * newline raises an IllegalArgumentException.
+     */
+    public void testNewlineLabel()
+    throws UnsupportedEncodingException, IOException {
+        final String sample = "ent\nry:\t\t\r\n# first ###draft\r\n" +
+            "who:\tGilbert, W.S. | Sullivan, Arthur\r\n" +
+            "what:\tThe Yeoman of\r\n" +
+            "\t\tthe Guard\r\n" +
+            "when/created:\t 1888\r\n\r\n";
+        boolean rejected = false;
+        try {
+            ANVLRecord.load(sample);
+        } catch (IllegalArgumentException expected) {
+            // This is the outcome under test; remember that it happened.
+            rejected = true;
+        }
+        assertTrue(rejected);
+    }
+}
+}
From b04f5d82604245461b6a802f1962d86e3d899e98 Mon Sep 17 00:00:00 2001
From: Lauren Ko
Date: Thu, 9 Mar 2017 11:32:03 -0600
Subject: [PATCH 003/189] Updating CHANGES.md
---
CHANGES.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/CHANGES.md b/CHANGES.md
index fee29e16..767881ec 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,6 @@
1.1.8
-----
+* [Move unit tests over from heritrix3 to webarchive-commons](https://github.com/iipc/webarchive-commons/issues/25)
* [Strip empty port via URLParser](https://github.com/iipc/webarchive-commons/pull/69/)
* [Use CharsetDetector to guess encoding of HTML documents](https://github.com/iipc/webarchive-commons/pull/68/)
* [Fix last header was lost if LF LF](https://github.com/iipc/webarchive-commons/pull/65/)
From b655796770eb967c931d656b1c80d4967f91e7fc Mon Sep 17 00:00:00 2001
From: Lauren Ko
Date: Tue, 21 Mar 2017 14:20:54 -0500
Subject: [PATCH 004/189] Updating change log.
---
CHANGES.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/CHANGES.md b/CHANGES.md
index 767881ec..ccdc1ce7 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,6 @@
1.1.8
-----
+* [Improve HTML link extraction](https://github.com/iipc/webarchive-commons/pull/72)
* [Move unit tests over from heritrix3 to webarchive-commons](https://github.com/iipc/webarchive-commons/issues/25)
* [Strip empty port via URLParser](https://github.com/iipc/webarchive-commons/pull/69/)
* [Use CharsetDetector to guess encoding of HTML documents](https://github.com/iipc/webarchive-commons/pull/68/)
From aee6ff55bfcaa5a9e15092f8c3b1e40ec9faaf87 Mon Sep 17 00:00:00 2001
From: John Erik Halse
Date: Tue, 2 May 2017 12:25:28 +0200
Subject: [PATCH 005/189] [maven-release-plugin] prepare release
webarchive-commons-1.1.8
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 24780063..63909b90 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
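   <groupId>org.netpreserve.commons</groupId>
   <artifactId>webarchive-commons</artifactId>
-  <version>1.1.8-SNAPSHOT</version>
+  <version>1.1.8</version>
   <packaging>jar</packaging>
   <name>webarchive-commons</name>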
From dfe1f62e416f6a881fe15a2544449fff44dd1e51 Mon Sep 17 00:00:00 2001
From: John Erik Halse
Date: Tue, 2 May 2017 12:25:35 +0200
Subject: [PATCH 006/189] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 63909b90..23953c06 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
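   <groupId>org.netpreserve.commons</groupId>
   <artifactId>webarchive-commons</artifactId>
-  <version>1.1.8</version>
+  <version>1.1.9-SNAPSHOT</version>
   <packaging>jar</packaging>
   <name>webarchive-commons</name>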
From cf34a3e13c09cfa4a1412492cfcf3503df698931 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Fri, 28 Apr 2017 22:41:56 +0200
Subject: [PATCH 007/189] Do not add value of preceding HTTP header field if
there is no value (or only white space)
---
.../archive/format/http/HttpHeaderParser.java | 4 ++--
.../format/http/HttpResponseParserTest.java | 24 +++++++++++++++++++
2 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/format/http/HttpHeaderParser.java b/src/main/java/org/archive/format/http/HttpHeaderParser.java
index d63ec405..bee3c28b 100755
--- a/src/main/java/org/archive/format/http/HttpHeaderParser.java
+++ b/src/main/java/org/archive/format/http/HttpHeaderParser.java
@@ -301,8 +301,9 @@ public ParseState handleByte(byte b, HttpHeaderParser parser) throws HttpParseEx
if(isLWSP(b)) {
return parser.postColonState;
}
+ // reset previous value also in case the header value is empty
+ parser.setValueStartIdx();
if(b == CR) {
- // TODO: THINK more...
parser.valuePreCRState = parser.postColonState;
return parser.valuePostCRState;
}
@@ -310,7 +311,6 @@ public ParseState handleByte(byte b, HttpHeaderParser parser) throws HttpParseEx
// TODO: this is lax, is LFLF an OK terminator?
return parser.lineStartState;
}
- parser.setValueStartIdx();
parser.addValueByte(b);
return parser.valueState;
}
diff --git a/src/test/java/org/archive/format/http/HttpResponseParserTest.java b/src/test/java/org/archive/format/http/HttpResponseParserTest.java
index c0d13230..ea076a69 100644
--- a/src/test/java/org/archive/format/http/HttpResponseParserTest.java
+++ b/src/test/java/org/archive/format/http/HttpResponseParserTest.java
@@ -57,4 +57,28 @@ public void testParseWithLf() throws IOException {
}
+    /**
+     * Parses a response whose second header ("Server") has only white
+     * space after the colon and checks that it does not report the value
+     * of the preceding Content-Type header, and that the body is intact.
+     *
+     * @throws IOException if reading the message or the body fails
+     */
+    public void testParseEmptyHeaderField() throws IOException {
+        HttpResponseParser parser = new HttpResponseParser();
+        String message = "200 OK\r\nContent-Type: text/plain\r\nServer: \r\n\r\nHi there";
+        try {
+            HttpResponse response =
+                parser.parse(new ByteArrayInputStream(message.getBytes(IAUtils.UTF8)));
+            assertNotNull(response);
+            HttpHeaders headers = response.getHeaders();
+            assertNotNull(headers);
+            assertEquals(2, headers.size());
+            HttpHeader header = headers.get(1);
+            assertEquals("Server",header.getName());
+            assertFalse("text/plain".equals(header.getValue()));
+            TestUtils.assertStreamEquals(response, "Hi there".getBytes(IAUtils.UTF8));
+        } catch (HttpParseException e) {
+            // Put the parser's complaint into the failure itself instead of
+            // printing a stack trace and failing without a message.
+            fail("Unexpected HttpParseException: " + e);
+        }
+    }
+
}
From bd08143577ea35cb48047a08b2bb67e806992cc2 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 29 Sep 2016 11:44:18 +0200
Subject: [PATCH 008/189] Extract also `property` attributes of HTML meta
elements, this fixes #67
---
.../java/org/archive/resource/html/ExtractingParseObserver.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index 826851e0..52989455 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -406,7 +406,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
private static class MetaTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
- ArrayList l = getAttrList(node,"name","rel","content","http-equiv");
+ ArrayList l = getAttrList(node,"name","rel","content","http-equiv","property");
if(l != null) {
data.addMeta(l);
}
From 4077670acca3f0d2958d926692cdb3a6b29428ca Mon Sep 17 00:00:00 2001
From: Lauren Ko
Date: Tue, 2 May 2017 15:15:06 -0500
Subject: [PATCH 009/189] Fix HTTP-Response-Metadata for wget WARCs. Changes
came from
https://github.com/commoncrawl/ia-web-commons/commit/58e85a60d75707da55ed499f836e57d49347484a
---
.../org/archive/extract/ExtractingResourceFactoryMapper.java | 5 ++++-
src/main/java/org/archive/format/warc/WARCConstants.java | 4 +++-
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java
index ad10be40..0afe16fb 100644
--- a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java
+++ b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java
@@ -153,7 +153,10 @@ private boolean isWARCInfoResource(MetaData envelope) {
private boolean isHTTPResponseWARCResource(MetaData envelope) {
return childFieldEquals(envelope,WARC_HEADER_METADATA,
WARCConstants.CONTENT_TYPE,
- WARCConstants.HTTP_RESPONSE_MIMETYPE);
+ WARCConstants.HTTP_RESPONSE_MIMETYPE)
+ || childFieldEquals(envelope,WARC_HEADER_METADATA,
+ WARCConstants.CONTENT_TYPE,
+ WARCConstants.HTTP_RESPONSE_MIMETYPE_NS);
}
private boolean isWARCJSONResource(MetaData envelope) {
return childFieldEquals(envelope,WARC_HEADER_METADATA,
diff --git a/src/main/java/org/archive/format/warc/WARCConstants.java b/src/main/java/org/archive/format/warc/WARCConstants.java
index 93a81f96..504dc380 100644
--- a/src/main/java/org/archive/format/warc/WARCConstants.java
+++ b/src/main/java/org/archive/format/warc/WARCConstants.java
@@ -209,7 +209,9 @@ enum WARCRecordType {
"application/http; msgtype=request";
public static final String HTTP_RESPONSE_MIMETYPE =
"application/http; msgtype=response";
-
+ public static final String HTTP_RESPONSE_MIMETYPE_NS =
+ "application/http;msgtype=response"; // wget does this
+
public static final String FTP_CONTROL_CONVERSATION_MIMETYPE =
"text/x-ftp-control-conversation";
From 3bba7e489b7d946eea83344e2150faebe0b35ed2 Mon Sep 17 00:00:00 2001
From: Lauren Ko
Date: Tue, 2 May 2017 15:41:23 -0500
Subject: [PATCH 010/189] Update with fixes for 1.1.9
---
CHANGES.md | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/CHANGES.md b/CHANGES.md
index ccdc1ce7..1ba5c1de 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,9 @@
+1.1.9
+-----
+* [Extract `property` attributes of HTML meta elements](https://github.com/iipc/webarchive-commons/pull/75)
+* [Do not add value of preceding HTTP header field if there is no value](https://github.com/iipc/webarchive-commons/pull/74)
+* [Fix WAT records corresponding to response records of Wget generated WARCs](https://github.com/iipc/webarchive-commons/pull/74)
+
1.1.8
-----
* [Improve HTML link extraction](https://github.com/iipc/webarchive-commons/pull/72)
From 4101f7e39cbdcc508a936faf8b519e68258b9639 Mon Sep 17 00:00:00 2001
From: Naomi Dushay
Date: Tue, 8 Aug 2017 16:08:43 -0700
Subject: [PATCH 011/189] use commons-collections v3.2.2 to avoid v3.2.1
vulnerability
---
pom.xml | 29 +++++++++++++++++++++--------
1 file changed, 21 insertions(+), 8 deletions(-)
diff --git a/pom.xml b/pom.xml
index 23953c06..8373cdad 100644
--- a/pom.xml
+++ b/pom.xml
@@ -72,7 +72,7 @@
guava
17.0
-
+
org.json
json
@@ -89,12 +89,12 @@
juniversalchardet
1.0.3
-
+
commons-httpclient
commons-httpclient
3.1
-
+
org.apache.hadoop
@@ -128,12 +128,12 @@
tomcat
jasper-compiler
-
+
hsqldb
hsqldb
-
-
+
+
@@ -160,7 +160,7 @@
libidn
1.15
-
+
it.unimi.dsi
dsiutils
2.0.12
@@ -170,13 +170,26 @@
ch.qos.logback
logback-classic
+
+
+ commons-collections
+ commons-collections
+
+
+
+
+ commons-collections
+ commons-collections
+ 3.2.2
+
+
org.apache.httpcomponents
httpcore
4.3
-
+
joda-time
joda-time
From 988bec707c27a01333becfc3bd502af4441ea1e1 Mon Sep 17 00:00:00 2001
From: Lauren Ko
Date: Wed, 9 Aug 2017 10:57:28 -0500
Subject: [PATCH 012/189] Update CHANGES.md for PR 77
---
CHANGES.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/CHANGES.md b/CHANGES.md
index 1ba5c1de..dcb598d9 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,6 @@
1.1.9
-----
+* [Use commons-collections v3.2.2 to avoid v3.2.1 vulnerability](https://github.com/iipc/webarchive-commons/pull/77)
* [Extract `property` attributes of HTML meta elements](https://github.com/iipc/webarchive-commons/pull/75)
* [Do not add value of preceding HTTP header field if there is no value](https://github.com/iipc/webarchive-commons/pull/74)
* [Fix WAT records corresponding to response records of Wget generated WARCs](https://github.com/iipc/webarchive-commons/pull/74)
From 2e8cdea3d245c11e1ea3a2a6153c0038479aef12 Mon Sep 17 00:00:00 2001
From: nruest
Date: Tue, 7 May 2019 13:23:28 -0400
Subject: [PATCH 013/189] [maven-release-plugin] prepare release
webarchive-commons-1.1.9
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 8373cdad..833f42c3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
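   <groupId>org.netpreserve.commons</groupId>
   <artifactId>webarchive-commons</artifactId>
-  <version>1.1.9-SNAPSHOT</version>
+  <version>1.1.9</version>
   <packaging>jar</packaging>
   <name>webarchive-commons</name>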
From da029db2ba89205b93a5291ed18b9c69155271bb Mon Sep 17 00:00:00 2001
From: nruest
Date: Tue, 7 May 2019 13:23:34 -0400
Subject: [PATCH 014/189] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 833f42c3..1cbeb99a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
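   <groupId>org.netpreserve.commons</groupId>
   <artifactId>webarchive-commons</artifactId>
-  <version>1.1.9</version>
+  <version>1.1.10-SNAPSHOT</version>
   <packaging>jar</packaging>
   <name>webarchive-commons</name>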
From 723b18a35f8be786cb073282b5ea88b5d8c643ce Mon Sep 17 00:00:00 2001
From: nruest
Date: Tue, 7 May 2019 13:56:18 -0400
Subject: [PATCH 015/189] Update TravisCI config; resolves #82.
- Test Oracle Java 8
- Test OpenJDK Java 8
- Use trusty
- Require sudo for OpenJDK7
- Remove Oracle Java 7 (it's gone!)
- Remove mvn site from the build process since there is no javadoc site
(at least that I can tell)
---
.travis.yml | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index 0dfd3f7f..54daf83b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,13 @@
+dist: trusty
language: java
+# sudo required for OpenJDK7 support per:
+# https://github.com/travis-ci/travis-ci/issues/7884#issuecomment-309689557
+sudo: required
jdk:
- - oraclejdk7
+ - openjdk7
+ - oraclejdk8
+ - openjdk8
before_install:
- "git clone https://github.com/iipc/travis.git target/travis"
@@ -11,8 +17,8 @@ before_script:
- "export MAVEN_OPTS=-Xmx512m"
- "ulimit -u 2048"
-script:
- - "target/travis/deploy-if.sh"
+script:
+ - mvn install -B -V
# whitelist in the master branch only
branches:
@@ -23,4 +29,3 @@ env:
global:
- secure: "qDKjVdoe4Qcz4WfXiQydU7tyl51T62FUJrjqu4FUPBcgeQhFQiggwhpaE6xCOzOpxbsuBi2R1c8gMQf5esE5iDL5jZMu+kz++dYbuzMTd13ttvZWMW5wRPH0H8iHk609FP/RDtVKKBr7WO0JvvIAZEhWNHZrLXBrrKgdTey171g="
- secure: "FXGBKJNP9X7ePJfS4eYTZtoFo4RT1sxor34XxncSJr7uV6ggtZb4B4WNd16IlLcDk6E32sx8YoWdltaOGwQ5Vg/kux5Ko/wKZCoccS018Ln1bRT86dD1KoPY34rGoNJVQxe7J/1MPqpBKwmi2XCKfzpsEh3W7bbIqg8w9MEOOZA="
-
From 79aed910b44510294367a4acf4f3e6376b1c62c0 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 23 Aug 2017 17:04:52 +0200
Subject: [PATCH 016/189] ExtractingParseObserver: get links from onClick
attributes - extract links from JavaScript code snippets in onClick
attributes of INPUT and DIV elements
---
.../html/ExtractingParseObserver.java | 40 +++++++++++++++++-
.../html/ExtractingParseObserverTest.java | 10 +++++
.../resource/html/link-extraction-test.warc | 42 +++++++++++++++++++
3 files changed, 91 insertions(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index 52989455..e4fa83c7 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -39,6 +39,15 @@ public class ExtractingParseObserver implements ParseObserver {
protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString);
+ protected static String jsOnClickUrl1PatString =
+ "(?i)^(?:javascript:)?(?:(?:window|top|document|self|parent)\\.)?location(?:\\.href)?\\s*=\\s*('|')([^'\"]{3,256})\\1$";
+ protected static String jsOnClickUrl2PatString =
+ "(?i)^(?:javascript:)?(?:window|parent)\\.open\\((['\"]|')([^\"']{3,256}?)\\1[,)]";
+ protected static Pattern[] jsOnClickUrlPatterns = {
+ Pattern.compile(jsOnClickUrl1PatString),
+ Pattern.compile(jsOnClickUrl2PatString)
+ };
+
private final static int MAX_TEXT_LEN = 100;
private static final String PATH = "path";
@@ -51,6 +60,7 @@ public class ExtractingParseObserver implements ParseObserver {
extractors.put("APPLET", new AppletTagExtractor());
extractors.put("AREA", new AreaTagExtractor());
extractors.put("BASE", new BaseTagExtractor());
+ extractors.put("DIV", new DivTagExtractor());
extractors.put("EMBED", new EmbedTagExtractor());
extractors.put("FORM", new FormTagExtractor());
extractors.put("FRAME", new FrameTagExtractor());
@@ -268,7 +278,20 @@ private static void addHrefWithAttrs(HTMLMetaData data, TagNode node,
if(l != null) {
data.addHref(l);
}
- }
+ }
+
+    /**
+     * Runs every pattern in {@code jsOnClickUrlPatterns} over the node's
+     * {@code onclick} attribute and records each URL that is found as an
+     * href whose path is built from the tag name and "onclick".
+     * Does nothing when the node has no {@code onclick} attribute.
+     */
+    private static void addHrefsOnclick(HTMLMetaData data, TagNode node) {
+        final String handler = node.getAttribute("onclick");
+        if (handler == null) {
+            return;
+        }
+        final String path = makePath(node.getTagName(), "onclick");
+        for (int i = 0; i < jsOnClickUrlPatterns.length; i++) {
+            final String url = patternJSExtract(jsOnClickUrlPatterns[i], handler);
+            if (url == null) {
+                continue;
+            }
+            data.addHref(PATH, path, "url", url);
+        }
+    }
private interface TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs);
@@ -330,6 +353,12 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
+ /** Tag extractor registered for DIV: looks only at the onclick attribute. */
+ private static class DivTagExtractor implements TagExtractor {
+ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+ // obs is not used here; link discovery is delegated to the shared onclick helper.
+ addHrefsOnclick(data,node);
+ }
+ }
+
private static class EmbedTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src");
@@ -386,6 +415,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
private static class InputTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src","formaction");
+ addHrefsOnclick(data,node);
}
}
@@ -450,4 +480,12 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten
}
}
}
+
+    /**
+     * Returns capture group 2 of the first place {@code pattern} is found
+     * in {@code content}, or {@code null} when it is not found at all.
+     */
+    private static String patternJSExtract(Pattern pattern, String content) {
+        final Matcher found = pattern.matcher(content);
+        return found.find() ? found.group(2) : null;
+    }
}
diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
index 8f690a06..4828ad64 100644
--- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
+++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -263,6 +263,16 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
{"http://www.your-domain.com/your-page.html", "DIV@/data-href"}
};
checkLinks(extractor.getNext(), fbSocialLinks);
+ String[][] onClickLinks = {
+ {"webpage.html", "DIV@/onclick"},
+ {"index.html", "INPUT@/onclick"},
+ {"http://www.x.com/", "INPUT@/onclick"},
+ {"button-child.php", "INPUT@/onclick"},
+ {"http://example.com/", "INPUT@/onclick"},
+ {"http://example.com/location/href/1.html", "INPUT@/onclick"},
+ {"http://example.com/location/href/2.html", "INPUT@/onclick"}
+ };
+ checkLinks(extractor.getNext(), onClickLinks);
}
}
diff --git a/src/test/resources/org/archive/resource/html/link-extraction-test.warc b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
index ab0e54c8..1a30598e 100644
--- a/src/test/resources/org/archive/resource/html/link-extraction-test.warc
+++ b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
@@ -318,3 +318,45 @@ Content-Type: text/html
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2017-08-23T13:54:59Z
+Content-Type: application/http;msgtype=response
+Content-Length: 1279
+
+HTTP/1.1 200 OK
+Date: Wed, 23 Aug 2017 13:54:59 GMT
+Server: Apache/2.4.18 (Ubuntu)
+Last-Modified: Wed, 23 Aug 2017 13:54:03 GMT
+ETag: "3ca-5576c0b718ab3"
+Accept-Ranges: bytes
+Content-Length: 971
+Vary: Accept-Encoding
+Keep-Alive: timeout=5, max=100
+Connection: Keep-Alive
+Content-Type: text/html
+
+
+
+Test Extraction of URLs from INPUT onClick Attributes
+
+
+
+
+ Click to load webpage
+
+
+
+
+
+
+
+
From 26b1e7af27abec102ab36faf6a786dfedf9436fd Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 23 Aug 2017 14:48:05 +0200
Subject: [PATCH 017/189] ExtractingParseObserver: extract rel, hreflang and
type attributes - add "rel" attribute to A and AREA links - add attributes
"hreflang" and "type" (MIME type) to A@/href links
---
.../html/ExtractingParseObserver.java | 19 +++++++++++++++++--
1 file changed, 17 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index 52989455..a487fd34 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -284,7 +284,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
l.add(makePath("A","href"));
l.add("url");
l.add(url);
- for(String a : new String[] {"target","alt","title"}) {
+ for(String a : new String[] {"target","alt","title","rel","hreflang","type"}) {
String v = node.getAttribute(a);
if(v != null) {
l.add(a);
@@ -311,7 +311,22 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
private static class AreaTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
- addBasicHrefs(data,node,"href");
+ String url = node.getAttribute("href");
+ if(url != null) {
+ ArrayList l = new ArrayList();
+ l.add(PATH);
+ l.add(makePath("AREA","href"));
+ l.add("url");
+ l.add(url);
+ for(String a : new String[] {"rel"}) {
+ String v = node.getAttribute(a);
+ if(v != null) {
+ l.add(a);
+ l.add(v);
+ }
+ }
+ data.addHref(l);
+ }
}
}
From a2cc42cac2777d06ab40e09811cdc883773775b9 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 11 Jun 2020 14:24:03 +0200
Subject: [PATCH 018/189] WAT extractor: do not fail on missing WARC-Filename
in warcinfo record, fixes #88 - do not throw IOException if there is no
WARC-Filename in warcinfo record - write metadata record (corresponding to
warcinfo) without WARC-Target-URI
---
src/main/java/org/archive/extract/WATExtractorOutput.java | 2 +-
src/main/java/org/archive/format/warc/WARCRecordWriter.java | 5 ++++-
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java
index 3bcfa924..4b5f72ed 100644
--- a/src/main/java/org/archive/extract/WATExtractorOutput.java
+++ b/src/main/java/org/archive/extract/WATExtractorOutput.java
@@ -151,7 +151,7 @@ private void writeWARC(OutputStream recOut, MetaData md) throws IOException {
String warcType = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Type");
String targetURI;
if(warcType.equals("warcinfo")) {
- targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Filename");
+ targetURI = JSONUtils.extractSingle(md, "Envelope.WARC-Header-Metadata.WARC-Filename");
} else {
targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI");
}
diff --git a/src/main/java/org/archive/format/warc/WARCRecordWriter.java b/src/main/java/org/archive/format/warc/WARCRecordWriter.java
index 0aab83b7..3278b289 100644
--- a/src/main/java/org/archive/format/warc/WARCRecordWriter.java
+++ b/src/main/java/org/archive/format/warc/WARCRecordWriter.java
@@ -88,7 +88,10 @@ public void writeJSONMetadataRecord( OutputStream out,
{
HttpHeaders headers = new HttpHeaders();
headers.add(HEADER_KEY_TYPE, WARCRecordType.metadata.name());
- headers.add(HEADER_KEY_URI, targetURI);
+ if (targetURI != null) {
+ // WARC-Target-URI is optional in metadata records
+ headers.add(HEADER_KEY_URI, targetURI);
+ }
headers.add(HEADER_KEY_DATE, DateUtils.getLog14Date(originalDate));
headers.add(HEADER_KEY_ID, makeRecordId());
headers.add(HEADER_KEY_REFERS_TO, origRecordId);
From 04e10397b9137a36812c17276826bc60d1a37ede Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Mon, 15 Jun 2020 13:29:25 +0200
Subject: [PATCH 019/189] Update change log to include #85, #86 and #89
---
CHANGES.md | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/CHANGES.md b/CHANGES.md
index dcb598d9..bf985ada 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,10 @@
+1.1.10
+------
+* [WAT extractor: do not fail on missing WARC-Filename in warcinfo record](https://github.com/iipc/webarchive-commons/pull/89)
+* [ExtractingParseObserver: extract rel, hreflang and type attributes](https://github.com/iipc/webarchive-commons/pull/86)
+* [ExtractingParseObserver: extract links from onClick attributes](https://github.com/iipc/webarchive-commons/pull/85)
+* [Update TravisCI config](https://github.com/iipc/webarchive-commons/pull/83)
+
1.1.9
-----
* [Use commons-collections v3.2.2 to avoid v3.2.1 vulnerability](https://github.com/iipc/webarchive-commons/pull/77)
From 9041ff4e96f6554658742affe490223dc0241d06 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 13 Oct 2020 01:28:48 +0000
Subject: [PATCH 020/189] Bump junit from 3.8.1 to 4.13.1
Bumps [junit](https://github.com/junit-team/junit4) from 3.8.1 to 4.13.1.
- [Release notes](https://github.com/junit-team/junit4/releases)
- [Changelog](https://github.com/junit-team/junit4/blob/main/doc/ReleaseNotes4.13.1.md)
- [Commits](https://github.com/junit-team/junit4/commits/r4.13.1)
Signed-off-by: dependabot[bot]
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 1cbeb99a..5ca7e1a3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -64,7 +64,7 @@
junit
junit
- 3.8.1
+ 4.13.1
From c2530d77b73838c31f4e83f2be941ec61032ebb2 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 16 Mar 2021 11:58:11 +0100
Subject: [PATCH 021/189] Fix InterruptibleCharSequenceTest
(testInterruptibility) to run on JDK 11 - if thread running the regexp
matching is already finished after the initial/current sleeping time, rerun
the test again with a shorter sleeping time until the expected
RuntimeException is hit
---
.../util/InterruptibleCharSequenceTest.java | 26 +++++++++++++------
1 file changed, 18 insertions(+), 8 deletions(-)
diff --git a/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java b/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java
index a3a5f180..8b5c5d1b 100644
--- a/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java
+++ b/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java
@@ -107,14 +107,24 @@ public void testNoninterruptible() throws InterruptedException {
}
public void testInterruptibility() throws InterruptedException {
- BlockingQueue