From 11579c2baab0db08f14341f70b848353eed17269 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 22 Feb 2017 13:11:13 +0100 Subject: [PATCH 001/189] Improve HTML link extraction - add extractors for more elements which can take URLs as attribute values, add missing attributes - generalize extraction of "global" attributes (`background`) - add custom data attributes frequently used for linking (`data-href`, `data-uri`) - add unit test to cover link extraction --- .../html/ExtractingParseObserver.java | 79 ++++- .../html/ExtractingParseObserverTest.java | 161 +++++++++ .../resource/html/link-extraction-test.warc | 320 ++++++++++++++++++ 3 files changed, 551 insertions(+), 9 deletions(-) create mode 100644 src/test/resources/org/archive/resource/html/link-extraction-test.warc diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index deb8c8c0..826851e0 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -2,12 +2,17 @@ import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; +import java.util.Locale; import java.util.Map; +import java.util.Set; import java.util.Stack; +import java.util.Vector; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.archive.format.text.html.ParseObserver; +import org.htmlparser.Attribute; import org.htmlparser.nodes.RemarkNode; import org.htmlparser.nodes.TagNode; import org.htmlparser.nodes.TextNode; @@ -36,11 +41,10 @@ public class ExtractingParseObserver implements ParseObserver { private final static int MAX_TEXT_LEN = 100; -// private static String GLOBAL_ATTR[] = {"background"}; - private static final String PATH = "path"; private static final String PATH_SEPARATOR = "@/"; - private final static Map extractors; + private static final Map extractors; + private static final Set 
globalHrefAttributes; static { extractors = new HashMap(); extractors.put("A", new AnchorTagExtractor()); @@ -57,6 +61,22 @@ public class ExtractingParseObserver implements ParseObserver { extractors.put("META", new MetaTagExtractor()); extractors.put("OBJECT", new ObjectTagExtractor()); extractors.put("SCRIPT", new ScriptTagExtractor()); + extractors.put("Q", new QuotationLinkTagExtractor()); + extractors.put("BLOCKQUOTE", new QuotationLinkTagExtractor()); + extractors.put("DEL", new QuotationLinkTagExtractor()); + extractors.put("INS", new QuotationLinkTagExtractor()); + // HTML5: + extractors.put("BUTTON", new ButtonTagExtractor()); + extractors.put("MENUITEM", new MenuitemTagExtractor()); + extractors.put("VIDEO", new EmbedVideoTagExtractor()); + extractors.put("AUDIO", new EmbedTagExtractor()); + extractors.put("TRACK", new EmbedTagExtractor()); + extractors.put("SOURCE", new EmbedTagExtractor()); + + globalHrefAttributes = new HashSet(); + globalHrefAttributes.add("background"); + globalHrefAttributes.add("data-href"); + globalHrefAttributes.add("data-uri"); } @@ -84,11 +104,19 @@ public void handleTagOpen(TagNode tag) { inTitle = !tag.isEmptyXmlTag(); return; } + // first the global attributes: - // background - String v = tag.getAttribute("background"); - if(v != null) { - data.addHref(PATH,makePath(name,"background"),"url",v); + Vector attributes = tag.getAttributesEx(); + for (Attribute a : attributes) { + String attrName = a.getName(); + String attrValue = a.getValue(); + if (attrName == null || attrValue == null) { + continue; + } + attrName = attrName.toLowerCase(Locale.ROOT); + if (globalHrefAttributes.contains(attrName)) { + data.addHref(PATH,makePath(name,attrName),"url",attrValue); + } } // TODO: style attribute, BASE(href) tag, Resolve URLs @@ -296,12 +324,24 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs } } + private static class ButtonTagExtractor implements TagExtractor { + public void extract(HTMLMetaData 
data, TagNode node, ExtractingParseObserver obs) { + addBasicHrefs(data,node,"formaction"); + } + } + private static class EmbedTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { addBasicHrefs(data,node,"src"); } } + private static class EmbedVideoTagExtractor implements TagExtractor { + public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { + addBasicHrefs(data,node,"src","poster"); + } + } + private static class FormTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { ArrayList l = new ArrayList(); @@ -329,21 +369,26 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs addBasicHrefs(data,node,"src"); } } + private static class IFrameTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { addBasicHrefs(data,node,"src"); } } + private static class ImgTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { addHrefWithAttrs(data,node,"src","alt","title"); + addBasicHrefs(data,node,"longdesc"); } } + private static class InputTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { - addBasicHrefs(data,node,"src"); + addBasicHrefs(data,node,"src","formaction"); } } + private static class LinkTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { ArrayList l = getAttrListUrl(node,"href","rel","type"); @@ -352,6 +397,13 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs } } } + + private static class MenuitemTagExtractor implements TagExtractor { + public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { + addBasicHrefs(data,node,"icon"); + } + } + 
private static class MetaTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { ArrayList l = getAttrList(node,"name","rel","content","http-equiv"); @@ -360,11 +412,19 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs } } } + private static class ObjectTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { - addBasicHrefs(data,node,"codebase","cdata"); + addBasicHrefs(data,node,"codebase","cdata","data"); } } + + private static class QuotationLinkTagExtractor implements TagExtractor { + public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { + addBasicHrefs(data,node,"cite"); + } + } + private static class ScriptTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { ArrayList l = getAttrListUrl(node,"src","type"); @@ -373,6 +433,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs } } } + private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) { Matcher m = pattern.matcher(content); int idx = 0; diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index bfbd6f02..8f690a06 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -1,15 +1,33 @@ package org.archive.resource.html; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Logger; + +import org.archive.extract.ExtractingResourceFactoryMapper; +import org.archive.extract.ExtractingResourceProducer; +import org.archive.extract.ProducerUtils; +import org.archive.extract.ResourceFactoryMapper; import 
org.archive.resource.MetaData; +import org.archive.resource.Resource; +import org.archive.resource.ResourceParseException; +import org.archive.resource.ResourceProducer; import org.htmlparser.nodes.TextNode; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + import junit.framework.TestCase; public class ExtractingParseObserverTest extends TestCase { + private static final Logger LOG = + Logger.getLogger(ExtractingParseObserverTest.class.getName()); + public void testHandleStyleNodeExceptions() throws Exception { String[] tests = { "some css", @@ -103,5 +121,148 @@ private void checkExtract(String[] data) throws JSONException { } } + private void checkLink(Multimap links, String url, String path) { + assertTrue("Link with URL " + url + " not found", links.containsKey(url)); + assertTrue("Wrong path " + path + " for " + url, links.get(url).contains(path)); + } + + private void checkLinks(Resource resource, String[][] expectedLinks) { + assertNotNull(resource); + assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource); + MetaData md = resource.getMetaData(); + LOG.info(md.toString()); + Multimap links = ArrayListMultimap.create(); + JSONObject head = md.optJSONObject("Head"); + if (head != null) { + // + String baseUrl = (String) head.opt("Base"); + if (baseUrl != null) { + links.put(baseUrl, "__base__"); + } + // + JSONArray metas = head.optJSONArray("Metas"); + if (metas != null) { + for (int i = 0; i < metas.length(); i++) { + JSONObject o = (JSONObject) metas.optJSONObject(i); + String httpEquiv = o.optString("http-equiv"); + if (httpEquiv != null && httpEquiv.equalsIgnoreCase("Refresh")) { + String metaRefreshTarget = o.optString("content"); + if (metaRefreshTarget != null) { + metaRefreshTarget = metaRefreshTarget.replaceFirst("(?i)(?:^\\d+\\s*;)?\\s*url=", ""); + 
links.put(metaRefreshTarget, "__meta_refresh__"); + } + } + } + } + } + // extract outlinks + List linkArrays = new ArrayList(); + if (md.optJSONArray("Links") != null) { + linkArrays.add(md.optJSONArray("Links")); + } + try { + if (md.getJSONObject("Head") != null && md.getJSONObject("Head").getJSONArray("Link") != null) { + linkArrays.add(md.getJSONObject("Head").getJSONArray("Link")); + } + } catch (JSONException e1) { + } + for (JSONArray ldata : linkArrays) { + for (int i = 0; i < ldata.length(); i++) { + JSONObject o = (JSONObject) ldata.optJSONObject(i); + try { + String url = o.getString("url"); + links.put(url, o.getString("path")); + LOG.info(" found link: " + o.getString("url") + " " + o.getString("path")); + } catch (JSONException e) { + fail("Failed to extract URL from link: " + e.getMessage()); + } + } + } + assertEquals("Unexpected number of links", expectedLinks.length, links.size()); + for (String[] l : expectedLinks) { + checkLink(links, l[0], l[1]); + } + } + + public void testLinkExtraction() throws ResourceParseException, IOException { + String testFileName = "link-extraction-test.warc"; + ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath()); + ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper(); + ExtractingResourceProducer extractor = + new ExtractingResourceProducer(producer, mapper); + extractor.getNext(); // skip warcinfo record + String[][] html4links = { + {"http://www.example.com/", "__base__"}, + {"http://www.example.com/redirected.html", "__meta_refresh__"}, + {"background.jpg", "BODY@/background"}, + {"http://www.example.com/a-href.html", "A@/href"}, + {"#anchor", "A@/href"}, + {"image.png", "IMG@/src"}, + {"image.gif", "IMG@/src"}, + {"http://example.com/image-description.html#image.gif", "IMG@/longdesc"}, + {"helloworld.swf", "OBJECT@/data"}, + {"http://www.example.com/shakespeare.html", "Q@/cite"}, + {"http://www.example.com/shakespeare-long.html", 
"BLOCKQUOTE@/cite"} + }; + checkLinks(extractor.getNext(), html4links); + String[][] html5links = { + {"http:///www.example.com/video.html", "LINK@/href", "canonical"}, + {"video.rss", "LINK@/href", "alternate"}, + {"https://archive.org/download/WebmVp8Vorbis/webmvp8.gif", "VIDEO@/poster"}, + {"https://archive.org/download/WebmVp8Vorbis/webmvp8.webm", "SOURCE@/src"}, + {"https://archive.org/download/WebmVp8Vorbis/webmvp8_512kb.mp4", "SOURCE@/src"}, + {"https://archive.org/download/WebmVp8Vorbis/webmvp8.ogv", "SOURCE@/src"} + }; + checkLinks(extractor.getNext(), html5links); + String[][] html5links2 = { + {"http://www.example.com/", "A@/href"}, + }; + checkLinks(extractor.getNext(), html5links2); + String[][] fbVideoLinks = { + {"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"}, + {"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"}, + {"https://www.facebook.com/facebook/", "A@/href"}, + {"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"} + }; + checkLinks(extractor.getNext(), fbVideoLinks); + String[][] dataHrefLinks = { + {"standard.css", "LINK@/href", "stylesheet"}, + {"https://www.facebook.com/elegantthemes/videos/10153760379211923/", "DIV@/data-href"}, + {"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"}, + {"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"}, + {"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"}, + {"https://www.facebook.com/facebook/", "A@/href"}, + {"//edge.flowplayer.org/bauhaus.webm", "SOURCE@/src"}, + {"//edge.flowplayer.org/bauhaus.mp4", "SOURCE@/src"}, + {"//edge.flowplayer.org/functional.webm", "BUTTON@/data-href"}, + {"/content-page", "ARTICLE@/data-href"}, + {"/content-page", "A@/href"}, + {"/tags/content","A@/href"}, + {"/tags/headlines", "A@/href"}, + {"http://grabaperch.com", "DIV@/data-href"}, + {"green.css", "LINK@/data-href"}, + {"blue.css", "LINK@/data-href"}, 
+ {"http://codecanyon.net/user/CodingJack", "A@/data-href"}, + {"jackbox/img/thumbs/4.jpg", "IMG@/src"}, + {"//venobox-destination", "A@/data-href"}, + {"#", "A@/href"}, + {"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0&autoplay=1", "DIV@/data-href"}, + {"#", "A@/href"}, + {"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0", "IFRAME@/src"} + }; + checkLinks(extractor.getNext(), dataHrefLinks); + String[][] fbSocialLinks = { + {"http://www.your-domain.com/your-page.html", "DIV@/data-uri"}, + {"https://developers.facebook.com/docs/plugins/comments#configurator", "DIV@/data-href"}, + {"https://www.facebook.com/zuck/posts/10102735452532991?comment_id=1070233703036185", "DIV@/data-href"}, + {"https://www.facebook.com/zuck", "DIV@/data-href"}, + {"https://developers.facebook.com/docs/plugins/", "DIV@/data-href"}, + {"https://www.facebook.com/facebook", "DIV@/data-href"}, + {"https://www.facebook.com/facebook", "BLOCKQUOTE@/cite"}, + {"https://www.facebook.com/facebook", "A@/href"}, + {"http://www.your-domain.com/your-page.html", "DIV@/data-href"} + }; + checkLinks(extractor.getNext(), fbSocialLinks); + } } diff --git a/src/test/resources/org/archive/resource/html/link-extraction-test.warc b/src/test/resources/org/archive/resource/html/link-extraction-test.warc new file mode 100644 index 00000000..ab0e54c8 --- /dev/null +++ b/src/test/resources/org/archive/resource/html/link-extraction-test.warc @@ -0,0 +1,320 @@ +WARC/1.0 +WARC-Type: warcinfo +Content-Type: application/warc-fields +WARC-Date: 2017-02-20T14:00:56Z +Content-Length: 128 + +format: WARC File Format 1.0 +conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf +robots: classic + + + +WARC/1.0 +WARC-Type: response +WARC-Date: 2017-02-20T14:00:56Z +WARC-Target-URI: http://www.example.com/html4.html +Content-Type: application/http; msgtype=response +Content-Length: 1243 + +HTTP/1.1 200 OK +Date: Mon, 20 Feb 2017 14:00:56 GMT +Content-Length: 1125 
+Content-Type: application/xhtml+xml + + + + + + + +Test XHTML Link Extraction + + +A@/href +

+ anchor only + IMG@/src + IMG@/longdesc + +

+ To be or not to be. +

+To be, or not to be, that is the question:
+Whether 'tis nobler in the mind to suffer
+The slings and arrows of outrageous fortune, … +

+ + + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://www.example.com/link-extraction-test-html5-video.html +WARC-Date: 2017-02-20T21:35:03Z +Content-Type: application/http; msgtype=response +Content-Length: 890 + +HTTP/1.1 200 OK +Date: Mon, 20 Feb 2017 21:35:03 GMT +Content-Length: 789 +Content-Type: text/html + + + + +Test HTML5 Video Tag + + + + + + +

+ + + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://www.example.com/poor_html5.html +WARC-Date: 2017-02-21T15:50:40Z +Content-Type: application/http; msgtype=response +Content-Length: 594 + +HTTP/1.1 200 OK +Date: Tue, 21 Feb 2017 15:50:40 GMT +Content-Length: 486 +Content-Type: text/html + + +Testing poor HTML5 + + + + + +This is valid HTML5! + + + +

header

+ +

headline

+ +

paragraph one with link. + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://www.example.com/fb-video.html +WARC-Date: 2017-02-20T16:58:50Z +Content-Type: application/http; msgtype=response +Content-Length: 1330 + +HTTP/1.1 200 OK +Date: Mon, 20 Feb 2017 16:58:50 GMT +Content-Length: 1194 +Content-Type: text/html + + + + + fb-video - Embedded Videos - Social Plugins + + + + +

+ + + +

+ How to Share With Just Friends +
How to share with just friends.
+ Posted by Facebook on Friday, December 5, 2014 +

+ + + + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://www.example.com/data-href.examples.html +WARC-Date: 2017-02-21T21:05:10Z +Content-Type: application/http; msgtype=response +Content-Length: 3160 + +HTTP/1.1 200 OK +Date: Tue, 21 Feb 2017 21:05:10 GMT +Content-Length: 3057 +Content-Type: text/html + + + + + + + + + + + + +

+ + +

+ How to Share With Just Friends +
How to share with just friends.
+ Posted by Facebook on Friday, December 5, 2014 +

+ + +

+ +

+ + +

Headline goes here.

And here goes a bit of copy about the content of the article.

+ Tags: content, headlines +

+ + +

+ + + +

+ + + +venobox + + + + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://www.example.com/fb-social-plugins.html +WARC-Date: 2017-02-22T09:33:02Z +Content-Type: application/http; msgtype=response +Content-Length: 1870 + +HTTP/1.1 200 OK +Date: Wed, 22 Feb 2017 09:33:02 GMT +Content-Length: 1767 +Content-Type: text/html + + +

+ + +

Facebook

+ + + + + From 6aa43f83a2cbc2acd0feb7f2c81d66f4ef1b13c5 Mon Sep 17 00:00:00 2001 From: Mohamed Elsayed Date: Thu, 2 Mar 2017 15:28:16 +0200 Subject: [PATCH 002/189] Fix #25: move missing unit tests over from Heritrix3 --- .../archive/io/ArchiveReaderFactoryTest.java | 94 +++ .../io/BufferedSeekInputStreamTest.java | 67 ++ .../archive/io/HeaderedArchiveRecordTest.java | 209 ++++++ .../archive/io/RecordingInputStreamTest.java | 132 ++++ .../archive/io/ReplayCharSequenceTest.java | 391 ++++++++++ .../io/RepositionableInputStreamTest.java | 70 ++ .../org/archive/io/arc/ARCWriterPoolTest.java | 122 +++ .../org/archive/io/arc/ARCWriterTest.java | 699 ++++++++++++++++++ .../org/archive/io/warc/WARCWriterTest.java | 512 +++++++++++++ .../org/archive/uid/UUIDGeneratorTest.java | 44 ++ .../java/org/archive/util/FileUtilsTest.java | 271 +++++++ .../org/archive/util/MimetypeUtilsTest.java | 63 ++ .../org/archive/util/PropertyUtilsTest.java | 45 ++ .../org/archive/util/anvl/ANVLRecordTest.java | 128 ++++ 14 files changed, 2847 insertions(+) create mode 100644 src/test/java/org/archive/io/ArchiveReaderFactoryTest.java create mode 100644 src/test/java/org/archive/io/BufferedSeekInputStreamTest.java create mode 100644 src/test/java/org/archive/io/HeaderedArchiveRecordTest.java create mode 100644 src/test/java/org/archive/io/RecordingInputStreamTest.java create mode 100644 src/test/java/org/archive/io/ReplayCharSequenceTest.java create mode 100644 src/test/java/org/archive/io/RepositionableInputStreamTest.java create mode 100644 src/test/java/org/archive/io/arc/ARCWriterPoolTest.java create mode 100644 src/test/java/org/archive/io/arc/ARCWriterTest.java create mode 100644 src/test/java/org/archive/io/warc/WARCWriterTest.java create mode 100644 src/test/java/org/archive/uid/UUIDGeneratorTest.java create mode 100644 src/test/java/org/archive/util/FileUtilsTest.java create mode 100644 src/test/java/org/archive/util/MimetypeUtilsTest.java create mode 100644 
src/test/java/org/archive/util/PropertyUtilsTest.java create mode 100644 src/test/java/org/archive/util/anvl/ANVLRecordTest.java diff --git a/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java b/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java new file mode 100644 index 00000000..2313868c --- /dev/null +++ b/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java @@ -0,0 +1,94 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.File; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Iterator; + +import org.apache.commons.lang.StringUtils; +import org.archive.io.ArchiveRecord; +import org.archive.io.arc.ARCWriterTest; +import org.archive.util.TmpDirTestCase; + +public class ArchiveReaderFactoryTest extends TmpDirTestCase { + /** + * Test local file as URL + * @throws IOException + */ + public void testGetFileURL() throws IOException { + File arc = ARCWriterTest.createARCFile(getTmpDir(), true); + ArchiveReader reader = null; + try { + reader = ArchiveReaderFactory. 
+ get(new URL("file:////" + arc.getAbsolutePath())); + for (Iterator i = reader.iterator(); i.hasNext();) { + ArchiveRecord r = (ArchiveRecord)i.next(); + assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype())); + } + } finally { + if (reader != null) { + reader.close(); + } + } + } + + /** + * Test local file as File + * @throws IOException + */ + public void testGetFile() throws IOException { + File arc = ARCWriterTest.createARCFile(getTmpDir(), true); + ArchiveReader reader = null; + try { + reader = ArchiveReaderFactory.get(arc.getAbsoluteFile()); + for (Iterator i = reader.iterator(); i.hasNext();) { + ArchiveRecord r = (ArchiveRecord)i.next(); + assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype())); + } + } finally { + if (reader != null) { + reader.close(); + } + } + } + + /** + * Test local file as String path + * @throws IOException + */ + public void testGetPath() throws IOException { + File arc = ARCWriterTest.createARCFile(getTmpDir(), true); + ArchiveReader reader = null; + try { + reader = ArchiveReaderFactory.get(arc.getAbsoluteFile().getAbsolutePath()); + for (Iterator i = reader.iterator(); i.hasNext();) { + ArchiveRecord r = (ArchiveRecord)i.next(); + assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype())); + } + } finally { + if (reader != null) { + reader.close(); + } + } + } +} diff --git a/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java b/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java new file mode 100644 index 00000000..270e45e0 --- /dev/null +++ b/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java @@ -0,0 +1,67 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. 
+ * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.util.Random; + +import junit.framework.TestCase; + + +/** + * Unit test for BufferedSeekInputStream. The tests do some random + * repositioning in the stream to make sure the buffer is always valid. + * + * @author pjack + */ +public class BufferedSeekInputStreamTest extends TestCase { + + + private static byte[] TEST_DATA = makeTestData(); + + public void testPosition() throws Exception { + Random random = new Random(); + ArraySeekInputStream asis = new ArraySeekInputStream(TEST_DATA); + BufferedSeekInputStream bsis = new BufferedSeekInputStream(asis, 11); + for (int i = 0; i < TEST_DATA.length; i++) { + byte b = (byte)bsis.read(); + assertEquals(TEST_DATA[i], b); + } + for (int i = 0; i < 1000; i++) { + int index = random.nextInt(TEST_DATA.length); + bsis.position(index); + char expected = (char)((int)TEST_DATA[index] & 0xFF); + char read = (char)(bsis.read() & 0xFF); + assertEquals(expected, read); + } + } + + + private static byte[] makeTestData() { + String s = "If the dull substance of my flesh were thought\n" + + "Injurious distance could not stop my way\n" + + "For then, despite of space, I would be brought\n" + + "From limits far remote where thou dost stay.\n"; + byte[] r = new byte[s.length()]; + for (int i = 0; i < r.length; i++) { + r[i] = (byte)s.charAt(i); +// r[i] = (byte)s.charAt(i); + } + return r; + } +} diff --git 
a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java new file mode 100644 index 00000000..9f7e2a15 --- /dev/null +++ b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java @@ -0,0 +1,209 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.io; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import junit.framework.TestCase; + +import org.apache.commons.httpclient.Header; +import org.archive.io.arc.ARCRecord; +import org.archive.io.warc.WARCRecord; + +public class HeaderedArchiveRecordTest extends TestCase { + private static final String HTTPHEADER = "HTTP/1.1 200 OK\r\n" + + "Last-Modified: Sun, 28 Aug 2005 14:10:55 GMT\r\n" + + "Content-Length: 108\r\n" + "Connection: close\r\n" + + "Content-Type: text/html\r\n" + "\r\n"; + private static final String BODY = "\r\n" + " \r\n" + + " Neue Seite 1\r\n" + " \r\n" + + " \r\n" + " \r\n" + ""; + + public void testParseHttpHeadersInWARC() throws IOException { + final String url = "http://foo.maths.uq.edu.au/index.html"; + // final String warcHeader = "WARC/0.10 000000000486 response " + + // url + " 20070315152520 " + + // "urn:uuid:d8b342a8-dba4-4d7f-a551-1d8184f2ff58 " + + // "application/http; msgtype=response\r\n" + + // "Checksum: sha1:IT6YEX5WHKK57GOEHV2YHTTXEP5KPM6A\r\n" + + // "IP-Address: 80.150.6.184\r\n" + + // "\r\n"; + + final String warcHeader = "WARC/0.12\r\n" + + "MIME-Version: 1.0\r\n" + + "WARC-Record-Type: response\r\n" + + "WARC-Target-URI: http://foo.maths.uq.edu.au/index.html\r\n" + + "WARC-Date: 2006-09-19T17:20:24Z\r\n" + + "WARC-Digest: sha1:IT6YEX5WHKK57GOEHV2YHTTXEP5KPM6A\r\n" + + "WARC-IP-Address: 80.150.6.184\r\n" + + "Content-ID: \r\n" + + "Content-Type: application/http; msgtype=response\r\n" + + "Content-Length: " + (HTTPHEADER.length() + BODY.length()) + "\r\n" + + "\r\n"; + + final String hdr = warcHeader + HTTPHEADER + BODY; + + WARCRecord r = new WARCRecord(new ByteArrayInputStream(hdr.getBytes()), + "READER_IDENTIFIER", 0, false, true); + HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true); + + har.skipHttpHeader(); + + byte[] b = new byte[BODY.length()]; + 
har.read(b); + String bodyRead = new String(b); + assertEquals(BODY, bodyRead); + assertHeaderCorrectlyParsed(har.getContentHeaders()); + assertEquals("failed to retrieve Url from metadata", har.getHeader() + .getUrl(), url); + } + + public void testParseHttpHeadersInARC() throws IOException { + final int len = HTTPHEADER.length() + BODY.length(); + final int contentLength = BODY.length(); + final String url = "http://www.ly.gov.tw:80/accpart.htm"; + final String hdr = HTTPHEADER + BODY; + // Interesting difference between ARCRecord and WARCRecord is that the + // stream passed the ARCRecord is supposed to be just past the + // ARCRecord metadata line where as stream passed WARCRecord is at + // record start. TODO: Add to ARCRecord constructor that doesn't + // take an ArchiveRecordHeader but rather parses it from the stream. + ArchiveRecordHeader arh = new ArchiveRecordHeader() { + public int getContentBegin() { + // TODO: In ARCs, this is where http headers end and + // the content begins. Need to reconcile for generic + // HeaderedArchiveRecord processing. In this context, it + // makes sense setting it to zero -- HeaderedArchiveRecord + // will then figure it out. 
+ return 0; + } + + public String getDate() { + return null; + } + + public String getDigest() { + return null; + } + + public Set getHeaderFieldKeys() { + return null; + } + + public Map getHeaderFields() { + return null; + } + + public Object getHeaderValue(String key) { + return null; + } + + public long getLength() { + return len; + } + + public long getContentLength() { + return contentLength; + } + + public String getMimetype() { + return null; + } + + public long getOffset() { + return 0; + } + + public String getReaderIdentifier() { + return null; + } + + public String getRecordIdentifier() { + return null; + } + + public String getUrl() { + return url; + } + + public String getVersion() { + return null; + } + + }; + ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes()), + arh, 0, false, true, false); + + HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true); + har.skipHttpHeader(); + byte[] b = new byte[BODY.length()]; + har.read(b); + String bodyRead = new String(b); + assertEquals(BODY, bodyRead); + assertHeaderCorrectlyParsed(har.getContentHeaders()); + } + + public void testEasierParseHttpHeadersInARC() throws IOException { + final String url = "http://www.archive.org/index.htm"; + final String arcHeader = url + + " 192.168.0.1 20070515111004 text/html 167568\n"; + final String hdr = arcHeader + HTTPHEADER + BODY; + + ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes()), + "READER_IDENTIFIER", 0, false, true, false); + + HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true); + har.skipHttpHeader(); + byte[] b = new byte[BODY.length()]; + har.read(b); + String bodyRead = new String(b); + assertEquals(BODY, bodyRead); + assertHeaderCorrectlyParsed(har.getContentHeaders()); + assertEquals("failed to retrieve Url from metadata", har.getHeader() + .getUrl(), url); + } + + private void assertHeaderCorrectlyParsed(Header[] headers) { + final List orgHeaders = Arrays.asList(HTTPHEADER.split("\r\n")); + 
assertEquals("not all HTTP header entries have been retrieved", + orgHeaders.size(), headers.length + 1); + + for (Header header : headers) { + assertTrue(orgHeaders.contains(header.getName() + ": " + + header.getValue())); + } + } + + public void testNoheaderWARC() throws IOException { + String b = "hello world"; + String c = "WARC/0.12\r\nContent-Type: text/plain\r\n" + + "Content-Length: " + b.length() + "\r\n\r\n" + b; + org.archive.io.warc.WARCRecord r = new org.archive.io.warc.WARCRecord( + new ByteArrayInputStream(c.getBytes()), "READER_IDENTIFIER", 0, + false, true); + HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true); + assertTrue(har.isStrict()); + } +} diff --git a/src/test/java/org/archive/io/RecordingInputStreamTest.java b/src/test/java/org/archive/io/RecordingInputStreamTest.java new file mode 100644 index 00000000..20a8b8b3 --- /dev/null +++ b/src/test/java/org/archive/io/RecordingInputStreamTest.java @@ -0,0 +1,132 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.io; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.PipedInputStream; +import java.io.PipedOutputStream; + +import org.archive.util.TmpDirTestCase; + + +/** + * Test cases for RecordingInputStream. + * + * @author gojomo + */ +public class RecordingInputStreamTest extends TmpDirTestCase +{ + + + /* + * @see TmpDirTestCase#setUp() + */ + protected void setUp() throws Exception + { + super.setUp(); + } + + /** + * Test readFullyOrUntil soft (no exception) and hard (exception) + * length cutoffs, timeout, and rate-throttling. + * + * @throws IOException + * @throws InterruptedException + * @throws RecorderTimeoutException + */ + public void testReadFullyOrUntil() throws RecorderTimeoutException, IOException, InterruptedException + { + RecordingInputStream ris = new RecordingInputStream(16384, (new File( + getTmpDir(), "testReadFullyOrUntil").getAbsolutePath())); + ByteArrayInputStream bais = new ByteArrayInputStream( + "abcdefghijklmnopqrstuvwxyz".getBytes()); + // test soft max + ris.open(bais); + ris.setLimits(10,0,0); + ris.readFullyOrUntil(7); + ris.close(); + ReplayInputStream res = ris.getReplayInputStream(); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + res.readFullyTo(baos); + assertEquals("soft max cutoff","abcdefg",new String(baos.toByteArray())); + // test hard max + bais.reset(); + baos.reset(); + ris.open(bais); + boolean exceptionThrown = false; + try { + ris.setLimits(10,0,0); + ris.readFullyOrUntil(13); + } catch (RecorderLengthExceededException ex) { + exceptionThrown = true; + } + assertTrue("hard max exception",exceptionThrown); + ris.close(); + res = ris.getReplayInputStream(); + res.readFullyTo(baos); + assertEquals("hard max cutoff","abcdefghijk", + new String(baos.toByteArray())); + // test timeout + PipedInputStream pin = new PipedInputStream(); + PipedOutputStream pout = new PipedOutputStream(pin); + 
ris.open(pin); + exceptionThrown = false; + trickle("abcdefghijklmnopqrstuvwxyz".getBytes(),pout); + try { + ris.setLimits(0,5000,0); + ris.readFullyOrUntil(0); + } catch (RecorderTimeoutException ex) { + exceptionThrown = true; + } + assertTrue("timeout exception",exceptionThrown); + ris.close(); + // test rate limit + bais = new ByteArrayInputStream(new byte[1024*2*5]); + ris.open(bais); + long startTime = System.currentTimeMillis(); + ris.setLimits(0,0,2); + ris.readFullyOrUntil(0); + long endTime = System.currentTimeMillis(); + long duration = endTime - startTime; + assertTrue("read too fast: "+duration,duration>=5000); + ris.close(); + } + + protected void trickle(final byte[] bytes, final PipedOutputStream pout) { + new Thread() { + public void run() { + try { + for (int i = 0; i < bytes.length; i++) { + Thread.sleep(1000); + pout.write(bytes[i]); + } + pout.close(); + } catch (IOException e) { + // do nothing + } catch (Exception e) { + System.err.print(e); + } + } + }.start(); + + } +} diff --git a/src/test/java/org/archive/io/ReplayCharSequenceTest.java b/src/test/java/org/archive/io/ReplayCharSequenceTest.java new file mode 100644 index 00000000..9208594a --- /dev/null +++ b/src/test/java/org/archive/io/ReplayCharSequenceTest.java @@ -0,0 +1,391 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.text.NumberFormat; +import java.util.Date; +import java.util.Random; +import java.util.logging.Logger; + +import org.archive.util.FileUtils; +import org.archive.util.TmpDirTestCase; + +import com.google.common.base.Charsets; + +/** + * Test ReplayCharSequences. + * + * @author stack, gojomo + * @version $Revision$, $Date$ + */ +public class ReplayCharSequenceTest extends TmpDirTestCase +{ + /** + * Logger. + */ + private static Logger logger = + Logger.getLogger("org.archive.io.ReplayCharSequenceFactoryTest"); + + + private static final int SEQUENCE_LENGTH = 127; + private static final int MULTIPLIER = 3; + private static final int BUFFER_SIZE = SEQUENCE_LENGTH * MULTIPLIER; + private static final int INCREMENT = 1; + + /** + * Buffer of regular content. + */ + private byte [] regularBuffer = null; + + /* + * @see TestCase#setUp() + */ + protected void setUp() throws Exception + { + super.setUp(); + this.regularBuffer = + fillBufferWithRegularContent(new byte [BUFFER_SIZE]); + } + + public void testShiftjis() throws IOException { + + // Here's the bytes for the JIS encoding of the Japanese form of Nihongo + byte[] bytes_nihongo = { + (byte) 0x1B, (byte) 0x24, (byte) 0x42, (byte) 0x46, + (byte) 0x7C, (byte) 0x4B, (byte) 0x5C, (byte) 0x38, + (byte) 0x6C, (byte) 0x1B, (byte) 0x28, (byte) 0x42, + (byte) 0x1B, (byte) 0x28, (byte) 0x42 }; + final String ENCODING = "SJIS"; + // Here is nihongo converted to JVM encoding. + String nihongo = new String(bytes_nihongo, ENCODING); + + RecordingOutputStream ros = writeTestStream( + bytes_nihongo,MULTIPLIER, + "testShiftjis",MULTIPLIER); + // TODO: check for existence of overflow file? 
+ ReplayCharSequence rcs = getReplayCharSequence(ros,Charset.forName(ENCODING)); + + // Now check that start of the rcs comes back in as nihongo string. + String rcsStr = rcs.subSequence(0, nihongo.length()).toString(); + assertTrue("Nihongo " + nihongo + " does not equal converted string" + + " from rcs " + rcsStr, + nihongo.equals(rcsStr)); + // And assert next string is also properly nihongo. + if (rcs.length() >= (nihongo.length() * 2)) { + rcsStr = rcs.subSequence(nihongo.length(), + nihongo.length() + nihongo.length()).toString(); + assertTrue("Nihongo " + nihongo + " does not equal converted " + + " string from rcs (2nd time)" + rcsStr, + nihongo.equals(rcsStr)); + } + } + + public void testGetReplayCharSequenceByteZeroOffset() throws IOException { + + RecordingOutputStream ros = writeTestStream( + regularBuffer,MULTIPLIER, + "testGetReplayCharSequenceByteZeroOffset",MULTIPLIER); + ReplayCharSequence rcs = getReplayCharSequence(ros); + + for (int i = 0; i < MULTIPLIER; i++) { + accessingCharacters(rcs); + } + } + + private ReplayCharSequence getReplayCharSequence(RecordingOutputStream ros) throws IOException { + return getReplayCharSequence(ros,null); + } + + private ReplayCharSequence getReplayCharSequence(RecordingOutputStream ros, Charset charset) throws IOException { + return new GenericReplayCharSequence(ros.getReplayInputStream(), + ros.getBufferLength()/2, ros.backingFilename, charset); + } + + + public void testGetReplayCharSequenceMultiByteZeroOffset() + throws IOException { + + RecordingOutputStream ros = writeTestStream( + regularBuffer,MULTIPLIER, + "testGetReplayCharSequenceMultiByteZeroOffset",MULTIPLIER); + ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.UTF_8); + + for (int i = 0; i < MULTIPLIER; i++) { + accessingCharacters(rcs); + } + } + + public void testReplayCharSequenceByteToString() throws IOException { + String fileContent = "Some file content"; + byte [] buffer = fileContent.getBytes(); + RecordingOutputStream ros = 
writeTestStream( + buffer,1, + "testReplayCharSequenceByteToString.txt",0); + ReplayCharSequence rcs = getReplayCharSequence(ros); + String result = rcs.toString(); + assertEquals("Strings don't match",result,fileContent); + } + + private String toHexString(String str) + { + if (str != null) { + StringBuilder buf = new StringBuilder("{ "); + buf.append(Integer.toString(str.charAt(0), 16)); + for (int i = 1; i < str.length(); i++) { + buf.append(", "); + buf.append(Integer.toString(str.charAt(i), 16)); + } + buf.append(" }"); + return buf.toString(); + } + else + return "null"; + } + + public void testSingleByteEncodings() throws IOException { + byte[] bytes = { + (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, + (byte) 0x7d, (byte) 0x7e, (byte) 0x7f, (byte) 0x80, + (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, + (byte) 0xfc, (byte) 0xfd, (byte) 0xfe, (byte) 0xff }; + + String latin1String = new String(bytes, "latin1"); + RecordingOutputStream ros = writeTestStream( + bytes, 1, "testSingleByteEncodings-latin1.txt", 0); + ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.ISO_8859_1); + String result = rcs.toString(); + logger.fine("latin1[0] " + toHexString(latin1String)); + logger.fine("latin1[1] " + toHexString(result)); + assertEquals("latin1 strings don't match", result, latin1String); + + String w1252String = new String(bytes, "windows-1252"); + ros = writeTestStream( + bytes, 1, "testSingleByteEncodings-windows-1252.txt", 0); + rcs = getReplayCharSequence(ros,Charset.forName("windows-1252")); + result = rcs.toString(); + logger.fine("windows-1252[0] " + toHexString(w1252String)); + logger.fine("windows-1252[1] " + toHexString(result)); + assertEquals("windows-1252 strings don't match", result, w1252String); + + String asciiString = new String(bytes, "ascii"); + ros = writeTestStream( + bytes, 1, "testSingleByteEncodings-ascii.txt", 0); + rcs = getReplayCharSequence(ros,Charset.forName("ascii")); + result = rcs.toString(); + 
logger.fine("ascii[0] " + toHexString(asciiString)); + logger.fine("ascii[1] " + toHexString(result)); + assertEquals("ascii strings don't match", result, asciiString); + } + + public void testReplayCharSequenceByteToStringOverflow() throws IOException { + String fileContent = "Some file content. "; // ascii + byte [] buffer = fileContent.getBytes(); + RecordingOutputStream ros = writeTestStream( + buffer,1, + "testReplayCharSequenceByteToStringOverflow.txt",1); + String expectedContent = fileContent+fileContent; + + // The string is ascii which is a subset of both these encodings. Use + // both encodings because they exercise different code paths. UTF-8 is + // decoded to UTF-16 while windows-1252 is memory mapped directly. See + // GenericReplayCharSequence + ReplayCharSequence rcsUtf8 = getReplayCharSequence(ros,Charsets.UTF_8); + ReplayCharSequence rcs1252 = getReplayCharSequence(ros,Charset.forName("windows-1252")); + + String result = rcsUtf8.toString(); + assertEquals("Strings don't match", expectedContent, result); + + result = rcs1252.toString(); + assertEquals("Strings don't match", expectedContent, result); + } + + public void testReplayCharSequenceByteToStringMulti() throws IOException { + String fileContent = "Some file content"; + byte [] buffer = fileContent.getBytes("UTF-8"); + final int MULTIPLICAND = 10; + StringBuilder sb = + new StringBuilder(MULTIPLICAND * fileContent.length()); + for (int i = 0; i < MULTIPLICAND; i++) { + sb.append(fileContent); + } + String expectedResult = sb.toString(); + RecordingOutputStream ros = writeTestStream( + buffer,1, + "testReplayCharSequenceByteToStringMulti.txt",MULTIPLICAND-1); + for (int i = 0; i < 3; i++) { + ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.UTF_8); + String result = rcs.toString(); + assertEquals("Strings don't match", result, expectedResult); + rcs.close(); + System.gc(); + System.runFinalization(); + } + } + + public void xestHugeReplayCharSequence() throws IOException { + 
String fileContent = "01234567890123456789"; + String characterEncoding = "ascii"; + byte[] buffer = fileContent.getBytes(characterEncoding); + + long reps = (long) Integer.MAX_VALUE / (long) buffer.length + 1000000l; + + logger.info("writing " + (reps * buffer.length) + + " bytes to testHugeReplayCharSequence.txt"); + RecordingOutputStream ros = writeTestStream(buffer, 0, + "testHugeReplayCharSequence.txt", reps); + ReplayCharSequence rcs = getReplayCharSequence(ros,Charset.forName(characterEncoding)); + + if (reps * fileContent.length() > (long) Integer.MAX_VALUE) { + assertTrue("ReplayCharSequence has wrong length (length()=" + + rcs.length() + ") (should be " + Integer.MAX_VALUE + ")", + rcs.length() == Integer.MAX_VALUE); + } else { + assertEquals("ReplayCharSequence has wrong length (length()=" + + rcs.length() + ") (should be " + + (reps * fileContent.length()) + ")", (long) rcs.length(), + reps * (long) fileContent.length()); + } + + // boundary cases or something + for (int index : new int[] { 0, rcs.length() / 4, rcs.length() / 2, + rcs.length() - 1, rcs.length() / 4 }) { + // logger.info("testing char at index=" + + // NumberFormat.getInstance().format(index)); + assertEquals("Characters don't match (index=" + + NumberFormat.getInstance().format(index) + ")", + fileContent.charAt(index % fileContent.length()), rcs + .charAt(index)); + } + + // check that out of bounds indices throw exception + for (int n : new int[] { -1, Integer.MIN_VALUE, rcs.length() + 1 }) { + try { + String message = "rcs.charAt(" + n + ")=" + rcs.charAt(n) + + " ?!? 
-- expected IndexOutOfBoundsException"; + logger.severe(message); + fail(message); + } catch (IndexOutOfBoundsException e) { + logger.info("got expected exception: " + e); + } + } + + // check some characters at random spots & kinda stress test the + // system's memory mapping facility + Random rand = new Random(0); // seed so we get the same ones each time + for (int i = 0; i < 5000; i++) { + int index = rand.nextInt(rcs.length()); + // logger.info(i + ". testing char at index=" + + // NumberFormat.getInstance().format(index)); + assertEquals("Characters don't match (index=" + + NumberFormat.getInstance().format(index) + ")", + fileContent.charAt(index % fileContent.length()), rcs + .charAt(index)); + } + } + + /** + * Accessing characters test. + * + * Checks that characters in the rcs are in sequence. + * + * @param rcs The ReplayCharSequence to try out. + */ + private void accessingCharacters(CharSequence rcs) { + long timestamp = (new Date()).getTime(); + int seeks = 0; + for (int i = (INCREMENT * 2); (i + INCREMENT) < rcs.length(); + i += INCREMENT) { + checkCharacter(rcs, i); + seeks++; + for (int j = i - INCREMENT; j < i; j++) { + checkCharacter(rcs, j); + seeks++; + } + } + // Note that printing out below breaks cruisecontrols drawing + // of the xml unit test results because it outputs disallowed + // xml characters. + logger.fine(rcs + " seeks count " + seeks + " in " + + ((new Date().getTime()) - timestamp) + " milliseconds."); + } + + /** + * Check the character read. + * + * Throws assertion if not expected result. + * + * @param rcs ReplayCharSequence to read from. + * @param i Character offset. 
+ */ + private void checkCharacter(CharSequence rcs, int i) { + int c = rcs.charAt(i); + assertTrue("Character " + Integer.toString(c) + " at offset " + i + + " unexpected.", (c % SEQUENCE_LENGTH) == (i % SEQUENCE_LENGTH)); + } + + /** + * @param baseName + * @return RecordingOutputStream + * @throws IOException + */ + private RecordingOutputStream writeTestStream(byte[] content, + int memReps, String baseName, long fileReps) throws IOException { + String backingFilename = FileUtils.maybeRelative(getTmpDir(),baseName).getAbsolutePath(); + RecordingOutputStream ros = new RecordingOutputStream( + content.length * memReps, + backingFilename); + ros.open(); + ros.markMessageBodyBegin(); + for(long i = 0; i < (memReps+fileReps); i++) { + // fill buffer (repeat MULTIPLIER times) and + // overflow to disk (also MULTIPLIER times) + ros.write(content); + } + ros.close(); + return ros; + } + + + /** + * Fill a buffer w/ regular progression of single-byte + * (and <= 127) characters. + * @param buffer Buffer to fill. + * @return The buffer we filled. + */ + private byte [] fillBufferWithRegularContent(byte [] buffer) { + int index = 0; + for (int i = 0; i < buffer.length; i++) { + buffer[i] = (byte) (index & 0x00ff); + index++; + if (index >= SEQUENCE_LENGTH) { + // Reset the index. + index = 0; + } + } + return buffer; + } + + public void testCheckParameters() + { + // TODO. + } +} diff --git a/src/test/java/org/archive/io/RepositionableInputStreamTest.java b/src/test/java/org/archive/io/RepositionableInputStreamTest.java new file mode 100644 index 00000000..1c7cc74c --- /dev/null +++ b/src/test/java/org/archive/io/RepositionableInputStreamTest.java @@ -0,0 +1,70 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. 
+ * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.PrintWriter; + +import org.archive.util.TmpDirTestCase; + +public class RepositionableInputStreamTest extends TmpDirTestCase { + private File testFile; + private static final String LINE = "0123456789abcdefghijklmnopqrstuv"; + protected void setUp() throws Exception { + super.setUp(); + this.testFile = new File(getTmpDir(), this.getClass().getName()); + PrintWriter pw = new PrintWriter(new FileOutputStream(testFile)); + for (int i = 0; i < 100; i++) { + pw.print(LINE); + } + pw.close(); + } + protected void tearDown() throws Exception { + super.tearDown(); + } + public void testname() throws Exception { + // Make buffer awkward size so we run into buffers spanning issues. 
+ RepositionableInputStream ris = + new RepositionableInputStream(new FileInputStream(this.testFile), + 57); + int c = ris.read(); + assertEquals(1, ris.position()); + ris.read(); + ris.position(0); + assertEquals(0, ris.position()); + int c1 = ris.read(); + assertEquals(c, c1); + ris.position(0); + byte [] bytes = new byte[LINE.length()]; + long offset = 0; + for (int i = 0; i < 10; i++) { + ris.read(bytes, 0, LINE.length()); + assertEquals(LINE, new String(bytes)); + offset += LINE.length(); + assertEquals(offset, ris.position()); + } + long p = ris.position(); + ris.position(p - LINE.length()); + assertEquals(p - LINE.length(), ris.position()); + c = ris.read(); + assertEquals(c, c1); + } +} diff --git a/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java b/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java new file mode 100644 index 00000000..f0be6506 --- /dev/null +++ b/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java @@ -0,0 +1,122 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.io.arc; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.util.Arrays; + +import org.archive.io.WriterPool; +import org.archive.io.WriterPoolMember; +import org.archive.io.WriterPoolSettings; +import org.archive.util.TmpDirTestCase; + + +/** + * Test ARCWriterPool + */ +@SuppressWarnings("deprecation") +public class ARCWriterPoolTest extends TmpDirTestCase { + private static final String PREFIX = "TEST"; + + public void testARCWriterPool() + throws Exception { + final int MAX_ACTIVE = 3; + final int MAX_WAIT_MILLISECONDS = 100; + cleanUpOldFiles(PREFIX); + WriterPool pool = new ARCWriterPool(getSettings(true), + MAX_ACTIVE, MAX_WAIT_MILLISECONDS); + WriterPoolMember [] writers = new WriterPoolMember[MAX_ACTIVE]; + final String CONTENT = "Any old content"; + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + baos.write(CONTENT.getBytes()); + for (int i = 0; i < MAX_ACTIVE; i++) { + writers[i] = pool.borrowFile(); + assertEquals("Number active", i + 1, pool.getNumActive()); + ((ARCWriter)writers[i]).write("http://one.two.three", "no-type", + "0.0.0.0", 1234567890, CONTENT.length(), baos); + } + + // Pool is maxed out. 
New behavior is that additional requests + // block as long as necessary -- so no longer testing for timeout/ + // exception + + for (int i = (MAX_ACTIVE - 1); i >= 0; i--) { + pool.returnFile(writers[i]); + assertEquals("Number active", i, pool.getNumActive()); + assertEquals("Number idle", MAX_ACTIVE - pool.getNumActive(), + pool.getNumIdle()); + } + pool.close(); + } + + public void testInvalidate() throws Exception { + final int MAX_ACTIVE = 3; + final int MAX_WAIT_MILLISECONDS = 100; + cleanUpOldFiles(PREFIX); + WriterPool pool = new ARCWriterPool(getSettings(true), + MAX_ACTIVE, MAX_WAIT_MILLISECONDS); + WriterPoolMember [] writers = new WriterPoolMember[MAX_ACTIVE]; + final String CONTENT = "Any old content"; + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + baos.write(CONTENT.getBytes()); + for (int i = 0; i < MAX_ACTIVE; i++) { + writers[i] = pool.borrowFile(); + assertEquals("Number active", i + 1, pool.getNumActive()); + ((ARCWriter)writers[i]).write("http://one.two.three", "no-type", + "0.0.0.0", 1234567890, CONTENT.length(), baos); + } + + WriterPoolMember writer2Invalidate = writers[pool.getNumActive() - 1]; + writers[pool.getNumActive() - 1] = null; + pool.invalidateFile(writer2Invalidate); + for (int i = 0; i < (MAX_ACTIVE - 1); i++) { + if (writers[i] == null) { + continue; + } + pool.returnFile(writers[i]); + } + + for (int i = 0; i < MAX_ACTIVE; i++) { + writers[i] = pool.borrowFile(); + assertEquals("Number active", i + 1, pool.getNumActive()); + ((ARCWriter)writers[i]).write("http://one.two.three", "no-type", + "0.0.0.0", 1234567890, CONTENT.length(), baos); + } + for (int i = (MAX_ACTIVE - 1); i >= 0; i--) { + pool.returnFile(writers[i]); + assertEquals("Number active", i, pool.getNumActive()); + assertEquals("Number idle", MAX_ACTIVE - pool.getNumActive(), + pool.getNumIdle()); + } + pool.close(); + } + + private WriterPoolSettings getSettings(final boolean isCompressed) { + File [] files = {getTmpDir()}; + return new 
WriterPoolSettingsData( + PREFIX, + "${prefix}-${timestamp17}-${serialno}-${heritrix.hostname}", + ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE, + isCompressed, + Arrays.asList(files), + null); + } +} \ No newline at end of file diff --git a/src/test/java/org/archive/io/arc/ARCWriterTest.java b/src/test/java/org/archive/io/arc/ARCWriterTest.java new file mode 100644 index 00000000..f6e2bf6a --- /dev/null +++ b/src/test/java/org/archive/io/arc/ARCWriterTest.java @@ -0,0 +1,699 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.io.arc; + +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.PrintStream; +import java.util.Arrays; +import java.util.Date; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.io.input.NullInputStream; +import org.apache.commons.io.output.NullOutputStream; +import org.archive.io.ArchiveRecord; +import org.archive.io.ArchiveRecordHeader; +import org.archive.io.ReplayInputStream; +import org.archive.io.WriterPoolMember; +import org.archive.io.WriterPoolSettings; +import org.archive.util.ArchiveUtils; +import org.archive.util.TmpDirTestCase; + +import com.google.common.io.Closeables; + + +/** + * Test ARCWriter class. + * + * This code exercises ARCWriter AND ARCReader. First it writes ARCs w/ + * ARCWriter. Then it validates what was written w/ ARCReader. + * + * @author stack + */ +public class ARCWriterTest +extends TmpDirTestCase implements ARCConstants { + /** + * Utility class for writing bad ARCs (with trailing junk) + */ + public class CorruptibleARCWriter extends ARCWriter { + byte[] endJunk = null; + + public CorruptibleARCWriter(AtomicInteger serial_no, WriterPoolSettings settings) { + super(serial_no, settings); + } + + @Override + protected void postWriteRecordTasks() throws IOException { + if (endJunk != null) { + this.write(endJunk); + } + super.postWriteRecordTasks(); + } + + public void setEndJunk(byte[] b) throws IOException { + this.endJunk = b; + } + } + + /** + * Suffix to use for ARC files made by JUNIT. 
+ */ + private static final String SUFFIX = "JUNIT"; + + private static final String SOME_URL = "http://www.archive.org/test/"; + + + private static final AtomicInteger SERIAL_NO = new AtomicInteger(); + + /* + * @see TestCase#setUp() + */ + protected void setUp() throws Exception { + super.setUp(); + } + + /* + * @see TestCase#tearDown() + */ + protected void tearDown() throws Exception { + super.tearDown(); + } + + protected static String getContent() { + return getContent(null); + } + + protected static String getContent(String indexStr) { + String page = (indexStr != null)? "Page #" + indexStr: "Some Page"; + return "HTTP/1.1 200 OK\r\n" + + "Content-Type: text/html\r\n\r\n" + + "" + page + + "" + + "" + page + + ""; + } + + @SuppressWarnings("deprecation") + protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index) + throws IOException { + String indexStr = Integer.toString(index); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + // Start the record with an arbitrary 14-digit date per RFC2540 + String now = ArchiveUtils.get14DigitDate(); + int recordLength = 0; + byte[] record = (getContent(indexStr)).getBytes(); + recordLength += record.length; + baos.write(record); + // Add the newline between records back in + baos.write("\n".getBytes()); + recordLength += 1; + arcWriter.write("http://www.one.net/id=" + indexStr, "text/html", + "0.1.2.3", Long.parseLong(now), recordLength, baos); + return recordLength; + } + + private File writeRecords(String baseName, boolean compress, + long maxSize, int recordCount) + throws IOException { + cleanUpOldFiles(baseName); + File [] files = {getTmpDir()}; + ARCWriter arcWriter = + new ARCWriter( + SERIAL_NO, + new WriterPoolSettingsData( + baseName, + "${prefix}-"+SUFFIX, + maxSize, + compress, + Arrays.asList(files), + null)); + assertNotNull(arcWriter); + for (int i = 0; i < recordCount; i++) { + writeRandomHTTPRecord(arcWriter, i); + } + arcWriter.close(); + assertTrue("Doesn't exist: " + + 
arcWriter.getFile().getAbsolutePath(), + arcWriter.getFile().exists()); + return arcWriter.getFile(); + } + + private void validate(File arcFile, int recordCount) + throws FileNotFoundException, IOException { + ARCReader reader = ARCReaderFactory.get(arcFile); + assertNotNull(reader); + List metaDatas = null; + if (recordCount == -1) { + metaDatas = reader.validate(); + } else { + metaDatas = reader.validate(recordCount); + } + reader.close(); + // Now, run through each of the records doing absolute get going from + // the end to start. Reopen the arc so no context between this test + // and the previous. + + for (int i = metaDatas.size() - 1; i >= 0; i--) { + reader = ARCReaderFactory.get(arcFile); + ARCRecordMetaData meta = (ARCRecordMetaData)metaDatas.get(i); + ArchiveRecord r = reader.get(meta.getOffset()); + String mimeType = r.getHeader().getMimetype(); + assertTrue("Record is bogus", + mimeType != null && mimeType.length() > 0); + reader.close(); + } + assertEquals("Metadata count not as expected",recordCount, metaDatas.size()); + for (Iterator i = metaDatas.iterator(); i.hasNext();) { + ARCRecordMetaData r = (ARCRecordMetaData)i.next(); + assertTrue("Record is empty", r.getLength() > 0); + } + } + + public void testCheckARCFileSize() + throws IOException { + runCheckARCFileSizeTest("checkARCFileSize", false); + } + + public void testCheckARCFileSizeCompressed() + throws IOException { + runCheckARCFileSizeTest("checkARCFileSize", true); + } + + public void testWriteRecord() throws IOException { + final int recordCount = 2; + File arcFile = writeRecords("writeRecord", false, + DEFAULT_MAX_ARC_FILE_SIZE, recordCount); + validate(arcFile, recordCount + 1); // Header record. + } + + public void testRandomAccess() throws IOException { + final int recordCount = 3; + File arcFile = writeRecords("writeRecord", true, + DEFAULT_MAX_ARC_FILE_SIZE, recordCount); + ARCReader reader = ARCReaderFactory.get(arcFile); + // Get to second record. Get its offset for later use. 
+ boolean readFirst = false; + String url = null; + long offset = -1; + long totalRecords = 0; + boolean readSecond = false; + for (final Iterator i = reader.iterator(); i.hasNext(); totalRecords++) { + ARCRecord ar = (ARCRecord)i.next(); + if (!readFirst) { + readFirst = true; + continue; + } + if (!readSecond) { + url = ar.getMetaData().getUrl(); + offset = ar.getMetaData().getOffset(); + readSecond = true; + } + } + reader.close(); + + reader = ARCReaderFactory.get(arcFile, offset); + ArchiveRecord ar = reader.get(); + assertEquals(ar.getHeader().getUrl(), url); + ar.close(); + reader.close(); + + // Get reader again. See how iterator works with offset + reader = ARCReaderFactory.get(arcFile, offset); + int count = 0; + for (final Iterator i = reader.iterator(); i.hasNext(); i.next()) { + count++; + } + reader.close(); + assertEquals(totalRecords - 1, count); + } + + public void testWriteRecordCompressed() throws IOException { + final int recordCount = 2; + File arcFile = writeRecords("writeRecordCompressed", true, + DEFAULT_MAX_ARC_FILE_SIZE, recordCount); + validate(arcFile, recordCount + 1 /*Header record*/); + } + + public void testWriteGiantRecord() throws IOException { + PrintStream dummyStream = new PrintStream(new NullOutputStream()); + ARCWriter arcWriter = + new ARCWriter( + SERIAL_NO, + dummyStream, + new File("dummy"), + new WriterPoolSettingsData( + "", + "", + -1, + false, + null, + null)); + assertNotNull(arcWriter); + + // Start the record with an arbitrary 14-digit date per RFC2540 + long now = System.currentTimeMillis(); + long recordLength = org.apache.commons.io.FileUtils.ONE_GB * 3; + + arcWriter.write("dummy:uri", "application/octet-stream", + "0.1.2.3", now, recordLength, new NullInputStream(recordLength)); + arcWriter.close(); + } + + private void runCheckARCFileSizeTest(String baseName, boolean compress) + throws FileNotFoundException, IOException { + File f = writeRecords(baseName, compress, 1024, 15); + validate(f, 15+1); + } + + 
protected CorruptibleARCWriter createARCWriter(String name, boolean compress) { + File [] files = {getTmpDir()}; + return new CorruptibleARCWriter( + SERIAL_NO, + new WriterPoolSettingsData( + name, + "${prefix}-"+SUFFIX, + DEFAULT_MAX_ARC_FILE_SIZE, + compress, + Arrays.asList(files), + null)); + } + + protected static ByteArrayInputStream getBais(String str) + throws IOException { + return new ByteArrayInputStream(str.getBytes()); + } + + /** + * Writes a record, suppressing normal length-checks (so that + * intentionally malformed records may be written). + */ + protected static void writeRecord(ARCWriter writer, String url, + String type, int len, ByteArrayInputStream bais) + throws IOException { + writer.write(url, type, "192.168.1.1", (new Date()).getTime(), len, + bais, false); + } + + /** + * Iterates over every record of the reader, closing each one, and + * asserts that all records after the first have a URL starting with + * SOME_URL. + * @return Number of records iterated. + */ + protected int iterateRecords(ARCReader r) + throws IOException { + int count = 0; + for (Iterator i = r.iterator(); i.hasNext();) { + ARCRecord rec = (ARCRecord)i.next(); + rec.close(); + if (count != 0) { + assertTrue("Unexpected URL " + rec.getMetaData().getUrl(), + rec.getMetaData().getUrl().startsWith(SOME_URL)); + } + count++; + } + return count; + } + + protected CorruptibleARCWriter createArcWithOneRecord(String name, + boolean compressed) + throws IOException { + CorruptibleARCWriter writer = createARCWriter(name, compressed); + String content = getContent(); + writeRecord(writer, SOME_URL, "text/html", + content.length(), getBais(content)); + return writer; + } + + /** + * A space inside the record URL must make the write fail with an + * IOException whose message starts with "Metadata line doesn't match". 
+ */ + public void testSpaceInURL() { + String eMessage = null; + try { + holeyUrl("testSpaceInURL", false, " "); + } catch (IOException e) { + eMessage = e.getMessage(); + } + // Null check: when no exception was thrown, fail with the assertion + // message rather than with a NullPointerException. + assertTrue("Didn't get expected exception: " + eMessage, + eMessage != null + && eMessage.startsWith("Metadata line doesn't match")); + } + + public void testTabInURL() { + String eMessage = null; + try { + holeyUrl("testTabInURL", false, "\t"); + } catch (IOException e) { + eMessage = e.getMessage(); + } + assertTrue("Didn't get expected exception: " +
eMessage, + eMessage != null + && eMessage.startsWith("Metadata line doesn't match")); + } + + /** + * Creates an ARC with one record, then writes a second record whose URL + * is SOME_URL + urlInsert + "/index.html". The writer is closed in a + * finally block. + */ + protected void holeyUrl(String name, boolean compress, String urlInsert) + throws IOException { + ARCWriter writer = null; + try { + writer = createArcWithOneRecord(name, compress); + // Write a second record whose URL contains the given insert. + String content = getContent(); + writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html", + content.length(), getBais(content)); + } finally { + Closeables.close(writer, true); + } + } + +// If uncompressed, length has to be right or parse will fail. +// +// public void testLengthTooShort() throws IOException { +// lengthTooShort("testLengthTooShort-" + PREFIX, false); +// } + + public void testLengthTooShortCompressed() throws IOException { + lengthTooShort("testLengthTooShortCompressed", true, false); + } + + /** + * In strict mode, reading the ARC with trailing junk must raise a + * RuntimeException whose message starts with + * "java.io.IOException: Record STARTING at". + */ + public void testLengthTooShortCompressedStrict() + throws IOException { + String eMessage = null; + try { + lengthTooShort("testLengthTooShortCompressedStrict", + true, true); + } catch (RuntimeException e) { + eMessage = e.getMessage(); + } + // Null check: when no exception was thrown, fail with the assertion + // message rather than with a NullPointerException. + assertTrue("Didn't get expected exception: " + eMessage, + eMessage != null + && eMessage.startsWith("java.io.IOException: Record STARTING at")); + } + + protected void lengthTooShort(String name, boolean compress, boolean strict) + throws IOException { + CorruptibleARCWriter writer = null; + try { + writer = createArcWithOneRecord(name, compress); + // Add some bytes on the end to mess up the record. 
+ String content = getContent(); + ByteArrayInputStream bais = getBais(content+"SOME TRAILING BYTES"); + writeRecord(writer, SOME_URL, "text/html", + content.length(), bais); + writer.setEndJunk("SOME TRAILING BYTES".getBytes()); + writeRecord(writer, SOME_URL, "text/html", + content.length(), getBais(content)); + } finally { + Closeables.close(writer, true); + } + + // Catch System.err into a byte stream.
+ ByteArrayOutputStream os = new ByteArrayOutputStream(); + PrintStream origErr = System.err; + ARCReader r = null; + try { + System.setErr(new PrintStream(os)); + + r = ARCReaderFactory.get(writer.getFile()); + r.setStrict(strict); + int count = iterateRecords(r); + assertTrue("Count wrong " + count, count == 4); + + // Make sure we get the warning string which complains about the + // trailing bytes. + String err = os.toString(); + assertTrue("No message " + err, err.startsWith("WARNING") && + (err.indexOf("Record STARTING at") > 0)); + r.close(); + } finally { + Closeables.close(r, true); + System.setErr(origErr); + } + } + +// If uncompressed, length has to be right or parse will fail. +// +// public void testLengthTooLong() +// throws IOException { +// lengthTooLong("testLengthTooLongCompressed-" + PREFIX, +// false, false); +// } + + public void testLengthTooLongCompressed() + throws IOException { + lengthTooLong("testLengthTooLongCompressed", + true, false); + } + + /** + * In strict mode, a record whose declared length exceeds its content + * must raise an IOException whose message starts with + * "Premature EOF before end-of-record". + */ + public void testLengthTooLongCompressedStrict() { + String eMessage = null; + try { + lengthTooLong("testLengthTooLongCompressed", + true, true); + } catch (IOException e) { + eMessage = e.getMessage(); + } + // Null check: when no exception was thrown, fail with the assertion + // message rather than with a NullPointerException. + assertTrue("Didn't get expected exception: " + eMessage, + eMessage != null + && eMessage.startsWith("Premature EOF before end-of-record")); + } + + protected void lengthTooLong(String name, boolean compress, + boolean strict) + throws IOException { + ARCWriter writer = createArcWithOneRecord(name, compress); + // Add a record with a length that is too long. 
+ String content = getContent(); + writeRecord(writer, SOME_URL+"2", "text/html", + content.length() + 10, getBais(content)); + writeRecord(writer, SOME_URL+"3", "text/html", + content.length(), getBais(content)); + writer.close(); + + // Catch System.err.
+ ByteArrayOutputStream os = new ByteArrayOutputStream(); + + PrintStream origErr = System.err; + ARCReader r = null; + try { + System.setErr(new PrintStream(os)); + + r = ARCReaderFactory.get(writer.getFile()); + r.setStrict(strict); + int count = iterateRecords(r); + assertTrue("Count wrong " + count, count == 4); + + // Make sure we get the warning string which complains about the + // trailing bytes. + String err = os.toString(); + assertTrue("No message " + err, + err.startsWith("WARNING Premature EOF before end-of-record")); + } finally { + Closeables.close(r, true); + System.setErr(origErr); + } + } + + public void testGapError() throws IOException { + ARCWriter writer = createArcWithOneRecord("testGapError", true); + String content = getContent(); + // Make a 'weird' RIS that returns bad 'remaining' length + // awhen remaining should be 0 + ReplayInputStream ris = new ReplayInputStream(content.getBytes(), + content.length(), null) { + public long remaining() { + return (super.remaining()==0) ? -1 : super.remaining(); + } + }; + String message = null; + try { + writer.write(SOME_URL, "text/html", "192.168.1.1", + (new Date()).getTime(), content.length(), ris); + } catch (IOException e) { + message = e.getMessage(); + } finally { + IOUtils.closeQuietly(ris); + } + writer.close(); + assertTrue("No gap when should be", + message != null && + message.indexOf("Gap between expected and actual") >= 0); + } + + /** + * Write an arc file for other tests to use. + * @param arcdir Directory to write to. + * @param compress True if file should be compressed. + * @return ARC written. 
+ * @throws IOException + */ + public static File createARCFile(File arcdir, boolean compress) + throws IOException { + File [] files = {arcdir}; + ARCWriter writer = new ARCWriter(SERIAL_NO, + new WriterPoolSettingsData( + "", + "test", + DEFAULT_MAX_ARC_FILE_SIZE, + compress, + Arrays.asList(files), + null)); + String content = getContent(); + writeRecord(writer, SOME_URL, "text/html", content.length(), + getBais(content)); + writer.close(); + return writer.getFile(); + } + +// public void testSpeed() throws IOException { +// ARCWriter writer = createArcWithOneRecord("speed", true); +// // Add a record with a length that is too long. +// String content = getContent(); +// final int count = 100000; +// logger.info("Starting speed write of " + count + " records."); +// for (int i = 0; i < count; i++) { +// writeRecord(writer, SOME_URL, "text/html", content.length(), +// getBaos(content)); +// } +// writer.close(); +// logger.info("Finished speed write test."); +// } + + + public void testValidateMetaLine() throws Exception { + final String line = "http://www.aandw.net/images/walden2.png " + + "128.197.34.86 20060111174224 image/png 2160"; + ARCWriter w = createARCWriter("testValidateMetaLine", true); + try { + w.validateMetaLine(line); + w.validateMetaLine(line + LINE_SEPARATOR); + w.validateMetaLine(line + "\\r\\n"); + } finally { + w.close(); + } + } + + public void testArcRecordOffsetReads() throws Exception { + ARCReader r = getSingleRecordReader("testArcRecordInBufferStream"); + ARCRecord ar = getSingleRecord(r); + // Now try getting some random set of bytes out of it + // at an odd offset (used to fail because we were + // doing bad math to find where in buffer to read). 
+ final byte[] buffer = new byte[17]; + final int maxRead = 4; + int totalRead = 0; + while (totalRead < maxRead) { + totalRead = totalRead + + ar.read(buffer, 13 + totalRead, maxRead - totalRead); + assertTrue(totalRead > 0); + } + r.close(); + } + + // available should always be >= 0; extra read()s should all give EOF + public void testArchiveRecordAvailableConsistent() throws Exception { + // first test reading byte-at-a-time via no-param read() + ARCReader r = getSingleRecordReader("testArchiveRecordAvailableConsistent"); + ARCRecord record = getSingleRecord(r); + int c = record.read(); + while(c>=0) { + c = record.read(); + } + // consecutive reads after EOR should always give -1, still show zero available() + for (int i=0; i<5; i++) { + assertTrue("available negative:"+record.available(), record.available()>=0); + assertEquals(-1, record.read()); + } + r.close(); + } + + // should always give -1 on repeated reads past EOR + public void testArchiveRecordEORConsistent() throws Exception { + ARCReader r = getSingleRecordReader("testArchiveRecordEORConsistent"); + ARCRecord record = getSingleRecord(r); + this.readToEOS(record); + // consecutive reads after EOR should always give -1 + for (int i=0; i<5; i++) { + assertEquals(-1, record.read(new byte[1])); + } + r.close(); + } + + // should not throw premature EOF when wrapped with BufferedInputStream + // [HER-1450] showed this was the case using Apache Tika + public void testArchiveRecordMarkSupport() throws Exception { + ARCReader r = getSingleRecordReader("testArchiveRecordMarkSupport"); + ARCRecord record = getSingleRecord(r); + record.setStrict(true); + // ensure mark support + InputStream stream = new BufferedInputStream(record); + if (stream.markSupported()) { + for (int i=0; i<3; i++) { + this.readToEOS(stream); + stream.mark(stream.available()); + stream.reset(); + } + stream.close(); + } + r.close(); + } + + /** + * Test a particular style of using the reader iterator. 
(Should + * possibly be on a reader-centric test class, but the best setup + * functionality is here.) + * + * @throws IOException + */ + public void testReadIterator() throws IOException { + final int recordCount = 3; + File arcFile = writeRecords("writeRecord", true, + DEFAULT_MAX_ARC_FILE_SIZE, recordCount); + ARCReader reader = ARCReaderFactory.get(arcFile); + Iterator it = reader.iterator(); + while (it.hasNext()) { + ArchiveRecord next = it.next(); + next.close(); + } + reader.close(); + } + + protected void readToEOS(InputStream in) throws Exception { + byte [] buf = new byte[1024]; + int read = 0; + while (read >= 0) { + read = in.read(buf); + // System.out.println("readToEOS read " + read + " bytes"); + } + } + + protected ARCReader getSingleRecordReader(String name) throws Exception { + // Get an ARC with one record. + WriterPoolMember w = createArcWithOneRecord(name, true); + w.close(); + // Get reader on said ARC. + ARCReader r = ARCReaderFactory.get(w.getFile()); + return r; + } + + protected ARCRecord getSingleRecord(ARCReader r) { + final Iterator i = r.iterator(); + // Skip first ARC meta record. + i.next(); + i.hasNext(); + // Now we're at first and only record in ARC. + return (ARCRecord) i.next(); + } +} diff --git a/src/test/java/org/archive/io/warc/WARCWriterTest.java b/src/test/java/org/archive/io/warc/WARCWriterTest.java new file mode 100644 index 00000000..35c68714 --- /dev/null +++ b/src/test/java/org/archive/io/warc/WARCWriterTest.java @@ -0,0 +1,512 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.warc; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.URI; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +import org.archive.io.ArchiveRecord; +import org.archive.io.ArchiveRecordHeader; +import org.archive.io.UTF8Bytes; +import org.archive.io.WriterPoolMember; +import org.archive.uid.RecordIDGenerator; +import org.archive.uid.UUIDGenerator; +import org.archive.util.ArchiveUtils; +import org.archive.util.TmpDirTestCase; +import org.archive.util.anvl.ANVLRecord; + +/** + * Test Writer and Reader. + * @author stack + * @version $Date: 2006-08-29 19:35:48 -0700 (Tue, 29 Aug 2006) $ $Version$ + */ +public class WARCWriterTest +extends TmpDirTestCase implements WARCConstants { + + private static final AtomicInteger SERIAL_NO = new AtomicInteger(); + + RecordIDGenerator generator = new UUIDGenerator(); + + /** + * Prefix to use for ARC files made by JUNIT. 
+ */ + private static final String SUFFIX = "JUNIT"; + + private static final String SOME_URL = "http://www.archive.org/test/"; + + @SuppressWarnings("unchecked") + public void testCheckHeaderLineValue() throws Exception { + WARCWriter writer = new WARCWriter( + SERIAL_NO, + new WARCWriterPoolSettingsData( + "","test",1,false,Collections.EMPTY_LIST,Collections.EMPTY_LIST,generator)); + writer.checkHeaderValue("one"); + IllegalArgumentException exception = null; + try { + writer.checkHeaderValue("with space"); + } catch(IllegalArgumentException e) { + exception = e; + } + assertNotNull(exception); + exception = null; + try { + writer.checkHeaderValue("with\0x0000controlcharacter"); + } catch(IllegalArgumentException e) { + exception = e; + } + writer.close(); + assertNotNull(exception); + } + + /** + * Checks that checkHeaderLineMimetypeParameter accepts plain mimetypes, + * returns a mimetype with a charset parameter unchanged, and returns a + * folded (CRLF-continued) parameter on a single line. + */ + @SuppressWarnings("unchecked") + public void testMimetypes() throws IOException { + WARCWriter writer = new WARCWriter(SERIAL_NO, + new WARCWriterPoolSettingsData( + "m","testM",1,false,Collections.EMPTY_LIST,Collections.EMPTY_LIST,generator)); + try { + writer.checkHeaderLineMimetypeParameter("text/xml"); + writer.checkHeaderLineMimetypeParameter("text/xml+rdf"); + // Expected value first, so that failure messages read correctly. + assertEquals("text/plain; charset=SHIFT-JIS", + writer.checkHeaderLineMimetypeParameter( + "text/plain; charset=SHIFT-JIS")); + assertEquals("multipart/mixed; boundary=\"simple boundary\"", + writer.checkHeaderLineMimetypeParameter( + "multipart/mixed; \r\n boundary=\"simple boundary\"")); + } finally { + // Release the writer even when an assertion fails. + writer.close(); + } + } + + public void testWriteRecord() throws IOException { + File [] files = {getTmpDir()}; + + // Write uncompressed. 
+ WARCWriter writer = + new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData( + this.getClass().getName(), "templateWR1", -1, false, Arrays.asList(files), null, generator)); + + writeFile(writer); + writer.close(); + + // Write compressed.
+ writer = new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData( + this.getClass().getName(), "templateWR2", -1, true, Arrays.asList(files), null, generator)); + + writeFile(writer); + writer.close(); + } + + private void writeFile(final WARCWriter writer) + throws IOException { + try { + writeWarcinfoRecord(writer); + writeBasicRecords(writer); + } finally { + writer.close(); + writer.getFile().delete(); + } + } + + private void writeWarcinfoRecord(WARCWriter writer) + throws IOException { + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.warcinfo); + recordInfo.setUrl(null); + recordInfo.setCreate14DigitDate(ArchiveUtils.getLog14Date()); + recordInfo.setMimetype(ANVLRecord.MIMETYPE); + recordInfo.setExtraHeaders(null); + recordInfo.setEnforceLength(true); + + ANVLRecord meta = new ANVLRecord(); + meta.addLabelValue("size", "1G"); + meta.addLabelValue("operator", "igor"); + byte [] bytes = meta.getUTF8Bytes(); + recordInfo.setContentStream(new ByteArrayInputStream(bytes)); + recordInfo.setContentLength((long) bytes.length); + + final URI recordid = writer.generateRecordId(WARCWriter.TYPE, WARCRecordType.warcinfo.toString()); + recordInfo.setRecordId(recordid); + + writer.writeRecord(recordInfo); + } + + protected void writeBasicRecords(final WARCWriter writer) + throws IOException { + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.metadata); + recordInfo.setUrl("http://www.archive.org/"); + recordInfo.setCreate14DigitDate(ArchiveUtils.get14DigitDate()); + recordInfo.setMimetype("no/type"); + recordInfo.setEnforceLength(true); + + ANVLRecord headerFields = new ANVLRecord(); + headerFields.addLabelValue("x", "y"); + headerFields.addLabelValue("a", "b"); + recordInfo.setExtraHeaders(headerFields); + + URI rid = (new UUIDGenerator()).getQualifiedRecordID(TYPE, WARCRecordType.metadata.toString()); + recordInfo.setRecordId(rid); + + final String content = "Any old content."; + for (int i = 
0; i < 10; i++) { + String body = i + ". " + content; + byte [] bodyBytes = body.getBytes(UTF8Bytes.UTF8); + recordInfo.setContentStream(new ByteArrayInputStream(bodyBytes)); + recordInfo.setContentLength((long)bodyBytes.length); + writer.writeRecord(recordInfo); + } + } + + /** + * @return Generic HTML Content. + */ + protected static String getContent() { + return getContent(null); + } + + /** + * @return Generic HTML Content with mention of passed indexStr + * in title and body. + */ + protected static String getContent(String indexStr) { + String page = (indexStr != null)? "Page #" + indexStr: "Some Page"; + return "HTTP/1.1 200 OK\r\n" + + "Content-Type: text/html\r\n\r\n" + + "" + page + + "" + + "" + page + + ""; + } + + /** + * Write random HTML Record. + * @param w Where to write. + * @param index An index to put into content. + * @return Length of record written. + * @throws IOException + */ + protected int writeRandomHTTPRecord(WARCWriter w, int index) + throws IOException { + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.resource); + recordInfo.setCreate14DigitDate(ArchiveUtils.get14DigitDate()); + recordInfo.setMimetype("text/html; charset=UTF-8"); + recordInfo.setRecordId(w.generateRecordId(null)); + recordInfo.setEnforceLength(true); + + String indexStr = Integer.toString(index); + recordInfo.setUrl("http://www.one.net/id=" + indexStr); + + byte[] record = (getContent(indexStr)).getBytes(); + recordInfo.setContentLength((long) record.length); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + baos.write(record); + recordInfo.setContentStream(new ByteArrayInputStream(baos.toByteArray())); + + // Add named fields for ip, checksum, and relate the metadata + // and request to the resource field. + recordInfo.addExtraHeader(NAMED_FIELD_IP_LABEL, "127.0.0.1"); + + w.writeRecord(recordInfo); + return record.length; + } + + /** + * Fill a WARC with HTML Records. + * @param baseName WARC basename. 
+ * @param compress Whether to compress or not. + * @param maxSize Maximum WARC size. + * @param recordCount How many records. + * @return The written file. + * @throws IOException + */ + private File writeRecords(String baseName, boolean compress, + int maxSize, int recordCount) + throws IOException { + cleanUpOldFiles(baseName); + File [] files = {getTmpDir()}; + WARCWriter w = new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData( + baseName + '-' + SUFFIX, "${prefix}", maxSize, compress, Arrays.asList(files), null, generator)); + + assertNotNull(w); + for (int i = 0; i < recordCount; i++) { + writeRandomHTTPRecord(w, i); + } + w.close(); + assertTrue("Doesn't exist: " + w.getFile().getAbsolutePath(), + w.getFile().exists()); + return w.getFile(); + } + + /** + * Run validation of passed file. + * @param f File to validate. + * @param recordCount Expected count of records. + * @throws FileNotFoundException + * @throws IOException + */ + private void validate(File f, int recordCount) + throws FileNotFoundException, IOException { + WARCReader reader = WARCReaderFactory.get(f); + assertNotNull(reader); + List headers = null; + if (recordCount == -1) { + headers = reader.validate(); + } else { + headers = reader.validate(recordCount); + } + reader.close(); + + // Now, run through each of the records doing absolute get going from + // the end to start. Reopen the arc so no context between this test + // and the previous. 
+ + for (int i = headers.size() - 1; i >= 0; i--) { + reader = WARCReaderFactory.get(f); + ArchiveRecordHeader h = (ArchiveRecordHeader)headers.get(i); + ArchiveRecord r = reader.get(h.getOffset()); + String mimeType = r.getHeader().getMimetype(); + assertTrue("Record is bogus", + mimeType != null && mimeType.length() > 0); + reader.close(); + } + + assertTrue("Metadatas not equal", headers.size() == recordCount); + for (Iterator i = headers.iterator(); i.hasNext();) { + ArchiveRecordHeader r = (ArchiveRecordHeader)i.next(); + assertTrue("Record is empty", r.getLength() > 0); + } + } + + public void testWriteRecords() throws IOException { + final int recordCount = 2; + File f = writeRecords("writeRecords", false, DEFAULT_MAX_WARC_FILE_SIZE, + recordCount); + validate(f, recordCount + 1); // Header record. + } + + public void testRandomAccess() throws IOException { + final int recordCount = 3; + File f = writeRecords("randomAccess", true, DEFAULT_MAX_WARC_FILE_SIZE, + recordCount); + WARCReader reader = WARCReaderFactory.get(f); + // Get to second record. Get its offset for later use. + boolean readFirst = false; + String url = null; + long offset = -1; + long totalRecords = 0; + boolean readSecond = false; + for (final Iterator i = reader.iterator(); i.hasNext(); + totalRecords++) { + WARCRecord ar = (WARCRecord)i.next(); + if (!readFirst) { + readFirst = true; + continue; + } + if (!readSecond) { + url = ar.getHeader().getUrl(); + offset = ar.getHeader().getOffset(); + readSecond = true; + } + } + reader.close(); + + reader = WARCReaderFactory.get(f, offset); + ArchiveRecord ar = reader.get(); + assertEquals(ar.getHeader().getUrl(), url); + ar.close(); + reader.close(); + + // Get reader again. 
See how iterator works with offset + reader = WARCReaderFactory.get(f, offset); + int count = 0; + for (final Iterator i = reader.iterator(); i.hasNext(); i.next()) { + count++; + } + reader.close(); + assertEquals(totalRecords - 1, count); + } + + public void testWriteRecordCompressed() throws IOException { + final int recordCount = 2; + File arcFile = writeRecords("writeRecordCompressed", true, + DEFAULT_MAX_WARC_FILE_SIZE, recordCount); + validate(arcFile, recordCount + 1 /*Header record*/); + } + + protected WARCWriter createWARCWriter(String name, + boolean compress) { + File [] files = {getTmpDir()}; + return new WARCWriter(SERIAL_NO, + new WARCWriterPoolSettingsData( + name, + "${prefix}-"+SUFFIX, + DEFAULT_MAX_WARC_FILE_SIZE, + compress, + Arrays.asList(files), + null, + generator)); + } + + protected static ByteArrayOutputStream getBaos(String str) + throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + baos.write(str.getBytes()); + return baos; + } + + protected static void writeRecord(WARCWriter w, String url, + String mimetype, int len, ByteArrayOutputStream baos) + throws IOException { + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.resource); + recordInfo.setUrl(url); + recordInfo.setCreate14DigitDate(ArchiveUtils.get14DigitDate()); + recordInfo.setMimetype(mimetype); + recordInfo.setRecordId(w.generateRecordId(null)); + recordInfo.setExtraHeaders(null); + recordInfo.setContentStream(new ByteArrayInputStream(baos.toByteArray())); + recordInfo.setContentLength((long) len); + recordInfo.setEnforceLength(true); + + w.writeRecord(recordInfo); + } + + protected int iterateRecords(WARCReader r) + throws IOException { + int count = 0; + for (Iterator i = r.iterator(); i.hasNext();) { + ArchiveRecord ar = i.next(); + ar.close(); + if (count != 0) { + assertTrue("Unexpected URL " + ar.getHeader().getUrl(), + ar.getHeader().getUrl().equals(SOME_URL)); + } + count++; + } + return count; + } + + 
protected WARCWriter createWithOneRecord(String name, + boolean compressed) + throws IOException { + WARCWriter writer = createWARCWriter(name, compressed); + String content = getContent(); + writeRecord(writer, SOME_URL, "text/html", + content.length(), getBaos(content)); + return writer; + } + + public void testSpaceInURL() throws IOException { + long bytesWritten = holeyUrl("testSpaceInURL", false, " "); + assertEquals("Unexpected successful writing occurred",0,bytesWritten); + } + + public void testTabInURL() throws IOException { + long bytesWritten = holeyUrl("testTabInURL", false, "\t"); + assertEquals("Unexpected successful writing occurred",0,bytesWritten); + } + + protected long holeyUrl(String name, boolean compress, String urlInsert) + throws IOException { + WARCWriter writer = createWithOneRecord(name, compress); + // Add some bytes on the end to mess up the record. + long startPos = writer.getPosition(); + String content = getContent(); + ByteArrayOutputStream baos = getBaos(content); + writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html", + content.length(), baos); + long endPos = writer.getPosition(); + writer.close(); + return endPos-startPos; + } + + /** + * Write an arc file for other tests to use. + * @param arcdir Directory to write to. + * @param compress True if file should be compressed. + * @return ARC written. 
+ * @throws IOException + */ + public static File createWARCFile(File arcdir, boolean compress) + throws IOException { + File [] files = {arcdir}; + WARCWriter writer = + new WARCWriter(SERIAL_NO, + new WARCWriterPoolSettingsData( + "", + "test", + DEFAULT_MAX_WARC_FILE_SIZE, + compress, + Arrays.asList(files), + null, + new UUIDGenerator())); + String content = getContent(); + writeRecord(writer, SOME_URL, "text/html", content.length(), + getBaos(content)); + writer.close(); + return writer.getFile(); + } + +// public void testSpeed() throws IOException { +// ARCWriter writer = createArcWithOneRecord("speed", true); +// // Add a record with a length that is too long. +// String content = getContent(); +// final int count = 100000; +// logger.info("Starting speed write of " + count + " records."); +// for (int i = 0; i < count; i++) { +// writeRecord(writer, SOME_URL, "text/html", content.length(), +// getBaos(content)); +// } +// writer.close(); +// logger.info("Finished speed write test."); +// } + + public void testArcRecordOffsetReads() throws Exception { + // Get an ARC with one record. + WriterPoolMember w = + createWithOneRecord("testArcRecordInBufferStream", true); + w.close(); + // Get reader on said ARC. + WARCReader r = WARCReaderFactory.get(w.getFile()); + final Iterator i = r.iterator(); + // Skip first ARC meta record. + ArchiveRecord ar = i.next(); + i.hasNext(); + // Now we're at first and only record in ARC. + ar = (WARCRecord) i.next(); + // Now try getting some random set of bytes out of it + // at an odd offset (used to fail because we were + // doing bad math to find where in buffer to read). 
+ final byte[] buffer = new byte[17]; + final int maxRead = 4; + int totalRead = 0; + while (totalRead < maxRead) { + totalRead = totalRead + + ar.read(buffer, 13 + totalRead, maxRead - totalRead); + assertTrue(totalRead > 0); + } + } +} \ No newline at end of file diff --git a/src/test/java/org/archive/uid/UUIDGeneratorTest.java b/src/test/java/org/archive/uid/UUIDGeneratorTest.java new file mode 100644 index 00000000..79e98fb6 --- /dev/null +++ b/src/test/java/org/archive/uid/UUIDGeneratorTest.java @@ -0,0 +1,44 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.uid; + +import java.net.URI; +import java.net.URISyntaxException; +import java.util.HashMap; +import java.util.Map; + +import junit.framework.TestCase; + +/** + * @author stack + * @version $Revision$ $Date$ + */ +public class UUIDGeneratorTest extends TestCase { + public void testQualifyRecordID() throws URISyntaxException { + RecordIDGenerator g = new UUIDGenerator(); + URI uri = g.getRecordID(); + Map qualifiers = new HashMap(); + qualifiers.put("a", "b"); + URI nuURI = g.qualifyRecordID(uri, qualifiers); + assertNotSame(uri, nuURI); + qualifiers.put("c", "d"); + nuURI = g.qualifyRecordID(nuURI, qualifiers); + assertNotSame(uri, nuURI); + } +} diff --git a/src/test/java/org/archive/util/FileUtilsTest.java b/src/test/java/org/archive/util/FileUtilsTest.java new file mode 100644 index 00000000..19271435 --- /dev/null +++ b/src/test/java/org/archive/util/FileUtilsTest.java @@ -0,0 +1,271 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.util; + +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.math.LongRange; + + +/** + * FileUtils tests. 
+ * + * @contributor stack + * @contributor gojomo + * @version $Date$, $Revision$ + */ +public class FileUtilsTest extends TmpDirTestCase { + private String srcDirName = FileUtilsTest.class.getName() + ".srcdir"; + private File srcDirFile = null; + private String tgtDirName = FileUtilsTest.class.getName() + ".tgtdir"; + private File tgtDirFile = null; + + protected File zeroLengthLinesUnix; + protected File zeroLengthLinesWindows; + + protected File smallLinesUnix; + protected File smallLinesWindows; + protected File largeLinesUnix; + protected File largeLinesWindows; + protected File nakedLastLineUnix; + protected File nakedLastLineWindows; + + + protected void setUp() throws Exception { + super.setUp(); + this.srcDirFile = new File(getTmpDir(), srcDirName); + FileUtils.ensureWriteableDirectory(srcDirFile); + this.tgtDirFile = new File(getTmpDir(), tgtDirName); + FileUtils.ensureWriteableDirectory(tgtDirFile); + addFiles(); + + zeroLengthLinesUnix = setUpLinesFile("zeroLengthLinesUnix",0,0,400,IOUtils.LINE_SEPARATOR_UNIX); + zeroLengthLinesWindows = setUpLinesFile("zeroLengthLinesUnix",0,0,400,IOUtils.LINE_SEPARATOR_WINDOWS); + + smallLinesUnix = setUpLinesFile("smallLinesUnix", 0, 25, 400, IOUtils.LINE_SEPARATOR_UNIX); + smallLinesWindows = setUpLinesFile("smallLinesWindows", 0, 25, 400, IOUtils.LINE_SEPARATOR_WINDOWS); + largeLinesUnix = setUpLinesFile("largeLinesUnix", 128, 256, 5, IOUtils.LINE_SEPARATOR_UNIX); + largeLinesWindows = setUpLinesFile("largeLinesWindows", 128, 256, 4096, IOUtils.LINE_SEPARATOR_WINDOWS); + + nakedLastLineUnix = setUpLinesFile("nakedLastLineUnix", 0, 50, 401, IOUtils.LINE_SEPARATOR_UNIX); + org.apache.commons.io.FileUtils.writeStringToFile(nakedLastLineUnix,"a"); + nakedLastLineWindows = setUpLinesFile("nakedLastLineWindows", 0, 50, 401, IOUtils.LINE_SEPARATOR_WINDOWS); + org.apache.commons.io.FileUtils.writeStringToFile(nakedLastLineWindows,"a"); + } + + private void addFiles() throws IOException { + addFiles(3, this.getName()); + 
} + + private void addFiles(final int howMany, final String baseName) + throws IOException { + for (int i = 0; i < howMany; i++) { + File.createTempFile(baseName, null, this.srcDirFile); + } + } + + private File setUpLinesFile(String name, int minLineSize, int maxLineSize, int lineCount, String lineEnding) throws IOException { + List lines = new LinkedList(); + StringBuilder sb = new StringBuilder(maxLineSize); + for(int i = 0; i< lineSize; j++) { + sb.append("-"); + } + lines.add(sb.toString()); + } + File file = File.createTempFile(name, null); + org.apache.commons.io.FileUtils.writeLines(file, lines, lineEnding); + return file; + + } + + protected void tearDown() throws Exception { + super.tearDown(); + org.apache.commons.io.FileUtils.deleteQuietly(this.srcDirFile); + org.apache.commons.io.FileUtils.deleteQuietly(this.tgtDirFile); + org.apache.commons.io.FileUtils.deleteQuietly(zeroLengthLinesUnix); + org.apache.commons.io.FileUtils.deleteQuietly(zeroLengthLinesWindows); + org.apache.commons.io.FileUtils.deleteQuietly(smallLinesUnix); + org.apache.commons.io.FileUtils.deleteQuietly(smallLinesWindows); + org.apache.commons.io.FileUtils.deleteQuietly(largeLinesUnix); + org.apache.commons.io.FileUtils.deleteQuietly(largeLinesWindows); + org.apache.commons.io.FileUtils.deleteQuietly(nakedLastLineUnix); + org.apache.commons.io.FileUtils.deleteQuietly(nakedLastLineWindows); + + } + + public void testCopyFile() { + // Test exception copying nonexistent file. 
+ File [] srcFiles = this.srcDirFile.listFiles(); + srcFiles[0].delete(); + IOException e = null; + try { + FileUtils.copyFile(srcFiles[0], + new File(this.tgtDirFile, srcFiles[0].getName())); + } catch (IOException ioe) { + e = ioe; + } + assertNotNull("Didn't get expected IOE", e); + } + + public void testTailLinesZeroLengthUnix() throws IOException { + verifyTailLines(zeroLengthLinesUnix); + } + + public void testTailLinesZeroLengthWindows() throws IOException { + verifyTailLines(zeroLengthLinesWindows); + } + + public void testTailLinesSmallUnix() throws IOException { + verifyTailLines(smallLinesUnix); + } + + public void testTailLinesLargeUnix() throws IOException { + verifyTailLines(largeLinesUnix); + } + + public void testTailLinesSmallWindows() throws IOException { + verifyTailLines(smallLinesWindows); + } + + public void testTailLinesLargeWindows() throws IOException { + verifyTailLines(largeLinesWindows); + } + + public void testTailLinesNakedUnix() throws IOException { + verifyTailLines(nakedLastLineUnix); + } + + public void testTailLinesNakedWindows() throws IOException { + verifyTailLines(nakedLastLineWindows); + } + + @SuppressWarnings("unchecked") + private void verifyTailLines(File file) throws IOException { + List lines = org.apache.commons.io.FileUtils.readLines(file); + verifyTailLines(file, lines, 1, 80); + verifyTailLines(file, lines, 5, 80); + verifyTailLines(file, lines, 10, 80); + verifyTailLines(file, lines, 20, 80); + verifyTailLines(file, lines, 100, 80); + verifyTailLines(file, lines, 1, 1); + verifyTailLines(file, lines, 5, 1); + verifyTailLines(file, lines, 10, 1); + verifyTailLines(file, lines, 20, 1); + verifyTailLines(file, lines, 100, 1); + } + + + private void verifyTailLines(File file, List lines, int count, int estimate) throws IOException { + List testLines; + testLines = getTestTailLines(file,count,estimate); + assertEquals("line counts not equal:"+file.getName()+" "+count+" "+estimate,lines.size(),testLines.size()); + 
assertEquals("lines not equal: "+file.getName()+" "+count+" "+estimate,lines,testLines); + } + + private List getTestTailLines(File file, int count, int estimate) throws IOException { + long pos = -1; + List testLines = new LinkedList(); + do { + List returnedLines = new LinkedList(); + LongRange range = FileUtils.pagedLines(file,pos,-count,returnedLines,estimate); + Collections.reverse(returnedLines); + testLines.addAll(returnedLines); + pos = range.getMinimumLong()-1; + } while (pos>=0); + Collections.reverse(testLines); + return testLines; + } + + public void testHeadLinesZeroLengthUnix() throws IOException { + verifyHeadLines(zeroLengthLinesUnix); + } + + public void testHeadLinesZeroLengthWindows() throws IOException { + verifyHeadLines(zeroLengthLinesWindows); + } + + public void testHeadLinesSmallUnix() throws IOException { + verifyHeadLines(smallLinesUnix); + } + + public void testHeadLinesLargeUnix() throws IOException { + verifyHeadLines(largeLinesUnix); + } + + public void testHeadLinesSmallWindows() throws IOException { + verifyHeadLines(smallLinesWindows); + } + + public void testHeadLinesLargeWindows() throws IOException { + verifyHeadLines(largeLinesWindows); + } + + public void testHeadLinesNakedUnix() throws IOException { + verifyHeadLines(nakedLastLineUnix); + } + + public void testHeadLinesNakedWindows() throws IOException { + verifyHeadLines(nakedLastLineWindows); + } + + + @SuppressWarnings("unchecked") + private void verifyHeadLines(File file) throws IOException { + List lines = org.apache.commons.io.FileUtils.readLines(file); + verifyHeadLines(file, lines, 1, 80); + verifyHeadLines(file, lines, 5, 80); + verifyHeadLines(file, lines, 10, 80); + verifyHeadLines(file, lines, 20, 80); + verifyHeadLines(file, lines, 100, 80); + verifyHeadLines(file, lines, 1, 1); + verifyHeadLines(file, lines, 5, 1); + verifyHeadLines(file, lines, 10, 1); + verifyHeadLines(file, lines, 20, 1); + verifyHeadLines(file, lines, 100, 1); + } + + + private void 
verifyHeadLines(File file, List lines, int count, int estimate) throws IOException { + List testLines; + testLines = getTestHeadLines(file,count,estimate); + assertEquals("line counts not equal:"+file.getName()+" "+count+" "+estimate,lines.size(),testLines.size()); + assertEquals("lines not equal: "+file.getName()+" "+count+" "+estimate,lines,testLines); + } + + private List getTestHeadLines(File file, int count, int estimate) throws IOException { + long pos = 0; + List testLines = new LinkedList(); + do { + LongRange range = FileUtils.pagedLines(file,pos,count,testLines,estimate); + pos = range.getMaximumLong(); + } while (pos m = am.asMap(); + logger.fine(m.toString()); + } + + public void testEmptyRecord() throws Exception { + byte [] b = ANVLRecord.EMPTY_ANVL_RECORD.getUTF8Bytes(); + assertEquals(b.length, 2); + assertEquals(b[0], '\r'); + assertEquals(b[1], '\n'); + } + + public void testFolding() throws Exception { + ANVLRecord am = new ANVLRecord(); + Exception e = null; + try { + am.addLabel("Label with \n in it"); + } catch (IllegalArgumentException iae) { + e = iae; + } + assertTrue(e != null && e instanceof IllegalArgumentException); + am.addLabelValue("label", "value with \n in it"); + } + + public void testParse() throws UnsupportedEncodingException, IOException { + String record = " a: b\r\n#c#\r\nc:d\r\n \t\t\r\t\n\te" + + "\r\nx:\r\n # z\r\n\r\n"; + ANVLRecord r = ANVLRecord.load(new ByteArrayInputStream( + record.getBytes("ISO-8859-1"))); + logger.fine(r.toString()); + assertEquals(r.get(0).toString(), "a: b"); + record = " a: b\r\n\r\nsdfsdsdfds"; + r = ANVLRecord.load(new ByteArrayInputStream( + record.getBytes("ISO-8859-1"))); + logger.fine(r.toString()); + record = "x:\r\n # z\r\ny:\r\n\r\n"; + r = ANVLRecord.load(new ByteArrayInputStream( + record.getBytes("ISO-8859-1"))); + logger.fine(r.toString()); + assertEquals(r.get(0).toString(), "x:"); + } + + public void testExampleParse() + throws UnsupportedEncodingException, IOException { + final 
String sample = "entry:\t\t\r\n# first ###draft\r\n" + + "who:\tGilbert, W.S. | Sullivan, Arthur\r\n" + + "what:\tThe Yeoman of\r\n" + + "\t\tthe Guard\r\n" + + "when/created:\t 1888\r\n\r\n"; + ANVLRecord r = ANVLRecord.load(new ByteArrayInputStream( + sample.getBytes("ISO-8859-1"))); + logger.fine(r.toString()); + } + + public void testPoundLabel() + throws UnsupportedEncodingException, IOException { + final String sample = "ent#ry:\t\t\r\n# first ###draft\r\n" + + "who:\tGilbert, W.S. | Sullivan, Arthur\r\n" + + "what:\tThe Yeoman of\r\n" + + "\t\tthe Guard\r\n" + + "when/created:\t 1888\r\n\r\n"; + ANVLRecord r = ANVLRecord.load(sample); + logger.fine(r.toString()); + } + + public void testNewlineLabel() + throws UnsupportedEncodingException, IOException { + final String sample = "ent\nry:\t\t\r\n# first ###draft\r\n" + + "who:\tGilbert, W.S. | Sullivan, Arthur\r\n" + + "what:\tThe Yeoman of\r\n" + + "\t\tthe Guard\r\n" + + "when/created:\t 1888\r\n\r\n"; + IllegalArgumentException iae = null; + try { + ANVLRecord.load(sample); + } catch(IllegalArgumentException e) { + iae = e; + } + assertTrue(iae != null); + } +} From b04f5d82604245461b6a802f1962d86e3d899e98 Mon Sep 17 00:00:00 2001 From: Lauren Ko Date: Thu, 9 Mar 2017 11:32:03 -0600 Subject: [PATCH 003/189] Updating CHANGES.md --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index fee29e16..767881ec 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,6 @@ 1.1.8 ----- +* [Move unit tests over from heritrix3 to webarchive-commons](https://github.com/iipc/webarchive-commons/issues/25) * [Strip empty port via URLParser](https://github.com/iipc/webarchive-commons/pull/69/) * [Use CharsetDetector to guess encoding of HTML documents](https://github.com/iipc/webarchive-commons/pull/68/) * [Fix last header was lost if LF LF](https://github.com/iipc/webarchive-commons/pull/65/) From b655796770eb967c931d656b1c80d4967f91e7fc Mon Sep 17 00:00:00 2001 From: Lauren Ko Date: Tue, 21 
Mar 2017 14:20:54 -0500 Subject: [PATCH 004/189] Updating change log. --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 767881ec..ccdc1ce7 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,6 @@ 1.1.8 ----- +* [Improve HTML link extraction](https://github.com/iipc/webarchive-commons/pull/72) * [Move unit tests over from heritrix3 to webarchive-commons](https://github.com/iipc/webarchive-commons/issues/25) * [Strip empty port via URLParser](https://github.com/iipc/webarchive-commons/pull/69/) * [Use CharsetDetector to guess encoding of HTML documents](https://github.com/iipc/webarchive-commons/pull/68/) From aee6ff55bfcaa5a9e15092f8c3b1e40ec9faaf87 Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Tue, 2 May 2017 12:25:28 +0200 Subject: [PATCH 005/189] [maven-release-plugin] prepare release webarchive-commons-1.1.8 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 24780063..63909b90 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.8-SNAPSHOT + 1.1.8 jar webarchive-commons From dfe1f62e416f6a881fe15a2544449fff44dd1e51 Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Tue, 2 May 2017 12:25:35 +0200 Subject: [PATCH 006/189] [maven-release-plugin] prepare for next development iteration --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 63909b90..23953c06 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.8 + 1.1.9-SNAPSHOT jar webarchive-commons From cf34a3e13c09cfa4a1412492cfcf3503df698931 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 28 Apr 2017 22:41:56 +0200 Subject: [PATCH 007/189] Do not add value of preceding HTTP header field if there is no value (or only white space) --- .../archive/format/http/HttpHeaderParser.java | 4 ++-- .../format/http/HttpResponseParserTest.java | 24 +++++++++++++++++++ 2 
files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/format/http/HttpHeaderParser.java b/src/main/java/org/archive/format/http/HttpHeaderParser.java index d63ec405..bee3c28b 100755 --- a/src/main/java/org/archive/format/http/HttpHeaderParser.java +++ b/src/main/java/org/archive/format/http/HttpHeaderParser.java @@ -301,8 +301,9 @@ public ParseState handleByte(byte b, HttpHeaderParser parser) throws HttpParseEx if(isLWSP(b)) { return parser.postColonState; } + // reset previous value also in case the header value is empty + parser.setValueStartIdx(); if(b == CR) { - // TODO: THINK more... parser.valuePreCRState = parser.postColonState; return parser.valuePostCRState; } @@ -310,7 +311,6 @@ public ParseState handleByte(byte b, HttpHeaderParser parser) throws HttpParseEx // TODO: this is lax, is LFLF an OK terminator? return parser.lineStartState; } - parser.setValueStartIdx(); parser.addValueByte(b); return parser.valueState; } diff --git a/src/test/java/org/archive/format/http/HttpResponseParserTest.java b/src/test/java/org/archive/format/http/HttpResponseParserTest.java index c0d13230..ea076a69 100644 --- a/src/test/java/org/archive/format/http/HttpResponseParserTest.java +++ b/src/test/java/org/archive/format/http/HttpResponseParserTest.java @@ -57,4 +57,28 @@ public void testParseWithLf() throws IOException { } + public void testParseEmptyHeaderField() throws IOException { + + HttpResponseParser parser = new HttpResponseParser(); + String message = "200 OK\r\nContent-Type: text/plain\r\nServer: \r\n\r\nHi there"; + try { + HttpResponse response = + parser.parse(new ByteArrayInputStream(message.getBytes(IAUtils.UTF8))); + assertNotNull(response); + HttpHeaders headers = response.getHeaders(); + assertNotNull(headers); + assertEquals(2, headers.size()); + HttpHeader header = headers.get(1); + assertEquals("Server",header.getName()); + System.err.println(header.getValue()); + assertFalse("text/plain".equals(header.getValue())); + 
TestUtils.assertStreamEquals(response, "Hi there".getBytes(IAUtils.UTF8)); + + } catch (HttpParseException e) { + e.printStackTrace(); + fail(); + } + + } + } From bd08143577ea35cb48047a08b2bb67e806992cc2 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 29 Sep 2016 11:44:18 +0200 Subject: [PATCH 008/189] Extract also `property` attributes of HTML meta elements, this fixes #67 --- .../java/org/archive/resource/html/ExtractingParseObserver.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index 826851e0..52989455 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -406,7 +406,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs private static class MetaTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { - ArrayList l = getAttrList(node,"name","rel","content","http-equiv"); + ArrayList l = getAttrList(node,"name","rel","content","http-equiv","property"); if(l != null) { data.addMeta(l); } From 4077670acca3f0d2958d926692cdb3a6b29428ca Mon Sep 17 00:00:00 2001 From: Lauren Ko Date: Tue, 2 May 2017 15:15:06 -0500 Subject: [PATCH 009/189] Fix HTTP-Response-Metadata for wget WARCs. 
Changes came from https://github.com/commoncrawl/ia-web-commons/commit/58e85a60d75707da55ed499f836e57d49347484a --- .../org/archive/extract/ExtractingResourceFactoryMapper.java | 5 ++++- src/main/java/org/archive/format/warc/WARCConstants.java | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java index ad10be40..0afe16fb 100644 --- a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java +++ b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java @@ -153,7 +153,10 @@ private boolean isWARCInfoResource(MetaData envelope) { private boolean isHTTPResponseWARCResource(MetaData envelope) { return childFieldEquals(envelope,WARC_HEADER_METADATA, WARCConstants.CONTENT_TYPE, - WARCConstants.HTTP_RESPONSE_MIMETYPE); + WARCConstants.HTTP_RESPONSE_MIMETYPE) + || childFieldEquals(envelope,WARC_HEADER_METADATA, + WARCConstants.CONTENT_TYPE, + WARCConstants.HTTP_RESPONSE_MIMETYPE_NS); } private boolean isWARCJSONResource(MetaData envelope) { return childFieldEquals(envelope,WARC_HEADER_METADATA, diff --git a/src/main/java/org/archive/format/warc/WARCConstants.java b/src/main/java/org/archive/format/warc/WARCConstants.java index 93a81f96..504dc380 100644 --- a/src/main/java/org/archive/format/warc/WARCConstants.java +++ b/src/main/java/org/archive/format/warc/WARCConstants.java @@ -209,7 +209,9 @@ enum WARCRecordType { "application/http; msgtype=request"; public static final String HTTP_RESPONSE_MIMETYPE = "application/http; msgtype=response"; - + public static final String HTTP_RESPONSE_MIMETYPE_NS = + "application/http;msgtype=response"; // wget does this + public static final String FTP_CONTROL_CONVERSATION_MIMETYPE = "text/x-ftp-control-conversation"; From 3bba7e489b7d946eea83344e2150faebe0b35ed2 Mon Sep 17 00:00:00 2001 From: Lauren Ko Date: Tue, 2 May 2017 15:41:23 -0500 Subject: 
[PATCH 010/189] Update with fixes for 1.1.9 --- CHANGES.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index ccdc1ce7..1ba5c1de 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,9 @@ +1.1.9 +----- +* [Extract `property` attributes of HTML meta elements](https://github.com/iipc/webarchive-commons/pull/75) +* [Do not add value of preceding HTTP header field if there is no value](https://github.com/iipc/webarchive-commons/pull/74) +* [Fix WAT records corresponding to response records of Wget generated WARCs](https://github.com/iipc/webarchive-commons/pull/74) + 1.1.8 ----- * [Improve HTML link extraction](https://github.com/iipc/webarchive-commons/pull/72) From 4101f7e39cbdcc508a936faf8b519e68258b9639 Mon Sep 17 00:00:00 2001 From: Naomi Dushay Date: Tue, 8 Aug 2017 16:08:43 -0700 Subject: [PATCH 011/189] use commons-collections v3.2.2 to avoid v3.2.1 vulnerability --- pom.xml | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 23953c06..8373cdad 100644 --- a/pom.xml +++ b/pom.xml @@ -72,7 +72,7 @@ guava 17.0 - + org.json json @@ -89,12 +89,12 @@ juniversalchardet 1.0.3 - + commons-httpclient commons-httpclient 3.1 - + org.apache.hadoop @@ -128,12 +128,12 @@ tomcat jasper-compiler - + hsqldb hsqldb - - + + @@ -160,7 +160,7 @@ libidn 1.15 - + it.unimi.dsi dsiutils 2.0.12 @@ -170,13 +170,26 @@ ch.qos.logback logback-classic + + + commons-collections + commons-collections + + + + + commons-collections + commons-collections + 3.2.2 + + org.apache.httpcomponents httpcore 4.3 - + joda-time joda-time From 988bec707c27a01333becfc3bd502af4441ea1e1 Mon Sep 17 00:00:00 2001 From: Lauren Ko Date: Wed, 9 Aug 2017 10:57:28 -0500 Subject: [PATCH 012/189] Update CHANGES.md for PR 77 --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 1ba5c1de..dcb598d9 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,6 @@ 1.1.9 ----- +* [Use 
commons-collections v3.2.2 to avoid v3.2.1 vulnerability](https://github.com/iipc/webarchive-commons/pull/77) * [Extract `property` attributes of HTML meta elements](https://github.com/iipc/webarchive-commons/pull/75) * [Do not add value of preceding HTTP header field if there is no value](https://github.com/iipc/webarchive-commons/pull/74) * [Fix WAT records corresponding to response records of Wget generated WARCs](https://github.com/iipc/webarchive-commons/pull/74) From 2e8cdea3d245c11e1ea3a2a6153c0038479aef12 Mon Sep 17 00:00:00 2001 From: nruest Date: Tue, 7 May 2019 13:23:28 -0400 Subject: [PATCH 013/189] [maven-release-plugin] prepare release webarchive-commons-1.1.9 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 8373cdad..833f42c3 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.9-SNAPSHOT + 1.1.9 jar webarchive-commons From da029db2ba89205b93a5291ed18b9c69155271bb Mon Sep 17 00:00:00 2001 From: nruest Date: Tue, 7 May 2019 13:23:34 -0400 Subject: [PATCH 014/189] [maven-release-plugin] prepare for next development iteration --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 833f42c3..1cbeb99a 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.9 + 1.1.10-SNAPSHOT jar webarchive-commons From 723b18a35f8be786cb073282b5ea88b5d8c643ce Mon Sep 17 00:00:00 2001 From: nruest Date: Tue, 7 May 2019 13:56:18 -0400 Subject: [PATCH 015/189] Update TravisCI config; resolves #82. - Test Oracle Java 8 - Test OpenJDK Java 8 - Use trusty - Require sudo for OpenJDK7 - Remove Oracle Java 7 (it's gone!) 
- Remove mvn site from the build process since there is no javadoc site (at least that I can tell) --- .travis.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 0dfd3f7f..54daf83b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,13 @@ +dist: trusty language: java +# sudo required for OpenJDK7 support per: +# https://github.com/travis-ci/travis-ci/issues/7884#issuecomment-309689557 +sudo: required jdk: - - oraclejdk7 + - openjdk7 + - oraclejdk8 + - openjdk8 before_install: - "git clone https://github.com/iipc/travis.git target/travis" @@ -11,8 +17,8 @@ before_script: - "export MAVEN_OPTS=-Xmx512m" - "ulimit -u 2048" -script: - - "target/travis/deploy-if.sh" +script: + - mvn install -B -V # whitelist in the master branch only branches: @@ -23,4 +29,3 @@ env: global: - secure: "qDKjVdoe4Qcz4WfXiQydU7tyl51T62FUJrjqu4FUPBcgeQhFQiggwhpaE6xCOzOpxbsuBi2R1c8gMQf5esE5iDL5jZMu+kz++dYbuzMTd13ttvZWMW5wRPH0H8iHk609FP/RDtVKKBr7WO0JvvIAZEhWNHZrLXBrrKgdTey171g=" - secure: "FXGBKJNP9X7ePJfS4eYTZtoFo4RT1sxor34XxncSJr7uV6ggtZb4B4WNd16IlLcDk6E32sx8YoWdltaOGwQ5Vg/kux5Ko/wKZCoccS018Ln1bRT86dD1KoPY34rGoNJVQxe7J/1MPqpBKwmi2XCKfzpsEh3W7bbIqg8w9MEOOZA=" - From 79aed910b44510294367a4acf4f3e6376b1c62c0 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 23 Aug 2017 17:04:52 +0200 Subject: [PATCH 016/189] ExtractingParseObserver: get links from onClick attributes - extract links from JavaScript code snippets in onClick attributes of INPUT and DIV elements --- .../html/ExtractingParseObserver.java | 40 +++++++++++++++++- .../html/ExtractingParseObserverTest.java | 10 +++++ .../resource/html/link-extraction-test.warc | 42 +++++++++++++++++++ 3 files changed, 91 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index 52989455..e4fa83c7 100644 --- 
a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -39,6 +39,15 @@ public class ExtractingParseObserver implements ParseObserver { protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString); + protected static String jsOnClickUrl1PatString = + "(?i)^(?:javascript:)?(?:(?:window|top|document|self|parent)\\.)?location(?:\\.href)?\\s*=\\s*('|')([^'\"]{3,256})\\1$"; + protected static String jsOnClickUrl2PatString = + "(?i)^(?:javascript:)?(?:window|parent)\\.open\\((['\"]|')([^\"']{3,256}?)\\1[,)]"; + protected static Pattern[] jsOnClickUrlPatterns = { + Pattern.compile(jsOnClickUrl1PatString), + Pattern.compile(jsOnClickUrl2PatString) + }; + private final static int MAX_TEXT_LEN = 100; private static final String PATH = "path"; @@ -51,6 +60,7 @@ public class ExtractingParseObserver implements ParseObserver { extractors.put("APPLET", new AppletTagExtractor()); extractors.put("AREA", new AreaTagExtractor()); extractors.put("BASE", new BaseTagExtractor()); + extractors.put("DIV", new DivTagExtractor()); extractors.put("EMBED", new EmbedTagExtractor()); extractors.put("FORM", new FormTagExtractor()); extractors.put("FRAME", new FrameTagExtractor()); @@ -268,7 +278,20 @@ private static void addHrefWithAttrs(HTMLMetaData data, TagNode node, if(l != null) { data.addHref(l); } - } + } + + private static void addHrefsOnclick(HTMLMetaData data, TagNode node) { + String onclick = node.getAttribute("onclick"); + if (onclick != null) { + String path = makePath(node.getTagName(), "onclick"); + for (Pattern pattern : jsOnClickUrlPatterns) { + String url = patternJSExtract(pattern, onclick); + if (url != null) { + data.addHref(PATH, path, "url", url); + } + } + } + } private interface TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs); @@ -330,6 +353,12 @@ public void extract(HTMLMetaData data, TagNode node, 
ExtractingParseObserver obs } } + private static class DivTagExtractor implements TagExtractor { + public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { + addHrefsOnclick(data,node); + } + } + private static class EmbedTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { addBasicHrefs(data,node,"src"); @@ -386,6 +415,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs private static class InputTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { addBasicHrefs(data,node,"src","formaction"); + addHrefsOnclick(data,node); } } @@ -450,4 +480,12 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten } } } + + private static String patternJSExtract(Pattern pattern, String content) { + Matcher m = pattern.matcher(content); + if (m.find()) { + return m.group(2); + } + return null; + } } diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index 8f690a06..4828ad64 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -263,6 +263,16 @@ public void testLinkExtraction() throws ResourceParseException, IOException { {"http://www.your-domain.com/your-page.html", "DIV@/data-href"} }; checkLinks(extractor.getNext(), fbSocialLinks); + String[][] onClickLinks = { + {"webpage.html", "DIV@/onclick"}, + {"index.html", "INPUT@/onclick"}, + {"http://www.x.com/", "INPUT@/onclick"}, + {"button-child.php", "INPUT@/onclick"}, + {"http://example.com/", "INPUT@/onclick"}, + {"http://example.com/location/href/1.html", "INPUT@/onclick"}, + {"http://example.com/location/href/2.html", "INPUT@/onclick"} + }; + checkLinks(extractor.getNext(), 
onClickLinks); } } diff --git a/src/test/resources/org/archive/resource/html/link-extraction-test.warc b/src/test/resources/org/archive/resource/html/link-extraction-test.warc index ab0e54c8..1a30598e 100644 --- a/src/test/resources/org/archive/resource/html/link-extraction-test.warc +++ b/src/test/resources/org/archive/resource/html/link-extraction-test.warc @@ -318,3 +318,45 @@ Content-Type: text/html +WARC/1.0 +WARC-Type: response +WARC-Date: 2017-08-23T13:54:59Z +Content-Type: application/http;msgtype=response +Content-Length: 1279 + +HTTP/1.1 200 OK +Date: Wed, 23 Aug 2017 13:54:59 GMT +Server: Apache/2.4.18 (Ubuntu) +Last-Modified: Wed, 23 Aug 2017 13:54:03 GMT +ETag: "3ca-5576c0b718ab3" +Accept-Ranges: bytes +Content-Length: 971 +Vary: Accept-Encoding +Keep-Alive: timeout=5, max=100 +Connection: Keep-Alive +Content-Type: text/html + + + +Test Extraction of URLs from INPUT onClick Attributes + + + + +

Click to load webpage

+ + + + + + + + From 26b1e7af27abec102ab36faf6a786dfedf9436fd Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 23 Aug 2017 14:48:05 +0200 Subject: [PATCH 017/189] ExtractingParseObserver: extract rel, hreflang and type attributes - add "rel" attribute to A and AREA links - add attributes "hreflang" and "type" (MIME type) to A@/href links --- .../html/ExtractingParseObserver.java | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index 52989455..a487fd34 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -284,7 +284,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs l.add(makePath("A","href")); l.add("url"); l.add(url); - for(String a : new String[] {"target","alt","title"}) { + for(String a : new String[] {"target","alt","title","rel","hreflang","type"}) { String v = node.getAttribute(a); if(v != null) { l.add(a); @@ -311,7 +311,22 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs private static class AreaTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { - addBasicHrefs(data,node,"href"); + String url = node.getAttribute("href"); + if(url != null) { + ArrayList l = new ArrayList(); + l.add(PATH); + l.add(makePath("AREA","href")); + l.add("url"); + l.add(url); + for(String a : new String[] {"rel"}) { + String v = node.getAttribute(a); + if(v != null) { + l.add(a); + l.add(v); + } + } + data.addHref(l); + } } } From a2cc42cac2777d06ab40e09811cdc883773775b9 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 11 Jun 2020 14:24:03 +0200 Subject: [PATCH 018/189] WAT extractor: do not fail on missing WARC-Filename in warcinfo record, 
fixes #88 - do not throw IOException if there is no WARC-Filename in warcinfo record - write metadata record (corresponding to warcinfo) without WARC-Target-URI --- src/main/java/org/archive/extract/WATExtractorOutput.java | 2 +- src/main/java/org/archive/format/warc/WARCRecordWriter.java | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index 3bcfa924..4b5f72ed 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -151,7 +151,7 @@ private void writeWARC(OutputStream recOut, MetaData md) throws IOException { String warcType = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Type"); String targetURI; if(warcType.equals("warcinfo")) { - targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Filename"); + targetURI = JSONUtils.extractSingle(md, "Envelope.WARC-Header-Metadata.WARC-Filename"); } else { targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI"); } diff --git a/src/main/java/org/archive/format/warc/WARCRecordWriter.java b/src/main/java/org/archive/format/warc/WARCRecordWriter.java index 0aab83b7..3278b289 100644 --- a/src/main/java/org/archive/format/warc/WARCRecordWriter.java +++ b/src/main/java/org/archive/format/warc/WARCRecordWriter.java @@ -88,7 +88,10 @@ public void writeJSONMetadataRecord( OutputStream out, { HttpHeaders headers = new HttpHeaders(); headers.add(HEADER_KEY_TYPE, WARCRecordType.metadata.name()); - headers.add(HEADER_KEY_URI, targetURI); + if (targetURI != null) { + // WARC-Target-URI is optional in metadata records + headers.add(HEADER_KEY_URI, targetURI); + } headers.add(HEADER_KEY_DATE, DateUtils.getLog14Date(originalDate)); headers.add(HEADER_KEY_ID, makeRecordId()); headers.add(HEADER_KEY_REFERS_TO, origRecordId); From 04e10397b9137a36812c17276826bc60d1a37ede Mon Sep 17 00:00:00 
2001 From: Sebastian Nagel Date: Mon, 15 Jun 2020 13:29:25 +0200 Subject: [PATCH 019/189] Update change log to include #85, #86 and #89 --- CHANGES.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index dcb598d9..bf985ada 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,10 @@ +1.1.10 +------ +* [WAT extractor: do not fail on missing WARC-Filename in warcinfo record](https://github.com/iipc/webarchive-commons/pull/89) +* [ExtractingParseObserver: extract rel, hreflang and type attributes](https://github.com/iipc/webarchive-commons/pull/86) +* [ExtractingParseObserver: extract links from onClick attributes](https://github.com/iipc/webarchive-commons/pull/85) +* [Update TravisCI config](https://github.com/iipc/webarchive-commons/pull/83) + 1.1.9 ----- * [Use commons-collections v3.2.2 to avoid v3.2.1 vulnerability](https://github.com/iipc/webarchive-commons/pull/77) From 9041ff4e96f6554658742affe490223dc0241d06 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 13 Oct 2020 01:28:48 +0000 Subject: [PATCH 020/189] Bump junit from 3.8.1 to 4.13.1 Bumps [junit](https://github.com/junit-team/junit4) from 3.8.1 to 4.13.1. 
- [Release notes](https://github.com/junit-team/junit4/releases) - [Changelog](https://github.com/junit-team/junit4/blob/main/doc/ReleaseNotes4.13.1.md) - [Commits](https://github.com/junit-team/junit4/commits/r4.13.1) Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 1cbeb99a..5ca7e1a3 100644 --- a/pom.xml +++ b/pom.xml @@ -64,7 +64,7 @@ junit junit - 3.8.1 + 4.13.1 From c2530d77b73838c31f4e83f2be941ec61032ebb2 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 16 Mar 2021 11:58:11 +0100 Subject: [PATCH 021/189] Fix InterruptibleCharSequenceTest (testInterruptibility) to run on JDK 11 - if thread running the regexp matching is already finished after the initial/current sleeping time, rerun the test again with a shorter sleeping time until the expected RuntimeException is hit --- .../util/InterruptibleCharSequenceTest.java | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java b/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java index a3a5f180..8b5c5d1b 100644 --- a/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java +++ b/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java @@ -107,14 +107,24 @@ public void testNoninterruptible() throws InterruptedException { } public void testInterruptibility() throws InterruptedException { - BlockingQueue