From f7cd67d9f4ac252385517770fce63dea767f9203 Mon Sep 17 00:00:00 2001
From: RogerMathisen
Date: Tue, 23 Sep 2014 13:06:44 +0200
Subject: [PATCH 001/240] - Replaced direct references to "/tmp" with generic
temporary directory reference using File.createTempFile().
Fixes bug reported in iipc/webarchive-commons Issue #2.
---
.../archive/format/gzip/GZIPMemberWriterTest.java | 4 ++--
.../util/binsearch/SortedTextFileTest.java | 2 +-
.../iterator/SortedCompositeIteratorTest.java | 15 ++++++---------
3 files changed, 9 insertions(+), 12 deletions(-)
diff --git a/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java b/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java
index 5cd75ccf..483d2baf 100644
--- a/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java
+++ b/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java
@@ -12,8 +12,8 @@
public class GZIPMemberWriterTest extends TestCase {
public void testWrite() throws IOException {
- String outPath = "/tmp/tmp.gz";
- GZIPMemberWriter gzw = new GZIPMemberWriter(new FileOutputStream(new File(outPath)));
+ File outFile = File.createTempFile("tmp", ".gz");
+ GZIPMemberWriter gzw = new GZIPMemberWriter(new FileOutputStream(outFile));
gzw.write(new ByteArrayInputStream("Here is record 1".getBytes(IAUtils.UTF8)));
gzw.write(new ByteArrayInputStream("Here is record 2".getBytes(IAUtils.UTF8)));
}
diff --git a/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java b/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java
index 2c9d19e8..8f812b75 100644
--- a/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java
+++ b/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java
@@ -25,7 +25,7 @@ private void createFile(File target, int max) throws FileNotFoundException {
public void testGetRecordIteratorStringBoolean() throws IOException {
- File test = new File("/tmp/test.tmp");
+ File test = File.createTempFile("test", null);
int max = 1000000;
createFile(test,max);
RandomAccessFileSeekableLineReaderFactory factory =
diff --git a/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java b/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java
index f1c2a0ec..0f4dc68a 100644
--- a/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java
+++ b/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java
@@ -4,6 +4,7 @@
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
+import java.io.IOException;
import java.io.PrintWriter;
import java.util.Comparator;
@@ -11,21 +12,16 @@
public class SortedCompositeIteratorTest extends TestCase {
- public void testHasNext() throws FileNotFoundException {
+ public void testHasNext() throws FileNotFoundException, IOException {
long t = 210000;
long c = 134;
float f = (float)c / (float)t;
System.err.format("F(%f)\n",f);
- File a = new File("/tmp/a");
- File b = new File("/tmp/b");
- if(a.isFile()) {
- a.delete();
- }
- if(b.isFile()) {
- b.delete();
- }
+ File a = File.createTempFile("filea", null);
+ File b = File.createTempFile("fileb", null);
+
PrintWriter apw = new PrintWriter(a);
PrintWriter bpw = new PrintWriter(b);
apw.println("1");
@@ -38,6 +34,7 @@ public void testHasNext() throws FileNotFoundException {
BufferedReader bbr = new BufferedReader(new FileReader(b));
SortedCompositeIterator sci = new SortedCompositeIterator(new Comparator() {
+ @Override
public int compare(String o1, String o2) {
return o1.compareTo(o2);
}
From 077abb783d77b8a556112a6617911d0ee7006595 Mon Sep 17 00:00:00 2001
From: thomase
Date: Tue, 23 Sep 2014 14:48:48 +0200
Subject: [PATCH 002/240] * changed newline to System.lineSeparator
---
.../org/archive/net/PublicSuffixesTest.java | 386 +++++++++---------
1 file changed, 193 insertions(+), 193 deletions(-)
diff --git a/src/test/java/org/archive/net/PublicSuffixesTest.java b/src/test/java/org/archive/net/PublicSuffixesTest.java
index b88acb6d..a82bab22 100644
--- a/src/test/java/org/archive/net/PublicSuffixesTest.java
+++ b/src/test/java/org/archive/net/PublicSuffixesTest.java
@@ -1,193 +1,193 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.archive.net;
-
-import java.io.PrintWriter;
-import java.io.StringWriter;
-import java.util.ArrayList;
-import java.util.regex.Matcher;
-
-import junit.framework.TestCase;
-
-import org.archive.net.PublicSuffixes.Node;
-
-/**
- * Test cases for PublicSuffixes utility. Confirm expected matches/nonmatches
- * from constructed regex.
- *
- * @author gojomo
- */
-public class PublicSuffixesTest extends TestCase {
- // test of low level implementation
-
- public void testCompare() {
- Node n = new Node("hoge");
- assertTrue(n.compareTo('a') > 0);
- assertEquals(-1, n.compareTo('*'));
- assertEquals(-1, n.compareTo('!'));
- assertEquals(-1, n.compareTo(new Node("*,")));
- assertEquals(-1, n.compareTo(new Node("!muga,")));
- assertEquals(-1, n.compareTo(new Node("")));
-
- n = new Node("*,");
- assertEquals(1, n.compareTo('a'));
- assertEquals(0, n.compareTo('*'));
- assertEquals(1, n.compareTo('!'));
- assertEquals(0, n.compareTo(new Node("*,")));
- assertEquals(1, n.compareTo(new Node("!muga,")));
- assertEquals(-1, n.compareTo(new Node("")));
-
- n = new Node("!hoge");
- assertEquals(1, n.compareTo('a'));
- assertEquals(-1, n.compareTo('*'));
- assertEquals(0, n.compareTo('!'));
- assertEquals(-1, n.compareTo(new Node("*,")));
- assertEquals(0, n.compareTo(new Node("!muga,")));
- assertEquals(-1, n.compareTo(new Node("")));
-
- n = new Node("");
- assertEquals(1, n.compareTo('a'));
- assertEquals(1, n.compareTo('*'));
- assertEquals(1, n.compareTo('!'));
- assertEquals(0, n.compareTo(new Node("")));
- }
-
- protected String dump(Node alt) {
- StringWriter w = new StringWriter();
- PublicSuffixes.dump(alt, 0, new PrintWriter(w));
- return w.toString();
- }
- public void testTrie1() {
- Node alt = new Node(null, new ArrayList());
- alt.addBranch("ac,");
- // specifically, should not have empty string as match.
- assertEquals("(null)\n" +
- " \"ac,\"\n", dump(alt));
- alt.addBranch("ac,com,");
- assertEquals("(null)\n" +
- " \"ac,\"\n" +
- " \"com,\"\n" +
- " \"\"\n", dump(alt));
- alt.addBranch("ac,edu,");
- assertEquals("(null)\n" +
- " \"ac,\"\n" +
- " \"com,\"\n" +
- " \"edu,\"\n" +
- " \"\"\n", dump(alt));
- }
- public void testTrie2() {
- Node alt = new Node(null, new ArrayList());
- alt.addBranch("ac,");
- alt.addBranch("*,");
- assertEquals("(null)\n" +
- " \"ac,\"\n" +
- " \"*,\"\n", dump(alt));
- }
-
- public void testTrie3() {
- Node alt = new Node(null, new ArrayList());
- alt.addBranch("ac,");
- alt.addBranch("ac,!hoge,");
- alt.addBranch("ac,*,");
- // exception goes first.
- assertEquals("(null)\n" +
- " \"ac,\"\n" +
- " \"!hoge,\"\n" +
- " \"*,\"\n" +
- " \"\"\n", dump(alt));
- }
-
- // test of higher-level functionality
-
- Matcher m = PublicSuffixes.getTopmostAssignedSurtPrefixPattern()
- .matcher("");
-
- public void testBasics() {
- matchPrefix("com,example,www,", "com,example,");
- matchPrefix("com,example,", "com,example,");
- matchPrefix("org,archive,www,", "org,archive,");
- matchPrefix("org,archive,", "org,archive,");
- matchPrefix("fr,yahoo,www,", "fr,yahoo,");
- matchPrefix("fr,yahoo,", "fr,yahoo,");
- matchPrefix("au,com,foobar,www,", "au,com,foobar,");
- matchPrefix("au,com,foobar,", "au,com,foobar,");
- matchPrefix("uk,co,virgin,www,", "uk,co,virgin,");
- matchPrefix("uk,co,virgin,", "uk,co,virgin,");
- matchPrefix("au,com,example,www,", "au,com,example,");
- matchPrefix("au,com,example,", "au,com,example,");
- matchPrefix("jp,yokohama,public,assigned,www,",
- "jp,yokohama,public,assigned,");
- matchPrefix("jp,yokohama,public,assigned,", "jp,yokohama,public,assigned,");
- }
-
- public void testDomainWithDash() {
- matchPrefix("de,bad-site,www", "de,bad-site,");
- }
-
- public void testDomainWithNumbers() {
- matchPrefix("de,archive4u,www", "de,archive4u,");
- }
-
- public void testIPV4() {
- assertEquals("unexpected reduction",
- "1.2.3.4",
- PublicSuffixes.reduceSurtToAssignmentLevel("1.2.3.4"));
- }
-
- public void testIPV6() {
- assertEquals("unexpected reduction",
- "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]",
- PublicSuffixes.reduceSurtToAssignmentLevel(
- "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]"));
- }
-
- public void testExceptions() {
- matchPrefix("uk,bl,www,", "uk,bl,");
- matchPrefix("uk,bl,", "uk,bl,");
- matchPrefix("jp,tokyo,city,subdomain,", "jp,tokyo,city,");
- matchPrefix("jp,tokyo,city,", "jp,tokyo,city,");
- }
-
- public void testFakeTLD() {
- // we assume any new/unknonwn TLD should be assumed as 2-level;
- // this is preferable for our grouping purpose but might not be
- // for a cookie-assigning browser (original purpose of publicsuffixlist)
- matchPrefix("zzz,example,www,", "zzz,example,");
- }
-
- public void testUnsegmentedHostname() {
- m.reset("example");
- assertFalse("unexpected match found in 'example'", m.find());
- }
-
- public void testTopmostAssignedCaching() {
- assertSame("topmostAssignedSurtPrefixPattern not cached",PublicSuffixes.getTopmostAssignedSurtPrefixPattern(),PublicSuffixes.getTopmostAssignedSurtPrefixPattern());
- assertSame("topmostAssignedSurtPrefixRegex not cached",PublicSuffixes.getTopmostAssignedSurtPrefixRegex(),PublicSuffixes.getTopmostAssignedSurtPrefixRegex());
- }
-
- // TODO: test UTF domains?
-
- protected void matchPrefix(String surtDomain, String expectedAssignedPrefix) {
- m.reset(surtDomain);
- assertTrue("expected match not found in '" + surtDomain, m.find());
- assertEquals("expected match not found", expectedAssignedPrefix, m
- .group());
- }
-}
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.net;
+
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+
+import junit.framework.TestCase;
+
+import org.archive.net.PublicSuffixes.Node;
+
+/**
+ * Test cases for PublicSuffixes utility. Confirm expected matches/nonmatches
+ * from constructed regex.
+ *
+ * @author gojomo
+ */
+public class PublicSuffixesTest extends TestCase {
+ // test of low level implementation
+ private final String NL = System.lineSeparator();
+
+ public void testCompare() {
+ Node n = new Node("hoge");
+ assertTrue(n.compareTo('a') > 0);
+ assertEquals(-1, n.compareTo('*'));
+ assertEquals(-1, n.compareTo('!'));
+ assertEquals(-1, n.compareTo(new Node("*,")));
+ assertEquals(-1, n.compareTo(new Node("!muga,")));
+ assertEquals(-1, n.compareTo(new Node("")));
+
+ n = new Node("*,");
+ assertEquals(1, n.compareTo('a'));
+ assertEquals(0, n.compareTo('*'));
+ assertEquals(1, n.compareTo('!'));
+ assertEquals(0, n.compareTo(new Node("*,")));
+ assertEquals(1, n.compareTo(new Node("!muga,")));
+ assertEquals(-1, n.compareTo(new Node("")));
+
+ n = new Node("!hoge");
+ assertEquals(1, n.compareTo('a'));
+ assertEquals(-1, n.compareTo('*'));
+ assertEquals(0, n.compareTo('!'));
+ assertEquals(-1, n.compareTo(new Node("*,")));
+ assertEquals(0, n.compareTo(new Node("!muga,")));
+ assertEquals(-1, n.compareTo(new Node("")));
+
+ n = new Node("");
+ assertEquals(1, n.compareTo('a'));
+ assertEquals(1, n.compareTo('*'));
+ assertEquals(1, n.compareTo('!'));
+ assertEquals(0, n.compareTo(new Node("")));
+ }
+
+ protected String dump(Node alt) {
+ StringWriter w = new StringWriter();
+ PublicSuffixes.dump(alt, 0, new PrintWriter(w));
+ return w.toString();
+ }
+ public void testTrie1() {
+ Node alt = new Node(null, new ArrayList());
+ alt.addBranch("ac,");
+ // specifically, should not have empty string as match.
+ assertEquals("(null)" + NL + " \"ac,\"" + NL, dump(alt));
+ alt.addBranch("ac,com,");
+ assertEquals("(null)" + NL +
+ " \"ac,\"" + NL +
+ " \"com,\"" + NL +
+ " \"\"" + NL, dump(alt));
+ alt.addBranch("ac,edu,");
+ assertEquals("(null)" + NL +
+ " \"ac,\"" + NL +
+ " \"com,\"" + NL +
+ " \"edu,\"" + NL +
+ " \"\"" + NL, dump(alt));
+ }
+ public void testTrie2() {
+ Node alt = new Node(null, new ArrayList());
+ alt.addBranch("ac,");
+ alt.addBranch("*,");
+ assertEquals("(null)" + NL +
+ " \"ac,\"" + NL +
+ " \"*,\"" + NL, dump(alt));
+ }
+
+ public void testTrie3() {
+ Node alt = new Node(null, new ArrayList());
+ alt.addBranch("ac,");
+ alt.addBranch("ac,!hoge,");
+ alt.addBranch("ac,*,");
+ // exception goes first.
+ assertEquals("(null)" + NL +
+ " \"ac,\"" + NL +
+ " \"!hoge,\"" + NL +
+ " \"*,\"" + NL +
+ " \"\"" + NL, dump(alt));
+ }
+
+ // test of higher-level functionality
+
+ Matcher m = PublicSuffixes.getTopmostAssignedSurtPrefixPattern()
+ .matcher("");
+
+ public void testBasics() {
+ matchPrefix("com,example,www,", "com,example,");
+ matchPrefix("com,example,", "com,example,");
+ matchPrefix("org,archive,www,", "org,archive,");
+ matchPrefix("org,archive,", "org,archive,");
+ matchPrefix("fr,yahoo,www,", "fr,yahoo,");
+ matchPrefix("fr,yahoo,", "fr,yahoo,");
+ matchPrefix("au,com,foobar,www,", "au,com,foobar,");
+ matchPrefix("au,com,foobar,", "au,com,foobar,");
+ matchPrefix("uk,co,virgin,www,", "uk,co,virgin,");
+ matchPrefix("uk,co,virgin,", "uk,co,virgin,");
+ matchPrefix("au,com,example,www,", "au,com,example,");
+ matchPrefix("au,com,example,", "au,com,example,");
+ matchPrefix("jp,yokohama,public,assigned,www,",
+ "jp,yokohama,public,assigned,");
+ matchPrefix("jp,yokohama,public,assigned,", "jp,yokohama,public,assigned,");
+ }
+
+ public void testDomainWithDash() {
+ matchPrefix("de,bad-site,www", "de,bad-site,");
+ }
+
+ public void testDomainWithNumbers() {
+ matchPrefix("de,archive4u,www", "de,archive4u,");
+ }
+
+ public void testIPV4() {
+ assertEquals("unexpected reduction",
+ "1.2.3.4",
+ PublicSuffixes.reduceSurtToAssignmentLevel("1.2.3.4"));
+ }
+
+ public void testIPV6() {
+ assertEquals("unexpected reduction",
+ "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]",
+ PublicSuffixes.reduceSurtToAssignmentLevel(
+ "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]"));
+ }
+
+ public void testExceptions() {
+ matchPrefix("uk,bl,www,", "uk,bl,");
+ matchPrefix("uk,bl,", "uk,bl,");
+ matchPrefix("jp,tokyo,city,subdomain,", "jp,tokyo,city,");
+ matchPrefix("jp,tokyo,city,", "jp,tokyo,city,");
+ }
+
+ public void testFakeTLD() {
+ // we assume any new/unknonwn TLD should be assumed as 2-level;
+ // this is preferable for our grouping purpose but might not be
+ // for a cookie-assigning browser (original purpose of publicsuffixlist)
+ matchPrefix("zzz,example,www,", "zzz,example,");
+ }
+
+ public void testUnsegmentedHostname() {
+ m.reset("example");
+ assertFalse("unexpected match found in 'example'", m.find());
+ }
+
+ public void testTopmostAssignedCaching() {
+ assertSame("topmostAssignedSurtPrefixPattern not cached",PublicSuffixes.getTopmostAssignedSurtPrefixPattern(),PublicSuffixes.getTopmostAssignedSurtPrefixPattern());
+ assertSame("topmostAssignedSurtPrefixRegex not cached",PublicSuffixes.getTopmostAssignedSurtPrefixRegex(),PublicSuffixes.getTopmostAssignedSurtPrefixRegex());
+ }
+
+ // TODO: test UTF domains?
+
+ protected void matchPrefix(String surtDomain, String expectedAssignedPrefix) {
+ m.reset(surtDomain);
+ assertTrue("expected match not found in '" + surtDomain, m.find());
+ assertEquals("expected match not found", expectedAssignedPrefix, m
+ .group());
+ }
+}
From 5054060e27da6fef0816efc8b90af06e4e998d9a Mon Sep 17 00:00:00 2001
From: RogerMathisen
Date: Wed, 24 Sep 2014 10:04:32 +0200
Subject: [PATCH 003/240] Updated release notes.
---
CHANGES.md | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/CHANGES.md b/CHANGES.md
index 65d24814..db09a463 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,7 @@
+1.1.5
+-----
+* [Removed direct reference to Unix TMP-path](https://github.com/iipc/webarchive-commons/issues/2)
+
1.1.4
-----
* [All dates should be independent of locale settings](https://github.com/iipc/webarchive-commons/pull/22)
From f3e12da0bb53cb4ffb0d21b2d13cda1b6918b1d1 Mon Sep 17 00:00:00 2001
From: Thomas Edvardsen
Date: Wed, 24 Sep 2014 10:26:38 +0200
Subject: [PATCH 004/240] * changed newline from 0d0a to 0a in sourcfile
---
.../org/archive/net/PublicSuffixesTest.java | 386 +++++++++---------
1 file changed, 193 insertions(+), 193 deletions(-)
diff --git a/src/test/java/org/archive/net/PublicSuffixesTest.java b/src/test/java/org/archive/net/PublicSuffixesTest.java
index a82bab22..7528bbe1 100644
--- a/src/test/java/org/archive/net/PublicSuffixesTest.java
+++ b/src/test/java/org/archive/net/PublicSuffixesTest.java
@@ -1,193 +1,193 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.archive.net;
-
-import java.io.PrintWriter;
-import java.io.StringWriter;
-import java.util.ArrayList;
-import java.util.regex.Matcher;
-
-import junit.framework.TestCase;
-
-import org.archive.net.PublicSuffixes.Node;
-
-/**
- * Test cases for PublicSuffixes utility. Confirm expected matches/nonmatches
- * from constructed regex.
- *
- * @author gojomo
- */
-public class PublicSuffixesTest extends TestCase {
- // test of low level implementation
- private final String NL = System.lineSeparator();
-
- public void testCompare() {
- Node n = new Node("hoge");
- assertTrue(n.compareTo('a') > 0);
- assertEquals(-1, n.compareTo('*'));
- assertEquals(-1, n.compareTo('!'));
- assertEquals(-1, n.compareTo(new Node("*,")));
- assertEquals(-1, n.compareTo(new Node("!muga,")));
- assertEquals(-1, n.compareTo(new Node("")));
-
- n = new Node("*,");
- assertEquals(1, n.compareTo('a'));
- assertEquals(0, n.compareTo('*'));
- assertEquals(1, n.compareTo('!'));
- assertEquals(0, n.compareTo(new Node("*,")));
- assertEquals(1, n.compareTo(new Node("!muga,")));
- assertEquals(-1, n.compareTo(new Node("")));
-
- n = new Node("!hoge");
- assertEquals(1, n.compareTo('a'));
- assertEquals(-1, n.compareTo('*'));
- assertEquals(0, n.compareTo('!'));
- assertEquals(-1, n.compareTo(new Node("*,")));
- assertEquals(0, n.compareTo(new Node("!muga,")));
- assertEquals(-1, n.compareTo(new Node("")));
-
- n = new Node("");
- assertEquals(1, n.compareTo('a'));
- assertEquals(1, n.compareTo('*'));
- assertEquals(1, n.compareTo('!'));
- assertEquals(0, n.compareTo(new Node("")));
- }
-
- protected String dump(Node alt) {
- StringWriter w = new StringWriter();
- PublicSuffixes.dump(alt, 0, new PrintWriter(w));
- return w.toString();
- }
- public void testTrie1() {
- Node alt = new Node(null, new ArrayList());
- alt.addBranch("ac,");
- // specifically, should not have empty string as match.
- assertEquals("(null)" + NL + " \"ac,\"" + NL, dump(alt));
- alt.addBranch("ac,com,");
- assertEquals("(null)" + NL +
- " \"ac,\"" + NL +
- " \"com,\"" + NL +
- " \"\"" + NL, dump(alt));
- alt.addBranch("ac,edu,");
- assertEquals("(null)" + NL +
- " \"ac,\"" + NL +
- " \"com,\"" + NL +
- " \"edu,\"" + NL +
- " \"\"" + NL, dump(alt));
- }
- public void testTrie2() {
- Node alt = new Node(null, new ArrayList());
- alt.addBranch("ac,");
- alt.addBranch("*,");
- assertEquals("(null)" + NL +
- " \"ac,\"" + NL +
- " \"*,\"" + NL, dump(alt));
- }
-
- public void testTrie3() {
- Node alt = new Node(null, new ArrayList());
- alt.addBranch("ac,");
- alt.addBranch("ac,!hoge,");
- alt.addBranch("ac,*,");
- // exception goes first.
- assertEquals("(null)" + NL +
- " \"ac,\"" + NL +
- " \"!hoge,\"" + NL +
- " \"*,\"" + NL +
- " \"\"" + NL, dump(alt));
- }
-
- // test of higher-level functionality
-
- Matcher m = PublicSuffixes.getTopmostAssignedSurtPrefixPattern()
- .matcher("");
-
- public void testBasics() {
- matchPrefix("com,example,www,", "com,example,");
- matchPrefix("com,example,", "com,example,");
- matchPrefix("org,archive,www,", "org,archive,");
- matchPrefix("org,archive,", "org,archive,");
- matchPrefix("fr,yahoo,www,", "fr,yahoo,");
- matchPrefix("fr,yahoo,", "fr,yahoo,");
- matchPrefix("au,com,foobar,www,", "au,com,foobar,");
- matchPrefix("au,com,foobar,", "au,com,foobar,");
- matchPrefix("uk,co,virgin,www,", "uk,co,virgin,");
- matchPrefix("uk,co,virgin,", "uk,co,virgin,");
- matchPrefix("au,com,example,www,", "au,com,example,");
- matchPrefix("au,com,example,", "au,com,example,");
- matchPrefix("jp,yokohama,public,assigned,www,",
- "jp,yokohama,public,assigned,");
- matchPrefix("jp,yokohama,public,assigned,", "jp,yokohama,public,assigned,");
- }
-
- public void testDomainWithDash() {
- matchPrefix("de,bad-site,www", "de,bad-site,");
- }
-
- public void testDomainWithNumbers() {
- matchPrefix("de,archive4u,www", "de,archive4u,");
- }
-
- public void testIPV4() {
- assertEquals("unexpected reduction",
- "1.2.3.4",
- PublicSuffixes.reduceSurtToAssignmentLevel("1.2.3.4"));
- }
-
- public void testIPV6() {
- assertEquals("unexpected reduction",
- "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]",
- PublicSuffixes.reduceSurtToAssignmentLevel(
- "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]"));
- }
-
- public void testExceptions() {
- matchPrefix("uk,bl,www,", "uk,bl,");
- matchPrefix("uk,bl,", "uk,bl,");
- matchPrefix("jp,tokyo,city,subdomain,", "jp,tokyo,city,");
- matchPrefix("jp,tokyo,city,", "jp,tokyo,city,");
- }
-
- public void testFakeTLD() {
- // we assume any new/unknonwn TLD should be assumed as 2-level;
- // this is preferable for our grouping purpose but might not be
- // for a cookie-assigning browser (original purpose of publicsuffixlist)
- matchPrefix("zzz,example,www,", "zzz,example,");
- }
-
- public void testUnsegmentedHostname() {
- m.reset("example");
- assertFalse("unexpected match found in 'example'", m.find());
- }
-
- public void testTopmostAssignedCaching() {
- assertSame("topmostAssignedSurtPrefixPattern not cached",PublicSuffixes.getTopmostAssignedSurtPrefixPattern(),PublicSuffixes.getTopmostAssignedSurtPrefixPattern());
- assertSame("topmostAssignedSurtPrefixRegex not cached",PublicSuffixes.getTopmostAssignedSurtPrefixRegex(),PublicSuffixes.getTopmostAssignedSurtPrefixRegex());
- }
-
- // TODO: test UTF domains?
-
- protected void matchPrefix(String surtDomain, String expectedAssignedPrefix) {
- m.reset(surtDomain);
- assertTrue("expected match not found in '" + surtDomain, m.find());
- assertEquals("expected match not found", expectedAssignedPrefix, m
- .group());
- }
-}
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.net;
+
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+
+import junit.framework.TestCase;
+
+import org.archive.net.PublicSuffixes.Node;
+
+/**
+ * Test cases for PublicSuffixes utility. Confirm expected matches/nonmatches
+ * from constructed regex.
+ *
+ * @author gojomo
+ */
+public class PublicSuffixesTest extends TestCase {
+ // test of low level implementation
+ private final String NL = System.lineSeparator();
+
+ public void testCompare() {
+ Node n = new Node("hoge");
+ assertTrue(n.compareTo('a') > 0);
+ assertEquals(-1, n.compareTo('*'));
+ assertEquals(-1, n.compareTo('!'));
+ assertEquals(-1, n.compareTo(new Node("*,")));
+ assertEquals(-1, n.compareTo(new Node("!muga,")));
+ assertEquals(-1, n.compareTo(new Node("")));
+
+ n = new Node("*,");
+ assertEquals(1, n.compareTo('a'));
+ assertEquals(0, n.compareTo('*'));
+ assertEquals(1, n.compareTo('!'));
+ assertEquals(0, n.compareTo(new Node("*,")));
+ assertEquals(1, n.compareTo(new Node("!muga,")));
+ assertEquals(-1, n.compareTo(new Node("")));
+
+ n = new Node("!hoge");
+ assertEquals(1, n.compareTo('a'));
+ assertEquals(-1, n.compareTo('*'));
+ assertEquals(0, n.compareTo('!'));
+ assertEquals(-1, n.compareTo(new Node("*,")));
+ assertEquals(0, n.compareTo(new Node("!muga,")));
+ assertEquals(-1, n.compareTo(new Node("")));
+
+ n = new Node("");
+ assertEquals(1, n.compareTo('a'));
+ assertEquals(1, n.compareTo('*'));
+ assertEquals(1, n.compareTo('!'));
+ assertEquals(0, n.compareTo(new Node("")));
+ }
+
+ protected String dump(Node alt) {
+ StringWriter w = new StringWriter();
+ PublicSuffixes.dump(alt, 0, new PrintWriter(w));
+ return w.toString();
+ }
+ public void testTrie1() {
+ Node alt = new Node(null, new ArrayList());
+ alt.addBranch("ac,");
+ // specifically, should not have empty string as match.
+ assertEquals("(null)" + NL + " \"ac,\"" + NL, dump(alt));
+ alt.addBranch("ac,com,");
+ assertEquals("(null)" + NL +
+ " \"ac,\"" + NL +
+ " \"com,\"" + NL +
+ " \"\"" + NL, dump(alt));
+ alt.addBranch("ac,edu,");
+ assertEquals("(null)" + NL +
+ " \"ac,\"" + NL +
+ " \"com,\"" + NL +
+ " \"edu,\"" + NL +
+ " \"\"" + NL, dump(alt));
+ }
+ public void testTrie2() {
+ Node alt = new Node(null, new ArrayList());
+ alt.addBranch("ac,");
+ alt.addBranch("*,");
+ assertEquals("(null)" + NL +
+ " \"ac,\"" + NL +
+ " \"*,\"" + NL, dump(alt));
+ }
+
+ public void testTrie3() {
+ Node alt = new Node(null, new ArrayList());
+ alt.addBranch("ac,");
+ alt.addBranch("ac,!hoge,");
+ alt.addBranch("ac,*,");
+ // exception goes first.
+ assertEquals("(null)" + NL +
+ " \"ac,\"" + NL +
+ " \"!hoge,\"" + NL +
+ " \"*,\"" + NL +
+ " \"\"" + NL, dump(alt));
+ }
+
+ // test of higher-level functionality
+
+ Matcher m = PublicSuffixes.getTopmostAssignedSurtPrefixPattern()
+ .matcher("");
+
+ public void testBasics() {
+ matchPrefix("com,example,www,", "com,example,");
+ matchPrefix("com,example,", "com,example,");
+ matchPrefix("org,archive,www,", "org,archive,");
+ matchPrefix("org,archive,", "org,archive,");
+ matchPrefix("fr,yahoo,www,", "fr,yahoo,");
+ matchPrefix("fr,yahoo,", "fr,yahoo,");
+ matchPrefix("au,com,foobar,www,", "au,com,foobar,");
+ matchPrefix("au,com,foobar,", "au,com,foobar,");
+ matchPrefix("uk,co,virgin,www,", "uk,co,virgin,");
+ matchPrefix("uk,co,virgin,", "uk,co,virgin,");
+ matchPrefix("au,com,example,www,", "au,com,example,");
+ matchPrefix("au,com,example,", "au,com,example,");
+ matchPrefix("jp,yokohama,public,assigned,www,",
+ "jp,yokohama,public,assigned,");
+ matchPrefix("jp,yokohama,public,assigned,", "jp,yokohama,public,assigned,");
+ }
+
+ public void testDomainWithDash() {
+ matchPrefix("de,bad-site,www", "de,bad-site,");
+ }
+
+ public void testDomainWithNumbers() {
+ matchPrefix("de,archive4u,www", "de,archive4u,");
+ }
+
+ public void testIPV4() {
+ assertEquals("unexpected reduction",
+ "1.2.3.4",
+ PublicSuffixes.reduceSurtToAssignmentLevel("1.2.3.4"));
+ }
+
+ public void testIPV6() {
+ assertEquals("unexpected reduction",
+ "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]",
+ PublicSuffixes.reduceSurtToAssignmentLevel(
+ "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]"));
+ }
+
+ public void testExceptions() {
+ matchPrefix("uk,bl,www,", "uk,bl,");
+ matchPrefix("uk,bl,", "uk,bl,");
+ matchPrefix("jp,tokyo,city,subdomain,", "jp,tokyo,city,");
+ matchPrefix("jp,tokyo,city,", "jp,tokyo,city,");
+ }
+
+ public void testFakeTLD() {
+ // we assume any new/unknonwn TLD should be assumed as 2-level;
+ // this is preferable for our grouping purpose but might not be
+ // for a cookie-assigning browser (original purpose of publicsuffixlist)
+ matchPrefix("zzz,example,www,", "zzz,example,");
+ }
+
+ public void testUnsegmentedHostname() {
+ m.reset("example");
+ assertFalse("unexpected match found in 'example'", m.find());
+ }
+
+ public void testTopmostAssignedCaching() {
+ assertSame("topmostAssignedSurtPrefixPattern not cached",PublicSuffixes.getTopmostAssignedSurtPrefixPattern(),PublicSuffixes.getTopmostAssignedSurtPrefixPattern());
+ assertSame("topmostAssignedSurtPrefixRegex not cached",PublicSuffixes.getTopmostAssignedSurtPrefixRegex(),PublicSuffixes.getTopmostAssignedSurtPrefixRegex());
+ }
+
+ // TODO: test UTF domains?
+
+ protected void matchPrefix(String surtDomain, String expectedAssignedPrefix) {
+ m.reset(surtDomain);
+ assertTrue("expected match not found in '" + surtDomain, m.find());
+ assertEquals("expected match not found", expectedAssignedPrefix, m
+ .group());
+ }
+}
From faec599fc4a1cc8f09523e78cab073ed570b8adc Mon Sep 17 00:00:00 2001
From: RogerMathisen
Date: Wed, 24 Sep 2014 11:03:55 +0200
Subject: [PATCH 005/240] - Removed pointless code.
---
.../archive/util/iterator/SortedCompositeIteratorTest.java | 5 -----
1 file changed, 5 deletions(-)
diff --git a/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java b/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java
index 0f4dc68a..11ea1229 100644
--- a/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java
+++ b/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java
@@ -14,11 +14,6 @@ public class SortedCompositeIteratorTest extends TestCase {
public void testHasNext() throws FileNotFoundException, IOException {
- long t = 210000;
- long c = 134;
- float f = (float)c / (float)t;
- System.err.format("F(%f)\n",f);
-
File a = File.createTempFile("filea", null);
File b = File.createTempFile("fileb", null);
From 595851a1138529b4d7e633b2cfd5e4e28b6b6204 Mon Sep 17 00:00:00 2001
From: John Erik Halse
Date: Tue, 30 Sep 2014 14:45:33 +0200
Subject: [PATCH 006/240] Require the oldest recommended version of Maven 3
---
pom.xml | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/pom.xml b/pom.xml
index 6664efd8..0eee2ed2 100644
--- a/pom.xml
+++ b/pom.xml
@@ -214,7 +214,29 @@
+
+ org.apache.maven.plugins
+ maven-enforcer-plugin
+ 1.3.1
+
+
+ enforce-maven
+
+ enforce
+
+
+
+
+ This project requires Maven 3
+ 3.0.5
+
+
+
+
+
+
+
src/main/resources
From 46d0f6ffbad1b02fd7917c0e218eeed6557f3d9b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristinn=20Sigur=C3=B0sson?=
Date: Tue, 30 Sep 2014 15:10:48 +0000
Subject: [PATCH 007/240] Java 6 compatibility System.lineSeparator() was
introducted in Java 7
---
src/test/java/org/archive/net/PublicSuffixesTest.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/test/java/org/archive/net/PublicSuffixesTest.java b/src/test/java/org/archive/net/PublicSuffixesTest.java
index 7528bbe1..ca6e6408 100644
--- a/src/test/java/org/archive/net/PublicSuffixesTest.java
+++ b/src/test/java/org/archive/net/PublicSuffixesTest.java
@@ -36,7 +36,7 @@
*/
public class PublicSuffixesTest extends TestCase {
// test of low level implementation
- private final String NL = System.lineSeparator();
+ private final String NL = System.getProperty("line.separator");
public void testCompare() {
Node n = new Node("hoge");
From 6556c7f14e54d07f13fe49c4c1bc6ee88c18f134 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristinn=20Sigur=C3=B0sson?=
Date: Tue, 30 Sep 2014 16:09:54 +0000
Subject: [PATCH 008/240] Change test value to get around Java 8 bug Fixes
issue #31 which relates to changes in how Java rounds doubles in some edge
cases.
---
.../java/org/archive/util/ArchiveUtilsTest.java | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/src/test/java/org/archive/util/ArchiveUtilsTest.java b/src/test/java/org/archive/util/ArchiveUtilsTest.java
index 8251615a..586a1821 100644
--- a/src/test/java/org/archive/util/ArchiveUtilsTest.java
+++ b/src/test/java/org/archive/util/ArchiveUtilsTest.java
@@ -229,16 +229,19 @@ public void testByteArrayEquals() {
/** test doubleToString() */
public void testDoubleToString(){
- double test = 12.345;
- assertTrue(
+ double test = 12.121d;
+ assertEquals(
"cecking zero precision",
- ArchiveUtils.doubleToString(test, 0).equals("12"));
- assertTrue(
+ "12",
+ ArchiveUtils.doubleToString(test, 0));
+ assertEquals(
"cecking 2 character precision",
- ArchiveUtils.doubleToString(test, 2).equals("12.34"));
- assertTrue(
+ "12.12",
+ ArchiveUtils.doubleToString(test, 2));
+ assertEquals(
"cecking precision higher then the double has",
- ArchiveUtils.doubleToString(test, 65).equals("12.345"));
+ "12.121",
+ ArchiveUtils.doubleToString(test, 65));
}
From fbf4df7117e3fe5b812a047736836e6531936897 Mon Sep 17 00:00:00 2001
From: John Erik Halse
Date: Wed, 1 Oct 2014 07:48:22 +0200
Subject: [PATCH 009/240] Changed message.
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 0eee2ed2..f6730625 100644
--- a/pom.xml
+++ b/pom.xml
@@ -227,7 +227,7 @@
- This project requires Maven 3
+ This project requires Maven 3.0.5 or higher3.0.5
From fbbaab079b06260aa84b8b2d896a34db3a6872e3 Mon Sep 17 00:00:00 2001
From: John Erik Halse
Date: Wed, 1 Oct 2014 12:54:59 +0200
Subject: [PATCH 010/240] Update CHANGES.md
---
CHANGES.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/CHANGES.md b/CHANGES.md
index db09a463..a84f579e 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,6 +1,6 @@
1.1.5
-----
-* [Removed direct reference to Unix TMP-path](https://github.com/iipc/webarchive-commons/issues/2)
+* [Tests fail on Windows](https://github.com/iipc/webarchive-commons/issues/2)
1.1.4
-----
From 7914bdf04dbf5d0b431065b650a91773684ae757 Mon Sep 17 00:00:00 2001
From: John Erik Halse
Date: Wed, 1 Oct 2014 12:58:31 +0200
Subject: [PATCH 011/240] Update CHANGES.md
---
CHANGES.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/CHANGES.md b/CHANGES.md
index a84f579e..8e787634 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,6 +1,7 @@
1.1.5
-----
* [Tests fail on Windows](https://github.com/iipc/webarchive-commons/issues/2)
+* [Test fails on Java 8](https://github.com/iipc/webarchive-commons/issues/31)
1.1.4
-----
From 166656eb4b0dbfb16611a8b74e79c35b8954e72a Mon Sep 17 00:00:00 2001
From: John Erik Halse
Date: Mon, 6 Oct 2014 14:03:17 +0200
Subject: [PATCH 012/240] Added method to UsableUri to get the IDN in non-puny
form
---
src/main/java/org/archive/url/UsableURI.java | 48 +++++++++++++++++++
.../java/org/archive/url/UsableURITest.java | 12 +++++
2 files changed, 60 insertions(+)
diff --git a/src/main/java/org/archive/url/UsableURI.java b/src/main/java/org/archive/url/UsableURI.java
index b9c4ff9d..fa1de57a 100644
--- a/src/main/java/org/archive/url/UsableURI.java
+++ b/src/main/java/org/archive/url/UsableURI.java
@@ -18,6 +18,7 @@
*/
package org.archive.url;
+import gnu.inet.encoding.IDNA;
import java.io.File;
import java.io.IOException;
import java.io.ObjectOutputStream;
@@ -271,6 +272,53 @@ public String toString() {
return toCustomString();
}
+ /**
+ * In the case of a puny encoded IDN, this method returns the decoded Unicode version.
+ * @return decoded IDN version of URI
+ */
+ public String toUnicodeHostString() {
+ if (!_is_hostname) {
+ return toString();
+ }
+
+ try {
+ StringBuilder buf = new StringBuilder();
+
+ if (_scheme != null) {
+ buf.append(_scheme);
+ buf.append(':');
+ }
+ if (_is_net_path) {
+ buf.append("//");
+ if (_authority != null) { // has_authority
+ if (_userinfo != null) {
+ buf.append(_userinfo).append('@');
+ }
+ buf.append(IDNA.toUnicode(getHost()));
+ if (_port >= 0) {
+ buf.append(':').append(_port);
+ }
+ this._authority = buf.toString().toCharArray();
+ }
+ }
+ if (_opaque != null && _is_opaque_part) {
+ buf.append(_opaque);
+ } else if (_path != null) {
+ // _is_hier_part or _is_relativeURI
+ if (_path.length != 0) {
+ buf.append(_path);
+ }
+ }
+ if (_query != null) { // has_query
+ buf.append('?');
+ buf.append(_query);
+ }
+ return buf.toString();
+ } catch (URIException ex) {
+ throw new RuntimeException(ex);
+ }
+ }
+
public synchronized String getEscapedURI() {
if (this.cachedEscapedURI == null) {
this.cachedEscapedURI = super.getEscapedURI();
diff --git a/src/test/java/org/archive/url/UsableURITest.java b/src/test/java/org/archive/url/UsableURITest.java
index 2aec0e96..7588f03c 100644
--- a/src/test/java/org/archive/url/UsableURITest.java
+++ b/src/test/java/org/archive/url/UsableURITest.java
@@ -53,4 +53,16 @@ public void testSchemalessRelative() throws URIException {
UsableURI test = new UsableURI(base, relative);
assertEquals("http://www.facebook.com/?href=http://www.archive.org/a", test.toString());
}
+
+ /**
+ * Test of toUnicodeHostString method, of class UsableURI.
+ */
+ public void testToUnicodeHostString() throws URIException {
+ assertEquals("http://øx.dk", new UsableURI("http://xn--x-4ga.dk", true, "UTF-8").toUnicodeHostString());
+ assertEquals("xn--x-4ga.dk", new UsableURI("xn--x-4ga.dk", true, "UTF-8").toUnicodeHostString());
+ assertEquals("http://user:pass@øx.dk:8080", new UsableURI("http://user:pass@xn--x-4ga.dk:8080", true, "UTF-8").toUnicodeHostString());
+ assertEquals("http://user@øx.dk:8080", new UsableURI("http://user@xn--x-4ga.dk:8080", true, "UTF-8").toUnicodeHostString());
+ assertEquals("http://øx.dk/foo/bar?query=q", new UsableURI("http://xn--x-4ga.dk/foo/bar?query=q", true, "UTF-8").toUnicodeHostString());
+ assertEquals("http://127.0.0.1/foo/bar?query=q", new UsableURI("http://127.0.0.1/foo/bar?query=q", true, "UTF-8").toUnicodeHostString());
+ }
}
From 619412c284baf78e8fbb3e2391687e226c4ea0f1 Mon Sep 17 00:00:00 2001
From: John Erik Halse
Date: Mon, 10 Nov 2014 12:12:07 +0100
Subject: [PATCH 013/240] Fixed bug which changed the URI after calling
toUnicodeHostString.
---
src/main/java/org/archive/url/UsableURI.java | 4 +++-
src/test/java/org/archive/url/UsableURITest.java | 15 +++++++++++++++
2 files changed, 18 insertions(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/url/UsableURI.java b/src/main/java/org/archive/url/UsableURI.java
index fa1de57a..ed40f41a 100644
--- a/src/main/java/org/archive/url/UsableURI.java
+++ b/src/main/java/org/archive/url/UsableURI.java
@@ -274,6 +274,9 @@ public String toString() {
/**
* In the case of a puny encoded IDN, this method returns the decoded Unicode version.
+ *
+ * Most of this implementation is copied from {@link org.apache.commons.httpclient.URI#setURI()}.
+ *
* @return decoded IDN version of URI
*/
public String toUnicodeHostString() {
@@ -298,7 +301,6 @@ public String toUnicodeHostString() {
if (_port >= 0) {
buf.append(':').append(_port);
}
- this._authority = buf.toString().toCharArray();
}
}
if (_opaque != null && _is_opaque_part) {
diff --git a/src/test/java/org/archive/url/UsableURITest.java b/src/test/java/org/archive/url/UsableURITest.java
index 7588f03c..73694f79 100644
--- a/src/test/java/org/archive/url/UsableURITest.java
+++ b/src/test/java/org/archive/url/UsableURITest.java
@@ -64,5 +64,20 @@ public void testToUnicodeHostString() throws URIException {
assertEquals("http://user@øx.dk:8080", new UsableURI("http://user@xn--x-4ga.dk:8080", true, "UTF-8").toUnicodeHostString());
assertEquals("http://øx.dk/foo/bar?query=q", new UsableURI("http://xn--x-4ga.dk/foo/bar?query=q", true, "UTF-8").toUnicodeHostString());
assertEquals("http://127.0.0.1/foo/bar?query=q", new UsableURI("http://127.0.0.1/foo/bar?query=q", true, "UTF-8").toUnicodeHostString());
+
+ // test idn round trip
+ // XXX fails because idn is not handled here (it is converted to punycode in UsableURIFactory.fixupDomainlabel())
+ // assertEquals("http://øx.dk", new UsableURI("http://øx.dk", false, "UTF-8").toUnicodeHostString());
+ // To check the round trip it is then necessary to use the factory method in UsableURIFactory.
+ assertEquals("http://øx.dk/", UsableURIFactory.getInstance("http://øx.dk/", "UTF-8").toUnicodeHostString());
+
+ // non-idn domain name
+ assertEquals("http://example.org", new UsableURI("http://example.org", true, "UTF-8").toUnicodeHostString());
+
+ // ensure a call to toUnicodeHostString() has no effect on toString()
+ UsableURI uri = new UsableURI("http://xn--x-4ga.dk", true, "UTF-8");
+ assertEquals("http://øx.dk", uri.toUnicodeHostString());
+ uri.setPath(uri.getPath()); // force toString() cached value to be recomputed
+ assertEquals("http://xn--x-4ga.dk", uri.toString());
}
}
From 61f5a8cb7233f48196ea8fa305492d6b9f637b7f Mon Sep 17 00:00:00 2001
From: John Erik Halse
Date: Mon, 10 Nov 2014 12:14:08 +0100
Subject: [PATCH 014/240] Fixed bug that prevented the https scheme from using
static string.
---
src/main/java/org/archive/url/LaxURI.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/url/LaxURI.java b/src/main/java/org/archive/url/LaxURI.java
index 807333d3..e1cea9b7 100644
--- a/src/main/java/org/archive/url/LaxURI.java
+++ b/src/main/java/org/archive/url/LaxURI.java
@@ -211,7 +211,7 @@ protected void setURI() {
if (_scheme.length == 4 && Arrays.equals(_scheme, HTTP_SCHEME)) {
_scheme = HTTP_SCHEME;
} else if (_scheme.length == 5
- && Arrays.equals(_scheme, HTTP_SCHEME)) {
+ && Arrays.equals(_scheme, HTTPS_SCHEME)) {
_scheme = HTTPS_SCHEME;
}
}
From 6b7971f86eda7255c1d5ab05f7883da30db7fced Mon Sep 17 00:00:00 2001
From: John Erik Halse
Date: Mon, 10 Nov 2014 15:39:15 +0100
Subject: [PATCH 015/240] Removed unnecessary import
---
src/test/java/org/archive/url/UsableURITest.java | 1 -
1 file changed, 1 deletion(-)
diff --git a/src/test/java/org/archive/url/UsableURITest.java b/src/test/java/org/archive/url/UsableURITest.java
index 73694f79..2a2f41f5 100644
--- a/src/test/java/org/archive/url/UsableURITest.java
+++ b/src/test/java/org/archive/url/UsableURITest.java
@@ -21,7 +21,6 @@
import java.net.URISyntaxException;
import org.apache.commons.httpclient.URIException;
-import org.archive.url.UsableURI;
import junit.framework.TestCase;
From 363a3c51b40a5d559bfa6eb7d2f038b9258f577a Mon Sep 17 00:00:00 2001
From: Gerhard Gossen
Date: Wed, 17 Dec 2014 16:39:24 +0100
Subject: [PATCH 016/240] Improve URL escaping in CDX writer
---
.../extract/RealCDXExtractorOutput.java | 9 ++++--
.../extract/RealCDXExtractorOutputTest.java | 28 +++++++++++++++++++
2 files changed, 34 insertions(+), 3 deletions(-)
create mode 100644 src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java
diff --git a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java
index 62a423c5..8ca3ff82 100644
--- a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java
+++ b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java
@@ -4,6 +4,7 @@
import java.io.OutputStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
+import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.List;
@@ -307,12 +308,14 @@ private String extractHTMLMetaRefresh(String origUrl, MetaData m) {
return "-";
}
- private String resolve(String context, String spec) {
+ static String resolve(String context, String spec) {
// TODO: test!
try {
URL cUrl = new URL(context);
- URL resolved = new URL(cUrl,spec);
- return resolved.toURI().toASCIIString();
+ URL url = new URL(cUrl, spec);
+ // this constructor escapes its arguments, if necessary
+ URI uri = new URI(url.getProtocol(), url.getHost(), url.getPath(), url.getQuery(), url.getRef());
+ return uri.toASCIIString();
} catch (URISyntaxException e) {
} catch (MalformedURLException e) {
diff --git a/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java b/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java
new file mode 100644
index 00000000..14f8489d
--- /dev/null
+++ b/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java
@@ -0,0 +1,28 @@
+package org.archive.extract;
+
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.net.URLEncoder;
+
+import junit.framework.TestCase;
+
+
+public class RealCDXExtractorOutputTest extends TestCase {
+
+ public void testEscapeResolvedUrl() throws Exception {
+ String context ="http://www.uni-giessen.de/cms/studium/dateien/informationberatung/merkblattpdf";
+ String spec = "http://fss.plone.uni-giessen.de/fß/studium/dateien/informationberatung/merkblattpdf/file/Mérkblatt zur Gestaltung von Nachteilsausgleichen.pdf?föo=bar#änchor";
+ String escaped = RealCDXExtractorOutput.resolve(context, spec);
+ assertTrue(escaped.indexOf(" ") < 0);
+ URI parsed = new URI(escaped);
+ assertEquals("änchor", parsed.getFragment());
+ }
+
+ public void testNoDoubleEscaping() throws Exception {
+ String spec = "https://www.google.com/search?q=java+escape+url+spaces&ie=utf-8&oe=utf-8";
+ String resolved = RealCDXExtractorOutput.resolve(spec, spec);
+ assertTrue(spec.equals(resolved));
+ }
+}
From 1ee18d8a426a0b18aa502f71896d9962416262a0 Mon Sep 17 00:00:00 2001
From: Gerhard Gossen
Date: Wed, 17 Dec 2014 17:12:42 +0100
Subject: [PATCH 017/240] Update CHANGES.md
---
CHANGES.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/CHANGES.md b/CHANGES.md
index 8e787634..7fb2f7c4 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,6 @@
1.1.5
-----
+* [Escape redirect URLs in RealCDXExtractorOutput](https://github.com/iipc/webarchive-commons/pull/36)
* [Tests fail on Windows](https://github.com/iipc/webarchive-commons/issues/2)
* [Test fails on Java 8](https://github.com/iipc/webarchive-commons/issues/31)
From f130aad04b255e7d8cd4eee4bac86c25b0cbbf36 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Tue, 6 Jan 2015 15:54:59 -0800
Subject: [PATCH 018/240] move RecordingOutputStreamTest.java from heritrix to
webarchive-commons
---
.../archive/io/RecordingOutputStreamTest.java | 260 ++++++++++++++++++
1 file changed, 260 insertions(+)
create mode 100644 src/test/java/org/archive/io/RecordingOutputStreamTest.java
diff --git a/src/test/java/org/archive/io/RecordingOutputStreamTest.java b/src/test/java/org/archive/io/RecordingOutputStreamTest.java
new file mode 100644
index 00000000..1c53549b
--- /dev/null
+++ b/src/test/java/org/archive/io/RecordingOutputStreamTest.java
@@ -0,0 +1,260 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+
+import org.archive.util.TmpDirTestCase;
+
+
+/**
+ * Test casesfor RecordingOutputStream.
+ *
+ * @author stack
+ */
+public class RecordingOutputStreamTest extends TmpDirTestCase
+{
+ /**
+ * Size of buffer used in tests.
+ */
+ private static final int BUFFER_SIZE = 5;
+
+ /**
+ * How much to write total to testing RecordingOutputStream.
+ */
+ private static final int WRITE_TOTAL = 10;
+
+
+ /*
+ * @see TmpDirTestCase#setUp()
+ */
+ protected void setUp() throws Exception
+ {
+ super.setUp();
+ }
+
+ /**
+ * Test reusing instance of RecordingOutputStream.
+ *
+ * @throws IOException Failed open of backing file or opening of
+ * input streams verifying recording.
+ */
+ public void testReuse()
+ throws IOException
+ {
+ final String BASENAME = "testReuse";
+ cleanUpOldFiles(BASENAME);
+ RecordingOutputStream ros = new RecordingOutputStream(BUFFER_SIZE,
+ (new File(getTmpDir(), BASENAME + "Bkg.txt")).getAbsolutePath());
+ for (int i = 0; i < 3; i++)
+ {
+ reuse(BASENAME, ros, i);
+ }
+ }
+
+ private void reuse(String baseName, RecordingOutputStream ros, int index)
+ throws IOException
+ {
+ final String BASENAME = baseName + Integer.toString(index);
+ File f = writeIntRecordedFile(ros, BASENAME, WRITE_TOTAL);
+ verifyRecording(ros, f, WRITE_TOTAL);
+ // Do again to test that I can get a new ReplayInputStream on same
+ // RecordingOutputStream.
+ verifyRecording(ros, f, WRITE_TOTAL);
+ }
+
+ /**
+ * Method to test for void write(int).
+ *
+ * Uses small buffer size and small write size. Test mark and reset too.
+ *
+ * @throws IOException Failed open of backing file or opening of
+ * input streams verifying recording.
+ */
+ public void testWriteint()
+ throws IOException
+ {
+ final String BASENAME = "testWriteint";
+ cleanUpOldFiles(BASENAME);
+ RecordingOutputStream ros = new RecordingOutputStream(BUFFER_SIZE,
+ (new File(getTmpDir(), BASENAME + "Backing.txt")).getAbsolutePath());
+ File f = writeIntRecordedFile(ros, BASENAME, WRITE_TOTAL);
+ verifyRecording(ros, f, WRITE_TOTAL);
+ // Do again to test that I can get a new ReplayInputStream on same
+ // RecordingOutputStream.
+ verifyRecording(ros, f, WRITE_TOTAL);
+ }
+
+ /**
+ * Method to test for void write(byte []).
+ *
+ * Uses small buffer size and small write size.
+ *
+ * @throws IOException Failed open of backing file or opening of
+ * input streams verifying recording.
+ */
+ public void testWritebytearray()
+ throws IOException
+ {
+ final String BASENAME = "testWritebytearray";
+ cleanUpOldFiles(BASENAME);
+ RecordingOutputStream ros = new RecordingOutputStream(BUFFER_SIZE,
+ (new File(getTmpDir(), BASENAME + "Backing.txt")).getAbsolutePath());
+ File f = writeByteRecordedFile(ros, BASENAME, WRITE_TOTAL);
+ verifyRecording(ros, f, WRITE_TOTAL);
+ // Do again to test that I can get a new ReplayInputStream on same
+ // RecordingOutputStream.
+ verifyRecording(ros, f, WRITE_TOTAL);
+ }
+
+ /**
+ * Test mark and reset.
+ * @throws IOException
+ */
+ public void testMarkReset() throws IOException
+ {
+ final String BASENAME = "testMarkReset";
+ cleanUpOldFiles(BASENAME);
+ RecordingOutputStream ros = new RecordingOutputStream(BUFFER_SIZE,
+ (new File(getTmpDir(), BASENAME + "Backing.txt")).getAbsolutePath());
+ File f = writeByteRecordedFile(ros, BASENAME, WRITE_TOTAL);
+ verifyRecording(ros, f, WRITE_TOTAL);
+ ReplayInputStream ris = ros.getReplayInputStream();
+ ris.mark(10 /*Arbitrary value*/);
+ // Read from the stream.
+ ris.read();
+ ris.read();
+ ris.read();
+ // Reset it. It should be back at zero.
+ ris.reset();
+ assertEquals("Reset to zero", ris.read(), 0);
+ assertEquals("Reset to zero char 1", ris.read(), 1);
+ assertEquals("Reset to zero char 2", ris.read(), 2);
+ // Mark stream. Here. Next character should be '3'.
+ ris.mark(10 /* Arbitrary value*/);
+ ris.read();
+ ris.read();
+ ris.reset();
+ assertEquals("Reset to zero char 3", ris.read(), 3);
+ }
+
+ /**
+ * Record a file write.
+ *
+ * Write a file w/ characters that start at null and ascend to
+ * filesize. Record the writing w/ passed ros
+ * recordingoutputstream. Return the file recorded as result of method.
+ * The file output stream that is recorded is named
+ * basename + ".txt".
+ *
+ *
This method writes a character at a time.
+ *
+ * @param ros RecordingOutputStream to record with.
+ * @param basename Basename of file.
+ * @param size How many characters to write.
+ * @return Recorded output stream.
+ */
+ private File writeIntRecordedFile(RecordingOutputStream ros,
+ String basename, int size)
+ throws IOException
+ {
+ File f = new File(getTmpDir(), basename + ".txt");
+ FileOutputStream fos = new FileOutputStream(f);
+ ros.open(fos);
+ for (int i = 0; i < WRITE_TOTAL; i++)
+ {
+ ros.write(i);
+ }
+ ros.close();
+ fos.close();
+ assertEquals("Content-Length test", size,
+ ros.getResponseContentLength());
+ return f;
+ }
+
+ /**
+ * Record a file byte array write.
+ *
+ * Write a file w/ characters that start at null and ascend to
+ * filesize. Record the writing w/ passed ros
+ * recordingoutputstream. Return the file recorded as result of method.
+ * The file output stream that is recorded is named
+ * basename + ".txt".
+ *
+ *
This method writes using a byte array.
+ *
+ * @param ros RecordingOutputStream to record with.
+ * @param basename Basename of file.
+ * @param size How many characters to write.
+ * @return Recorded output stream.
+ */
+ private File writeByteRecordedFile(RecordingOutputStream ros,
+ String basename, int size)
+ throws IOException
+ {
+ File f = new File(getTmpDir(), basename + ".txt");
+ FileOutputStream fos = new FileOutputStream(f);
+ ros.open(fos);
+ byte [] b = new byte[size];
+ for (int i = 0; i < size; i++)
+ {
+ b[i] = (byte)i;
+ }
+ ros.write(b);
+ ros.close();
+ fos.close();
+ assertEquals("Content-Length test", size,
+ ros.getResponseContentLength());
+ return f;
+ }
+
+ /**
+ * Verify what was written is both in the file written to and in the
+ * recording stream.
+ *
+ * @param ros Stream to check.
+ * @param f File that was recorded. Stream should have its content
+ * exactly.
+ * @param size Amount of bytes written.
+ *
+ * @exception IOException Failure reading streams.
+ */
+ private void verifyRecording(RecordingOutputStream ros, File f,
+ int size) throws IOException
+ {
+ assertEquals("Recorded file size.", size, f.length());
+ FileInputStream fis = new FileInputStream(f);
+ assertNotNull("FileInputStream not null", fis);
+ ReplayInputStream ris = ros.getReplayInputStream();
+ assertNotNull("ReplayInputStream not null", ris);
+ for (int i = 0; i < size; i++)
+ {
+ assertEquals("ReplayInputStream content verification", i,
+ ris.read());
+ assertEquals("Recorded file content verification", i,
+ fis.read());
+ }
+ assertEquals("ReplayInputStream at EOF", -1, ris.read());
+ fis.close();
+ ris.close();
+ }
+}
From da5d63d41d83fe4d5ea6d14165830e75c568c9a2 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Tue, 6 Jan 2015 15:58:31 -0800
Subject: [PATCH 019/240] fix for
https://github.com/iipc/webarchive-commons/issues/38 - detect end of http
protocol headers in a smarter way, to avoid calling write(byte) repeatedly;
add unit tests
---
.../org/archive/io/RecordingOutputStream.java | 49 +++++++--
.../archive/io/RecordingOutputStreamTest.java | 100 ++++++++++++++++++
2 files changed, 142 insertions(+), 7 deletions(-)
diff --git a/src/main/java/org/archive/io/RecordingOutputStream.java b/src/main/java/org/archive/io/RecordingOutputStream.java
index fe05701c..7d2ff212 100644
--- a/src/main/java/org/archive/io/RecordingOutputStream.java
+++ b/src/main/java/org/archive/io/RecordingOutputStream.java
@@ -242,6 +242,26 @@ public void write(int b) throws IOException {
checkLimits();
}
+ private int findMessageBodyBeginMark(byte[] b, int off, int len) {
+ if ((lastTwoBytes[1] == '\n' || lastTwoBytes[0] == '\n' && lastTwoBytes[1] == '\r')
+ && len >= 1 && b[off] == '\n') {
+ return 1;
+ } else if (lastTwoBytes[1] == '\n' && len >= 2 && b[off] == '\r' && b[off+1] == '\n') {
+ return 2;
+ }
+
+ for (int i = off; i < off + len - 1; i++) {
+ if (b[i] == '\n' && b[i+1] == '\n') {
+ return i + 2;
+ } else if (b[i] == '\n' && b[i+1] == '\r'
+ && i + 2 < off + len && b[i+2] == '\n') {
+ return i + 3;
+ }
+ }
+
+ return -1;
+ }
+
public void write(byte[] b, int off, int len) throws IOException {
if(position < maxPosition) {
if(position+len<=maxPosition) {
@@ -255,20 +275,35 @@ public void write(byte[] b, int off, int len) throws IOException {
off += consumeRange;
len -= consumeRange;
}
-
- // see comment on int[] lastTwoBytes
- while (messageBodyBeginMark < 0 && len > 0) {
- write(b[off]);
- off++;
- len--;
+
+ if (messageBodyBeginMark < 0) {
+ // see comment on int[] lastTwoBytes
+ int mark = findMessageBodyBeginMark(b, off, len);
+ if (mark > 0) {
+ if(recording) {
+ record(b, off, mark - off);
+ }
+ if (this.out != null) {
+ this.out.write(b, off, mark - off);
+ }
+ markMessageBodyBegin();
+ len = len - (mark - off);
+ off = mark;
+ }
}
-
+
if(recording) {
record(b, off, len);
}
if (this.out != null) {
this.out.write(b, off, len);
}
+ if (len >= 1) {
+ lastTwoBytes[1] = b[off + len - 1];
+ if (len >= 2) {
+ lastTwoBytes[0] = b[off + len - 2];
+ }
+ }
checkLimits();
}
diff --git a/src/test/java/org/archive/io/RecordingOutputStreamTest.java b/src/test/java/org/archive/io/RecordingOutputStreamTest.java
index 1c53549b..f697ff31 100644
--- a/src/test/java/org/archive/io/RecordingOutputStreamTest.java
+++ b/src/test/java/org/archive/io/RecordingOutputStreamTest.java
@@ -18,11 +18,13 @@
*/
package org.archive.io;
+import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
+import org.archive.util.Base32;
import org.archive.util.TmpDirTestCase;
@@ -257,4 +259,102 @@ private void verifyRecording(RecordingOutputStream ros, File f,
fis.close();
ris.close();
}
+
+ public void testMessageBodyBegin() throws IOException {
+ final String BASENAME = "testMessageBodyBegin";
+ cleanUpOldFiles(BASENAME);
+ RecordingOutputStream ros = new RecordingOutputStream(BUFFER_SIZE,
+ (new File(getTmpDir(), BASENAME + "Backing.txt")).getAbsolutePath());
+ ros.setSha1Digest();
+
+ ros.open(new ByteArrayOutputStream());
+ ros.write("0123456789\n\nabcdefghij".getBytes());
+ assertEquals(12, ros.getMessageBodyBegin());
+ assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
+ ros.close();
+
+ ros.open(new ByteArrayOutputStream());
+ ros.write("0123456789\r\n\r\nabcdefghij".getBytes());
+ assertEquals(14, ros.getMessageBodyBegin());
+ assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
+ ros.close();
+
+ ros.open(new ByteArrayOutputStream());
+ ros.write("0123456789\n\r\nabcdefghij".getBytes());
+ assertEquals(13, ros.getMessageBodyBegin());
+ assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
+ ros.close();
+
+ ros.open(new ByteArrayOutputStream());
+ ros.write("0123456789\n".getBytes());
+ assertEquals(-1, ros.getMessageBodyBegin());
+ ros.write("\nabcdefghij".getBytes());
+ assertEquals(12, ros.getMessageBodyBegin());
+ assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
+ ros.close();
+
+ ros.open(new ByteArrayOutputStream());
+ ros.write("0123456789\n".getBytes());
+ assertEquals(-1, ros.getMessageBodyBegin());
+ ros.write("\r\nabcdefghij".getBytes());
+ assertEquals(13, ros.getMessageBodyBegin());
+ assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
+ ros.close();
+
+ ros.open(new ByteArrayOutputStream());
+ ros.write("0123456789\n\r".getBytes());
+ assertEquals(-1, ros.getMessageBodyBegin());
+ ros.write("\nabcdefghij".getBytes());
+ assertEquals(13, ros.getMessageBodyBegin());
+ assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
+ ros.close();
+
+ ros.open(new ByteArrayOutputStream());
+ ros.write("0123456789".getBytes());
+ ros.write('\n');
+ assertEquals(-1, ros.getMessageBodyBegin());
+ ros.write("\nabcdefghij".getBytes());
+ assertEquals(12, ros.getMessageBodyBegin());
+ assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
+ ros.close();
+
+ ros.open(new ByteArrayOutputStream());
+ ros.write("0123456789".getBytes());
+ ros.write('\n');
+ ros.write('\n');
+ for (int b: "abcdefghij".getBytes()) {
+ ros.write(b);
+ }
+ assertEquals(12, ros.getMessageBodyBegin());
+ assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
+ ros.close();
+
+ ros.open(new ByteArrayOutputStream());
+ ros.write("0123456789".getBytes());
+ ros.write('\n');
+ ros.write('\r');
+ ros.write('\n');
+ for (int b: "abcdefghij".getBytes()) {
+ ros.write(b);
+ }
+ assertEquals(13, ros.getMessageBodyBegin());
+ assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
+ ros.close();
+
+ ros.open(new ByteArrayOutputStream());
+ ros.write("0123456789\n".getBytes());
+ ros.write('\n');
+ ros.write("abcdefghij".getBytes());
+ assertEquals(12, ros.getMessageBodyBegin());
+ assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
+ ros.close();
+
+ ros.open(new ByteArrayOutputStream());
+ ros.write("0123456789\n\r".getBytes());
+ ros.write('\n');
+ ros.write("abcdefghij".getBytes());
+ assertEquals(13, ros.getMessageBodyBegin());
+ assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
+ ros.close();
+ }
}
From 808dcfe76002ebc126c168abb5b6f00b5d3b7e07 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Tue, 6 Jan 2015 16:08:48 -0800
Subject: [PATCH 020/240] move TmpDirTestCase.java from heritrix to
webarchive-commons
---
.../java/org/archive/util/TmpDirTestCase.java | 119 ++++++++++++++++++
1 file changed, 119 insertions(+)
create mode 100644 src/main/java/org/archive/util/TmpDirTestCase.java
diff --git a/src/main/java/org/archive/util/TmpDirTestCase.java b/src/main/java/org/archive/util/TmpDirTestCase.java
new file mode 100644
index 00000000..09ec345b
--- /dev/null
+++ b/src/main/java/org/archive/util/TmpDirTestCase.java
@@ -0,0 +1,119 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.util;
+
+import java.io.File;
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+
+/**
+ * Base class for TestCases that want access to a tmp dir for the writing
+ * of files.
+ *
+ * @author stack
+ */
+public abstract class TmpDirTestCase extends TestCase
+{
+ /**
+ * Name of the system property that holds pointer to tmp directory into
+ * which we can safely write files.
+ */
+ public static final String TEST_TMP_SYSTEM_PROPERTY_NAME = "testtmpdir";
+
+ /**
+ * Default test tmp.
+ */
+ public static final String DEFAULT_TEST_TMP_DIR = File.separator + "tmp" +
+ File.separator + "heritrix-junit-tests";
+
+ /**
+ * Directory to write temporary files to.
+ */
+ private File tmpDir = null;
+
+
+ public TmpDirTestCase()
+ {
+ super();
+ }
+
+ public TmpDirTestCase(String testName)
+ {
+ super(testName);
+ }
+
+ /*
+ * @see TestCase#setUp()
+ */
+ protected void setUp() throws Exception {
+ super.setUp();
+ this.tmpDir = tmpDir();
+ }
+
+ /**
+ * @return Returns the tmpDir.
+ */
+ public File getTmpDir()
+ {
+ return this.tmpDir;
+ }
+
+ /**
+ * Delete any files left over from previous run.
+ *
+ * @param basename Base name of files we're to clean up.
+ */
+ public void cleanUpOldFiles(String basename) {
+ cleanUpOldFiles(getTmpDir(), basename);
+ }
+
+ /**
+ * Delete any files left over from previous run.
+ *
+ * @param prefix Base name of files we're to clean up.
+ * @param basedir Directory to start cleaning in.
+ */
+ public void cleanUpOldFiles(File basedir, String prefix) {
+ File [] files = FileUtils.getFilesWithPrefix(basedir, prefix);
+ if (files != null) {
+ for (int i = 0; i < files.length; i++) {
+ org.apache.commons.io.FileUtils.deleteQuietly(files[i]);
+ }
+ }
+ }
+
+
+ public static File tmpDir() throws IOException {
+ String tmpDirStr = System.getProperty(TEST_TMP_SYSTEM_PROPERTY_NAME);
+ tmpDirStr = (tmpDirStr == null)? DEFAULT_TEST_TMP_DIR: tmpDirStr;
+ File tmpDir = new File(tmpDirStr);
+ FileUtils.ensureWriteableDirectory(tmpDir);
+
+ if (!tmpDir.canWrite())
+ {
+ throw new IOException(tmpDir.getAbsolutePath() +
+ " is unwriteable.");
+ }
+
+ return tmpDir;
+ }
+}
From eda46e2554f52d0514de04b6624f81964e67289d Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Tue, 6 Jan 2015 16:24:39 -0800
Subject: [PATCH 021/240] update junit dependency since TmpDirTestCase.java is
not in the "test" area
---
pom.xml | 1 -
1 file changed, 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 6664efd8..df8d0928 100644
--- a/pom.xml
+++ b/pom.xml
@@ -65,7 +65,6 @@
junitjunit3.8.1
- test
From c77d6f5b0dcd899f5adff3db8eab87319cc162ed Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Tue, 27 Jan 2015 14:55:45 -0800
Subject: [PATCH 022/240] update CHANGES.md
---
CHANGES.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/CHANGES.md b/CHANGES.md
index 7fb2f7c4..b872846d 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -3,6 +3,7 @@
* [Escape redirect URLs in RealCDXExtractorOutput](https://github.com/iipc/webarchive-commons/pull/36)
* [Tests fail on Windows](https://github.com/iipc/webarchive-commons/issues/2)
* [Test fails on Java 8](https://github.com/iipc/webarchive-commons/issues/31)
+* [RecordingOutputStream can affect tcp packets sent in an undesirable way](https://github.com/iipc/webarchive-commons/issues/38)
1.1.4
-----
From 5df4d91d8cb7c4c2943c318eb44cb9579ac55597 Mon Sep 17 00:00:00 2001
From: Andrew Jackson
Date: Wed, 4 Feb 2015 10:10:11 +0000
Subject: [PATCH 023/240] [maven-release-plugin] prepare release
webarchive-commons-1.1.5
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index df8d0928..0ed119b8 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.5-SNAPSHOT
+ 1.1.5jarwebarchive-commons
From 62ff2fefb02e9bd24d7c41945628006682c00ce1 Mon Sep 17 00:00:00 2001
From: Andrew Jackson
Date: Wed, 4 Feb 2015 10:10:14 +0000
Subject: [PATCH 024/240] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 0ed119b8..7a32de08 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.5
+ 1.1.6-SNAPSHOTjarwebarchive-commons
From c44e320ae6df411403a50a4bddfcdfa0c27898f7 Mon Sep 17 00:00:00 2001
From: Gerhard Gossen
Date: Mon, 15 Jun 2015 13:25:59 +0200
Subject: [PATCH 025/240] Handle empty String argument in
CharsetDetector.trimAttrValue
---
.../java/org/archive/format/text/charset/CharsetDetector.java | 3 +++
1 file changed, 3 insertions(+)
diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java
index d391aac3..ae71b5fa 100644
--- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java
+++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java
@@ -178,6 +178,9 @@ protected String getCharsetFromMeta(byte buffer[],int len) throws IOException {
}
private static String trimAttrValue(String value) {
+ if (value.isEmpty()) {
+ return value;
+ }
String result = value;
if (result.charAt(0) == '"') {
result = result.substring(1, result.length() - 1);
From c7daf46e75b7d9cebee9de9f2c54560f333e6976 Mon Sep 17 00:00:00 2001
From: Andy Jackson
Date: Mon, 15 Jun 2015 22:26:08 +0100
Subject: [PATCH 026/240] Update CHANGES.md
---
CHANGES.md | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/CHANGES.md b/CHANGES.md
index b872846d..c43ff93e 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,7 @@
+1.1.6
+-----
+* [Handle empty String argument in CharsetDetector.trimAttrValue](https://github.com/iipc/webarchive-commons/pull/49)
+
1.1.5
-----
* [Escape redirect URLs in RealCDXExtractorOutput](https://github.com/iipc/webarchive-commons/pull/36)
From 6fcd096e6563d27244b8e37423af7b5aa58a9e6f Mon Sep 17 00:00:00 2001
From: RogerMathisen
Date: Tue, 14 Jul 2015 10:36:12 +0200
Subject: [PATCH 027/240] Adding commit for: Fix issues #42 #43 #44 #45 and
#47 #46
---
CHANGES.md | 6 ++
pom.xml | 2 +-
.../extract/RealCDXExtractorOutput.java | 2 +-
.../archive/extract/ResourceExtractor.java | 14 +++-
.../WARCMetadataRecordExtractorOutput.java | 2 +-
.../archive/extract/WATExtractorOutput.java | 71 ++++++++++++++++---
.../archive/resource/ResourceConstants.java | 3 +-
.../archive/resource/warc/WARCResource.java | 8 ++-
.../record/WARCMetaDataResourceFactory.java | 4 +-
src/main/java/org/archive/util/IAUtils.java | 33 +++++++++
.../resources/org/archive/commons.properties | 5 ++
11 files changed, 128 insertions(+), 22 deletions(-)
create mode 100644 src/main/resources/org/archive/commons.properties
diff --git a/CHANGES.md b/CHANGES.md
index c43ff93e..70f9b052 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,6 +1,12 @@
1.1.6
-----
* [Handle empty String argument in CharsetDetector.trimAttrValue](https://github.com/iipc/webarchive-commons/pull/49)
+* [WAT extractor: adding information in WAT's warcinfo](https://github.com/iipc/webarchive-commons/issues/47)
+* [WAT extractor: missing WARC format version](https://github.com/iipc/webarchive-commons/issues/45)
+* [WAT extractor: envelope structure does not conform to the WAT specification](https://github.com/iipc/webarchive-commons/issues/44)
+* [WAT extractor: WARC-Date in all records should be the WAT record generation date](https://github.com/iipc/webarchive-commons/issues/43)
+* [WAT extractor: WARC-Filename in the WAT warcinfo record should be the WAT filename itself](https://github.com/iipc/webarchive-commons/issues/42)
+* [WAT extractor: Entity-Trailing-Slop-Bytes should be called Entity-Trailing-Slop-Length](https://github.com/iipc/webarchive-commons/issues/48)
1.1.5
-----
diff --git a/pom.xml b/pom.xml
index d3314679..7984edde 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.6-SNAPSHOT
+ 1.1.7-SNAPSHOTjarwebarchive-commons
diff --git a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java
index 8ca3ff82..e6f6e82f 100644
--- a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java
+++ b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java
@@ -104,7 +104,7 @@ public void output(Resource resource) throws IOException {
String meta = "TBD";
String redir = "TBD";
- if(format.equals("WARC")) {
+ if(format.startsWith("WARC")) {
origUrl = getWARCURL(m);
date = getWARCDate(m);
String type = getWARCType(m);
diff --git a/src/main/java/org/archive/extract/ResourceExtractor.java b/src/main/java/org/archive/extract/ResourceExtractor.java
index 7f4d6e7a..2812aa5b 100644
--- a/src/main/java/org/archive/extract/ResourceExtractor.java
+++ b/src/main/java/org/archive/extract/ResourceExtractor.java
@@ -1,6 +1,7 @@
package org.archive.extract;
import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
@@ -74,7 +75,7 @@ public int run(String[] args)
if(args.length < 1) {
return USAGE(1);
}
- if(args.length > 3) {
+ if(args.length > 4) {
return USAGE(1);
}
int max = Integer.MAX_VALUE;
@@ -89,7 +90,14 @@ public int run(String[] args)
}
}
String path = args[arg];
- if(args.length == arg + 2) {
+ String outputFile = null;
+ if(args.length >= arg + 2) {
+ //if a output file is specified in the command line
+ if(args.length == arg + 3) {
+ outputFile = args[arg+2];
+ os.close();
+ os = new FileOutputStream(outputFile);
+ }
if(args[arg].equals("-cdx")) {
path = args[arg+1];
out = new RealCDXExtractorOutput(makePrintWriter(os));
@@ -100,7 +108,7 @@ public int run(String[] args)
} else if(args[arg].equals("-wat")) {
path = args[arg+1];
- out = new WATExtractorOutput(os);
+ out = new WATExtractorOutput(os, outputFile);
} else {
String filter = args[arg+1];
out = new JSONViewExtractorOutput(os, filter);
diff --git a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java
index ff46a914..68f9d1c8 100644
--- a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java
+++ b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java
@@ -68,7 +68,7 @@ public void output(Resource resource) throws IOException {
String date = "TBD";
String canUrl = "TBD";
- if(format.equals("WARC")) {
+ if(format.startsWith("WARC")) {
origUrl = getWARCURL(m);
date = getWARCDate(m);
String type = getWARCType(m);
diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java
index f4d27147..3bcfa924 100644
--- a/src/main/java/org/archive/extract/WATExtractorOutput.java
+++ b/src/main/java/org/archive/extract/WATExtractorOutput.java
@@ -2,11 +2,13 @@
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
+import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.text.ParseException;
+import java.net.UnknownHostException;
import java.util.Date;
import org.archive.format.gzip.GZIPMemberWriter;
@@ -22,6 +24,12 @@
import org.archive.util.io.CommitedOutputStream;
import org.json.JSONException;
+import java.net.InetAddress;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+
+import java.util.logging.Logger;
+
public class WATExtractorOutput implements ExtractorOutput {
WARCRecordWriter recW;
private boolean wroteFirst;
@@ -29,11 +37,15 @@ public class WATExtractorOutput implements ExtractorOutput {
private static int DEFAULT_BUFFER_RAM = 1024 * 1024;
private int bufferRAM = DEFAULT_BUFFER_RAM;
private final static Charset UTF8 = Charset.forName("UTF-8");
+ private String outputFile;
+
+ private static final Logger LOG = Logger.getLogger(WATExtractorOutput.class.getName());
- public WATExtractorOutput(OutputStream out) {
+ public WATExtractorOutput(OutputStream out, String outputFile) {
gzW = new GZIPMemberWriter(out);
recW = new WARCRecordWriter();
wroteFirst = false;
+ this.outputFile = outputFile;
}
private CommitedOutputStream getOutput() {
@@ -56,9 +68,9 @@ public void output(Resource resource) throws IOException {
throw new IOException("Missing Envelope.Format");
}
cos = getOutput();
- if(envelopeFormat.equals("ARC")) {
+ if(envelopeFormat.startsWith("ARC")) {
writeARC(cos,top);
- } else if(envelopeFormat.equals("WARC")) {
+ } else if(envelopeFormat.startsWith("WARC")) {
writeWARC(cos,top);
} else {
// hrm...
@@ -68,13 +80,51 @@ public void output(Resource resource) throws IOException {
}
private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException {
- String filename = JSONUtils.extractSingle(md, "Container.Filename");
- if(filename == null) {
- throw new IOException("No Container.Filename...");
+ // filename is given in the command line
+ String filename = outputFile;
+ if (filename == null || filename.length() == 0) {
+ // if no filename by command line, we construct a default filename base on container filename
+ filename = JSONUtils.extractSingle(md, "Container.Filename");
+ if (filename == null) {
+ throw new IOException("No Container.Filename...");
+ }
+ if (filename.endsWith(".warc") || filename.endsWith(".warc.gz")) {
+ filename = filename.replaceFirst("\\.warc$", ".warc.wat.gz");
+ filename = filename.replaceFirst("\\.warc\\.gz$", ".warc.wat.gz");
+ } else if (filename.endsWith(".arc") || filename.endsWith(".arc.gz")) {
+ filename = filename.replaceFirst("\\.arc$", ".arc.wat.gz");
+ filename = filename.replaceFirst("\\.arc\\.gz$", ".arc.wat.gz");
+ }
}
+ // removing path from filename
+ File tmpFile = new File(filename);
+ filename = tmpFile.getName();
HttpHeaders headers = new HttpHeaders();
- headers.add("Software-Info", IAUtils.COMMONS_VERSION);
- headers.addDateHeader("Extracted-Date", new Date());
+ headers.add("software", IAUtils.COMMONS_VERSION);
+ headers.addDateHeader("extractedDate", new Date());
+
+ // add ip, hostname
+ try {
+ InetAddress host = InetAddress.getLocalHost();
+ headers.add("ip", host.getHostAddress());
+ headers.add("hostname", host.getCanonicalHostName());
+ } catch (UnknownHostException e) {
+ LOG.warning("unable to obtain local crawl engine host :\n"+e.getMessage());
+ }
+
+ headers.add("format", IAUtils.WARC_FORMAT);
+ headers.add("conformsTo", IAUtils.WARC_FORMAT_CONFORMS_TO);
+ // optional arguments
+ if(IAUtils.OPERATOR != null && IAUtils.OPERATOR.length() > 0) {
+ headers.add("operator", IAUtils.OPERATOR);
+ }
+ if(IAUtils.PUBLISHER != null && IAUtils.PUBLISHER.length() > 0) {
+ headers.add("publisher", IAUtils.PUBLISHER);
+ }
+ if(IAUtils.WAT_WARCINFO_DESCRIPTION != null && IAUtils.WAT_WARCINFO_DESCRIPTION.length() > 0) {
+ headers.add("description", IAUtils.WAT_WARCINFO_DESCRIPTION);
+ }
+
ByteArrayOutputStream baos = new ByteArrayOutputStream();
headers.write(baos);
recW.writeWARCInfoRecord(recOut,filename,baos.toByteArray());
@@ -105,8 +155,9 @@ private void writeWARC(OutputStream recOut, MetaData md) throws IOException {
} else {
targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI");
}
- String capDateString = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Date");
- capDateString = transformWARCDate(capDateString);
+ // handle date of generation in WARC format
+ DateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss");
+ String capDateString = dateFormat.format(new Date());
String recId = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Record-ID");
writeWARCMDRecord(recOut,md,targetURI,capDateString,recId);
}
diff --git a/src/main/java/org/archive/resource/ResourceConstants.java b/src/main/java/org/archive/resource/ResourceConstants.java
index dd04fcfe..3b8bea1c 100644
--- a/src/main/java/org/archive/resource/ResourceConstants.java
+++ b/src/main/java/org/archive/resource/ResourceConstants.java
@@ -31,6 +31,7 @@ public interface ResourceConstants {
public static final String ENVELOPE_FORMAT = "Format";
public static final String ENVELOPE_FORMAT_ARC = "ARC";
public static final String ENVELOPE_FORMAT_WARC = "WARC";
+ public static final String ENVELOPE_FORMAT_WARC_1_0 = "WARC/1.0";
public static final String WARC_HEADER_LENGTH = "WARC-Header-Length";
public static final String WARC_HEADER_METADATA = "WARC-Header-Metadata";
@@ -104,7 +105,7 @@ public interface ResourceConstants {
public static final String HTTP_ENTITY_LENGTH = "Entity-Length";
public static final String HTTP_ENTITY_DIGEST = "Entity-Digest";
- public static final String HTTP_ENTITY_TRAILING_SLOP = "Entity-Trailing-Slop-Bytes";
+ public static final String HTTP_ENTITY_TRAILING_SLOP = "Entity-Trailing-Slop-Length";
public static final String HTML_METADATA = "HTML-Metadata";
public static final String HTML_HEAD = "Head";
diff --git a/src/main/java/org/archive/resource/warc/WARCResource.java b/src/main/java/org/archive/resource/warc/WARCResource.java
index 80929206..d538a25d 100644
--- a/src/main/java/org/archive/resource/warc/WARCResource.java
+++ b/src/main/java/org/archive/resource/warc/WARCResource.java
@@ -36,7 +36,7 @@ public WARCResource(MetaData metaData, ResourceContainer container,
this.response = response;
long length = -1;
- metaData.putString(ENVELOPE_FORMAT, ENVELOPE_FORMAT_WARC);
+ metaData.putString(ENVELOPE_FORMAT, ENVELOPE_FORMAT_WARC_1_0);
metaData.putLong(WARC_HEADER_LENGTH, response.getHeaderBytes());
MetaData fields = metaData.createChild(WARC_HEADER_METADATA);
for(HttpHeader h : response.getHeaders()) {
@@ -68,11 +68,11 @@ public InputStream getInputStream() {
}
public void notifyEOF() throws IOException {
- envelope.putLong(PAYLOAD_LENGTH, countingIS.getCount());
String digString = Base32.encode(digIS.getMessageDigest().digest());
- envelope.putString(PAYLOAD_DIGEST, "sha1:"+digString);
if(container.isCompressed()) {
+ metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
metaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(response));
+ metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString);
} else {
// consume trailing bytes if we can...
InputStream raw = response.getInner();
@@ -81,7 +81,9 @@ public void notifyEOF() throws IOException {
(PushBackOneByteInputStream) raw;
long numNewlines = StreamCopy.skipChars(pb1bis, CR_NL_CHARS);
if(numNewlines > 0) {
+ metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
metaData.putLong(PAYLOAD_SLOP_BYTES, numNewlines);
+ metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString);
}
}
}
diff --git a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java
index 3f502665..0dfb2834 100644
--- a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java
+++ b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java
@@ -33,8 +33,8 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
if(headers.isCorrupt()) {
md.putBoolean(WARC_META_FIELDS_CORRUPT, true);
}
- md.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is));
- md.putLong(PAYLOAD_LENGTH, bytes);
+ parentMetaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is));
+ parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
return new WARCMetaDataResource(md,container, headers);
} catch (HttpParseException e) {
diff --git a/src/main/java/org/archive/util/IAUtils.java b/src/main/java/org/archive/util/IAUtils.java
index ed563d02..d3cf5cf9 100644
--- a/src/main/java/org/archive/util/IAUtils.java
+++ b/src/main/java/org/archive/util/IAUtils.java
@@ -24,7 +24,10 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
+import java.util.Properties;
/**
* Miscellaneous useful methods.
@@ -35,6 +38,11 @@ public class IAUtils {
public final static Charset UTF8 = Charset.forName("utf-8");
final public static String COMMONS_VERSION = loadCommonsVersion();
+ final public static String PUBLISHER = loadCommons("publisher");
+ final public static String OPERATOR = loadCommons("operator");
+ final public static String WAT_WARCINFO_DESCRIPTION = loadCommons("wat.warcinfo.description");
+ final public static String WARC_FORMAT = loadCommons("warc.format");
+ final public static String WARC_FORMAT_CONFORMS_TO = loadCommons("warc.format.conforms.to");
public static String loadCommonsVersion() {
InputStream input = IAUtils.class.getResourceAsStream(
@@ -57,6 +65,31 @@ public static String loadCommonsVersion() {
return version.trim();
}
+ public static String loadCommons(String id) {
+ InputStream input = IAUtils.class.getResourceAsStream("/org/archive/commons.properties");
+ Reader reader = null;
+ if (input == null) {
+ return "UNKNOWN";
+ }
+ try {
+ reader = new InputStreamReader(input, "UTF-8");
+ } catch (UnsupportedEncodingException e) {
+ return "UNKNOWN";
+ }
+ Properties prop = new Properties();
+ try {
+ prop.load(reader);
+ } catch (IOException e1) {
+ return "UNKNOWN";
+ }
+ if (prop.getProperty(id) != null) {
+ return prop.getProperty(id);
+ } else {
+ return "UNKNOWN";
+ }
+
+ }
+
public static void closeQuietly(Object input) {
if(input == null || ! (input instanceof Closeable)) {
return;
diff --git a/src/main/resources/org/archive/commons.properties b/src/main/resources/org/archive/commons.properties
new file mode 100644
index 00000000..f115ff43
--- /dev/null
+++ b/src/main/resources/org/archive/commons.properties
@@ -0,0 +1,5 @@
+operator=
+publisher=
+wat.warcinfo.description=
+warc.format=WARC File Format 1.0
+warc.format.conforms.to=http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf
From d4e7730db7c3fc49c55995bfbdf5b5d89f9e2145 Mon Sep 17 00:00:00 2001
From: RogerMathisen
Date: Tue, 14 Jul 2015 11:19:58 +0200
Subject: [PATCH 028/240] [maven-release-plugin] prepare release
webarchive-commons-1.1.6
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 7984edde..27118d70 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.7-SNAPSHOT
+ 1.1.6jarwebarchive-commons
From 7a7cf08941d966c6e3a6fcd42f1a886552d23038 Mon Sep 17 00:00:00 2001
From: RogerMathisen
Date: Tue, 14 Jul 2015 11:20:38 +0200
Subject: [PATCH 029/240] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 27118d70..7984edde 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.6
+ 1.1.7-SNAPSHOTjarwebarchive-commons
From c1545bc7bee9c9bbd8626cf1b4b8d323bd415f2c Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Wed, 28 Oct 2015 11:45:48 -0700
Subject: [PATCH 030/240] fix for HER-2089 - get rid of broken, seemingly
unnecessary escapeWhitespace() step of uri fixup
---
.../org/archive/url/UsableURIFactory.java | 52 +------------------
.../org/archive/url/UsableURIFactoryTest.java | 8 ++-
2 files changed, 9 insertions(+), 51 deletions(-)
diff --git a/src/main/java/org/archive/url/UsableURIFactory.java b/src/main/java/org/archive/url/UsableURIFactory.java
index 9118b850..1059bfbd 100644
--- a/src/main/java/org/archive/url/UsableURIFactory.java
+++ b/src/main/java/org/archive/url/UsableURIFactory.java
@@ -49,8 +49,8 @@
* @author stack
*/
public class UsableURIFactory extends URI {
-
- private static final long serialVersionUID = -6146295130382209042L;
+
+ private static final long serialVersionUID = 2L;
/**
* Logging instance.
@@ -395,9 +395,6 @@ private String fixup(String uri, final URI base, final String charset)
}
TextUtils.recycleMatcher(matcher);
- // now, minimally escape any whitespace
- uri = escapeWhitespace(uri);
-
// For further processing, get uri elements. See the RFC2396REGEX
// comment above for explanation of group indices used in the below.
// matcher = RFC2396REGEX.matcher(uri);
@@ -663,51 +660,6 @@ private String ensureMinimalEscaping(String u, final String charset,
return u;
}
- /**
- * Escape any whitespace found.
- *
- * The parent class takes care of the bulk of escaping. But if any
- * instance of escaping is found in the URI, then we ask for parent
- * to do NO escaping. Here we escape any whitespace found irrespective
- * of whether the uri has already been escaped. We do this for
- * case where uri has been judged already-escaped only, its been
- * incompletly done and whitespace remains. Spaces, etc., in the URI are
- * a real pain. Their presence will break log file and ARC parsing.
- * @param uri URI string to check.
- * @return uri with spaces escaped if any found.
- */
- protected String escapeWhitespace(String uri) {
- // Just write a new string anyways. The perl '\s' is not
- // as inclusive as the Character.isWhitespace so there are
- // whitespace characters we could miss. So, rather than
- // write some awkward regex, just go through the string
- // a character at a time. Only create buffer first time
- // we find a space.
- MutableString buffer = null;
- for (int i = 0; i < uri.length(); i++) {
- char c = uri.charAt(i);
- if (Character.isWhitespace(c)) {
- if (buffer == null) {
- buffer = new MutableString(uri.length() +
- 2 /*If space, two extra characters (at least)*/);
- buffer.append(uri.substring(0, i));
- }
- buffer.append("%");
- String hexStr = Integer.toHexString(c);
- if ((hexStr.length() % 2) > 0) {
- buffer.append("0");
- }
- buffer.append(hexStr);
-
- } else {
- if (buffer != null) {
- buffer.append(c);
- }
- }
- }
- return (buffer != null)? buffer.toString(): uri;
- }
-
/**
* Check port on passed http authority. Make sure the size is not larger
* than allowed: See the 'port' definition on this
diff --git a/src/test/java/org/archive/url/UsableURIFactoryTest.java b/src/test/java/org/archive/url/UsableURIFactoryTest.java
index af190957..73f2b6db 100644
--- a/src/test/java/org/archive/url/UsableURIFactoryTest.java
+++ b/src/test/java/org/archive/url/UsableURIFactoryTest.java
@@ -174,7 +174,7 @@ public final void testWhitespaceEscaped() throws URIException {
assertTrue("Not equal " + uuri.toString(),
uuri.toString().equals(tgtUri));
uri = "http://archive.org/index%25\u001D.html";
- tgtUri = "http://archive.org/index%25%1D.html".toLowerCase();
+ tgtUri = "http://archive.org/index%25%1D.html";
uuri = UsableURIFactory.getInstance(uri);
assertEquals("whitespace escaping", tgtUri, uuri.toString());
uri = "http://gemini.info.usaid.gov/directory/" +
@@ -185,6 +185,12 @@ public final void testWhitespaceEscaped() throws URIException {
"faxResults.cfm?name=Ebenezer +Rumplestiltskin,&location=" +
"RRB%20%20%20%205%2E08%2D006");
assertEquals("whitespace escaping", tgtUri, uuri.toString());
+
+ // https://webarchive.jira.com/browse/HER-2089
+ uri = "http://archive.org/index%25\u3000.html";
+ tgtUri = "http://archive.org/index%25%E3%80%80.html";
+ uuri = UsableURIFactory.getInstance(uri);
+ assertEquals("U+3000 ideographic space escaping", tgtUri, uuri.toString());
}
// public final void testFailedGetPath() throws URIException {
From 86589b0fafaa0918ce2192080e68941c47b39c40 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Mon, 14 Dec 2015 10:38:49 -0800
Subject: [PATCH 031/240] flush output etc before tallying stats to fix
sizeOnDisk calculation
---
src/main/java/org/archive/io/warc/WARCWriter.java | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/src/main/java/org/archive/io/warc/WARCWriter.java b/src/main/java/org/archive/io/warc/WARCWriter.java
index e2d28ee9..7e22e08b 100644
--- a/src/main/java/org/archive/io/warc/WARCWriter.java
+++ b/src/main/java/org/archive/io/warc/WARCWriter.java
@@ -236,8 +236,8 @@ public void writeRecord(WARCRecordInfo recordInfo)
long totalBytes = 0;
long startPosition;
- try {
- startPosition = getPosition();
+ startPosition = getPosition();
+ try {
preWriteRecordTasks();
// TODO: Revisit encoding of header.
@@ -261,13 +261,12 @@ public void writeRecord(WARCRecordInfo recordInfo)
write(CRLF_BYTES);
totalBytes += 2 * CRLF_BYTES.length;
- tally(recordInfo.getType(), contentBytes, totalBytes, getPosition() - startPosition);
-
recordInfo.setWARCFilename(getFilenameWithoutOccupiedSuffix());
recordInfo.setWARCFileOffset(startPosition);
tmpRecordLog.add(recordInfo);
} finally {
postWriteRecordTasks();
+ tally(recordInfo.getType(), contentBytes, totalBytes, getPosition() - startPosition);
}
}
From 1fede2354f65437825b6471261a8f0361ffba241 Mon Sep 17 00:00:00 2001
From: Jeremy Wiebe
Date: Mon, 7 Mar 2016 22:35:28 -0500
Subject: [PATCH 032/240] Store origin-code in ARCRecord header; accessible
through getOrigin() method.
---
.../org/archive/format/ArchiveFileConstants.java | 7 ++++++-
.../java/org/archive/format/arc/ARCConstants.java | 2 +-
src/main/java/org/archive/io/arc/ARCRecord.java | 13 +++++++++----
.../java/org/archive/io/arc/ARCRecordMetaData.java | 9 ++++++++-
4 files changed, 24 insertions(+), 7 deletions(-)
diff --git a/src/main/java/org/archive/format/ArchiveFileConstants.java b/src/main/java/org/archive/format/ArchiveFileConstants.java
index b0b8aa66..df3b4465 100644
--- a/src/main/java/org/archive/format/ArchiveFileConstants.java
+++ b/src/main/java/org/archive/format/ArchiveFileConstants.java
@@ -44,6 +44,11 @@ public interface ArchiveFileConstants {
* Key for the Archive File version field.
*/
public static final String VERSION_FIELD_KEY = "version";
+
+ /**
+ * Key for the Archive File origin-code field.
+ */
+ public static final String ORIGIN_FIELD_KEY = "origin";
/**
* Key for the Archive File length field.
@@ -80,7 +85,7 @@ public interface ArchiveFileConstants {
* Key for the Archive Record absolute offset into Archive file.
*/
public static final String ABSOLUTE_OFFSET_KEY = "absolute-offset";
-
+
public static final String READER_IDENTIFIER_FIELD_KEY =
"reader-identifier";
diff --git a/src/main/java/org/archive/format/arc/ARCConstants.java b/src/main/java/org/archive/format/arc/ARCConstants.java
index a336ddeb..5987b49f 100755
--- a/src/main/java/org/archive/format/arc/ARCConstants.java
+++ b/src/main/java/org/archive/format/arc/ARCConstants.java
@@ -196,7 +196,7 @@ public interface ARCConstants extends ArchiveFileConstants {
.asList(new String[] { URL_FIELD_KEY, IP_HEADER_FIELD_KEY,
DATE_FIELD_KEY, MIMETYPE_FIELD_KEY,
LENGTH_FIELD_KEY, VERSION_FIELD_KEY,
- ABSOLUTE_OFFSET_KEY });
+ ORIGIN_FIELD_KEY, ABSOLUTE_OFFSET_KEY });
/**
* Minimum possible record length.
diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java
index 21bea07c..2d9c9bf4 100644
--- a/src/main/java/org/archive/io/arc/ARCRecord.java
+++ b/src/main/java/org/archive/io/arc/ARCRecord.java
@@ -200,7 +200,7 @@ public ARCRecord(InputStream in, ArchiveRecordHeader metaData,
public ARCRecord(InputStream in, final String identifier,
final long offset, boolean digest, boolean strict,
final boolean parseHttpHeaders,
- final boolean isAlignedOnFirstRecord, String version)
+ final boolean isAlignedOnFirstRecord, String version)
throws IOException {
super(in, null, 0, digest, strict);
setHeader(parseHeaders(in, identifier, offset, strict, isAlignedOnFirstRecord, version));
@@ -243,6 +243,7 @@ private ArchiveRecordHeader parseHeaders(final InputStream in,
getTokenizedHeaderLine(in, firstLineValues);
int bodyOffset = 0;
+ String origin = "";
if (offset == 0 && isAlignedOnFirstRecord) {
// If offset is zero and we were aligned at first record on
// creation (See #alignedOnFirstRecord for more on this), then no
@@ -263,6 +264,7 @@ private ArchiveRecordHeader parseHeaders(final InputStream in,
bodyOffset += getTokenizedHeaderLine(in, secondLineValues);
version = ((String)secondLineValues.get(0) +
"." + (String)secondLineValues.get(1));
+ origin = (String)secondLineValues.get(2);
// Just read over the 3rd line. We used to parse it and use
// values found here but now we just hardcode them to avoid
// having to read this 3rd line even for random arc file accesses.
@@ -271,7 +273,8 @@ private ArchiveRecordHeader parseHeaders(final InputStream in,
}
setBodyOffset(bodyOffset);
- return computeMetaData(this.headerFieldNameKeys, firstLineValues, version, offset, identifier);
+ return computeMetaData(this.headerFieldNameKeys, firstLineValues,
+ version, origin, offset, identifier);
}
/**
@@ -362,7 +365,8 @@ private int getTokenizedHeaderLine(final InputStream stream,
* @exception IOException If no. of keys doesn't match no. of values.
*/
private ARCRecordMetaData computeMetaData(List keys,
- List values, String v, long offset, final String identifier)
+ List values, String v, String origin,
+ long offset, final String identifier)
throws IOException {
if (keys.size() != values.size()) {
List originalValues = values;
@@ -423,6 +427,7 @@ private ARCRecordMetaData computeMetaData(List keys,
}
headerFields.put(VERSION_FIELD_KEY, v);
+ headerFields.put(ORIGIN_FIELD_KEY, origin);
headerFields.put(ABSOLUTE_OFFSET_KEY, new Long(offset));
return new ARCRecordMetaData(identifier, headerFields);
@@ -832,4 +837,4 @@ protected String getDigest4Cdx(ArchiveRecordHeader h) {
}
return (result != null) ? result: super.getDigest4Cdx(h);
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/org/archive/io/arc/ARCRecordMetaData.java b/src/main/java/org/archive/io/arc/ARCRecordMetaData.java
index 3f617041..02b368e4 100644
--- a/src/main/java/org/archive/io/arc/ARCRecordMetaData.java
+++ b/src/main/java/org/archive/io/arc/ARCRecordMetaData.java
@@ -168,6 +168,13 @@ public String getVersion() {
return (String)this.headerFields.get(VERSION_FIELD_KEY);
}
+ /**
+ * @return Arcfile origin code.
+ */
+ public String getOrigin() {
+ return (String)this.headerFields.get(ORIGIN_FIELD_KEY);
+ }
+
/**
* @return Offset into arcfile at which this record begins.
*/
@@ -264,4 +271,4 @@ public int getContentBegin() {
protected void setContentBegin(final int offset) {
this.contentBegin = offset;
}
-}
\ No newline at end of file
+}
From 28c9a1b2b04c9f392247690c7112ae20882d8cbc Mon Sep 17 00:00:00 2001
From: Jeremy Wiebe
Date: Fri, 11 Mar 2016 11:45:42 -0500
Subject: [PATCH 033/240] Update ArchiveFileConstants.java
---
src/main/java/org/archive/format/ArchiveFileConstants.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/format/ArchiveFileConstants.java b/src/main/java/org/archive/format/ArchiveFileConstants.java
index df3b4465..89e1308c 100644
--- a/src/main/java/org/archive/format/ArchiveFileConstants.java
+++ b/src/main/java/org/archive/format/ArchiveFileConstants.java
@@ -46,7 +46,7 @@ public interface ArchiveFileConstants {
public static final String VERSION_FIELD_KEY = "version";
/**
- * Key for the Archive File origin-code field.
+ * Key for the Archive File origin-code field. This value is often hard-coded, so use with care.
*/
public static final String ORIGIN_FIELD_KEY = "origin";
From 7ef8aa95bc758d96b60a30d036dd0c32de20937c Mon Sep 17 00:00:00 2001
From: Jeremy Wiebe
Date: Mon, 14 Mar 2016 17:10:23 -0400
Subject: [PATCH 034/240] Update CHANGES.md
---
CHANGES.md | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/CHANGES.md b/CHANGES.md
index 70f9b052..3c9f4c8b 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,7 @@
+1.1.7
+-----
+* [Store origin-code of ARC file header](https://github.com/iipc/webarchive-commons/pull/52/)
+
1.1.6
-----
* [Handle empty String argument in CharsetDetector.trimAttrValue](https://github.com/iipc/webarchive-commons/pull/49)
From 5cfff50a03263208520ca2d260229eefb2aec2f7 Mon Sep 17 00:00:00 2001
From: Hunter Stern
Date: Mon, 21 Mar 2016 17:30:30 -0700
Subject: [PATCH 035/240] Make canonicalizer be able to strip session id params
even if they are the first params in the query string. And add session id
strip test. And change IAURLCanonicalizer.java to ensure that if after
transformations on the query string have completed and the query is empty,
there is not a ? added to the end of the url.
---
.../org/archive/url/IAURLCanonicalizer.java | 35 +++++++++----------
.../org/archive/url/URLRegexTransformer.java | 10 +++---
.../archive/url/IAURLCanonicalizerTest.java | 10 ++++++
3 files changed, 32 insertions(+), 23 deletions(-)
diff --git a/src/main/java/org/archive/url/IAURLCanonicalizer.java b/src/main/java/org/archive/url/IAURLCanonicalizer.java
index 029598f6..0cf7c8a4 100644
--- a/src/main/java/org/archive/url/IAURLCanonicalizer.java
+++ b/src/main/java/org/archive/url/IAURLCanonicalizer.java
@@ -63,25 +63,24 @@ public void canonicalize(HandyURL url) {
String query = url.getQuery();
if(query != null) {
- if(query.equals("")) {
- if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_EMPTY)) {
- query = null;
- }
- } else {
- // we have a query... what to do with it?
+ // we have a query... what to do with it?
- // first remove uneeded:
- if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_SESSION_ID)) {
- query = URLRegexTransformer.stripQuerySessionID(query);
- }
- // lower-case:
- if(rules.isSet(QUERY_SETTINGS, QUERY_LOWERCASE)) {
- query = query.toLowerCase();
- }
- // re-order?
- if(rules.isSet(QUERY_SETTINGS, QUERY_ALPHA_REORDER)) {
- query = alphaReorderQuery(query);
- }
+ // first remove uneeded:
+ if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_SESSION_ID)) {
+ query = URLRegexTransformer.stripQuerySessionID(query);
+ }
+ // lower-case:
+ if(rules.isSet(QUERY_SETTINGS, QUERY_LOWERCASE)) {
+ query = query.toLowerCase();
+ }
+ // re-order?
+ if(rules.isSet(QUERY_SETTINGS, QUERY_ALPHA_REORDER)) {
+ query = alphaReorderQuery(query);
+ }
+ if(query.equals("")) {
+ if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_EMPTY)) {
+ query = null;
+ }
}
url.setQuery(query);
}
diff --git a/src/main/java/org/archive/url/URLRegexTransformer.java b/src/main/java/org/archive/url/URLRegexTransformer.java
index c5505a74..617e0225 100644
--- a/src/main/java/org/archive/url/URLRegexTransformer.java
+++ b/src/main/java/org/archive/url/URLRegexTransformer.java
@@ -16,11 +16,11 @@ public class URLRegexTransformer {
private static final OptimizedPattern QUERY_OPTS[] = {
- new OptimizedPattern("(?i)^(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
- new OptimizedPattern("(?i)^(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
- new OptimizedPattern("(?i)^(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
- new OptimizedPattern("(?i)^(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
- new OptimizedPattern("(?i)^(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", "cftoken=", 1, 2),
+ new OptimizedPattern("(?i)^(.*)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
+ new OptimizedPattern("(?i)^(.*)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
+ new OptimizedPattern("(?i)^(.*)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
+ new OptimizedPattern("(?i)^(.*)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
+ new OptimizedPattern("(?i)^(.*)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", "cftoken=", 1, 2),
};
diff --git a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java
index 3263edc7..91751b4a 100644
--- a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java
+++ b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java
@@ -53,5 +53,15 @@ public void testGetDefaultPort() {
assertEquals(80,IAURLCanonicalizer.getDefaultPort("http"));
assertEquals(443,IAURLCanonicalizer.getDefaultPort("https"));
}
+
+ public void testStripSessionId() throws URISyntaxException {
+ IAURLCanonicalizer iaC = new IAURLCanonicalizer(new DefaultIACanonicalizerRules());
+ compCan(iaC,
+ "http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008&jsessionid=f030eacc7e49c4ca0b077922347418418766",
+ "http://nsf.gov/statistics/sed/2009/sed_2009.zip?jsessionid=f030eacc7e49c4ca0b077922347418418766");
+ compCan(iaC,
+ "http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008",
+ "http://nsf.gov/statistics/sed/2009/sed_2009.zip");
+ }
}
From 02e6e29fb735b1fdd0957196d264b40d29e6fa6d Mon Sep 17 00:00:00 2001
From: John Erik Halse
Date: Fri, 17 Jun 2016 09:54:36 +0200
Subject: [PATCH 036/240] Updated release notes
---
CHANGES.md | 3 +++
1 file changed, 3 insertions(+)
diff --git a/CHANGES.md b/CHANGES.md
index 3c9f4c8b..52c40f42 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,6 +1,9 @@
1.1.7
-----
+* [Make canonicalizer be able to strip session id params even if they are the first params in the query string](https://github.com/iipc/webarchive-commons/pull/54)
* [Store origin-code of ARC file header](https://github.com/iipc/webarchive-commons/pull/52/)
+* [Flush output etc before tallying stats to fix sizeOnDisk calculation](https://github.com/iipc/webarchive-commons/pull/51)
+* [Get rid of broken, seemingly unnecessary escapeWhitespace() step of uri fixup](https://github.com/iipc/webarchive-commons/pull/50)
1.1.6
-----
From a55391dfe1855259939d118c49b84cf386c0960f Mon Sep 17 00:00:00 2001
From: John Erik Halse
Date: Fri, 17 Jun 2016 10:25:23 +0200
Subject: [PATCH 037/240] [maven-release-plugin] prepare release
webarchive-commons-1.1.7
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 7984edde..f842a09c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.7-SNAPSHOT
+ 1.1.7jarwebarchive-commons
From bb36b6a7375453e1cb8073211041ca3f955ab217 Mon Sep 17 00:00:00 2001
From: John Erik Halse
Date: Fri, 17 Jun 2016 10:25:28 +0200
Subject: [PATCH 038/240] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index f842a09c..24780063 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.7
+ 1.1.8-SNAPSHOTjarwebarchive-commons
From 0cbca57bc87f9bd55844977a480ead400a40920d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristinn=20Sigur=C3=B0sson?=
Date: Wed, 13 Jul 2016 12:21:42 +0000
Subject: [PATCH 039/240] Remove invalid constant
The PROFILE_REVISIT_URI_AGNOSTIC_IDENTICAL_DIGEST does not exist in the WARC specification. This file shouldn't include non-standard items. And, in any case, use of PROFILE_REVISIT_IDENTICAL_DIGEST is appropriate, even when using 'uri agnostic' deduplication.
---
src/main/java/org/archive/format/warc/WARCConstants.java | 2 --
1 file changed, 2 deletions(-)
diff --git a/src/main/java/org/archive/format/warc/WARCConstants.java b/src/main/java/org/archive/format/warc/WARCConstants.java
index c9f6cbf3..93a81f96 100644
--- a/src/main/java/org/archive/format/warc/WARCConstants.java
+++ b/src/main/java/org/archive/format/warc/WARCConstants.java
@@ -183,8 +183,6 @@ enum WARCRecordType {
public static final String HEADER_KEY_REFERS_TO_FILENAME = "WARC-Refers-To-Filename";
public static final String HEADER_KEY_REFERS_TO_FILE_OFFSET = "WARC-Refers-To-File-Offset";
- public static final String PROFILE_REVISIT_URI_AGNOSTIC_IDENTICAL_DIGEST =
- "http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest";
public static final String PROFILE_REVISIT_IDENTICAL_DIGEST =
"http://netpreserve.org/warc/1.0/revisit/identical-payload-digest";
public static final String PROFILE_REVISIT_NOT_MODIFIED =
From a23cfebe24a959c929b1fcf9fbb6fc37eae31c76 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Sun, 7 Aug 2016 16:49:47 +0200
Subject: [PATCH 040/240] Make regular expression to extract URLs from CSS more
restrictive (allow only `"`, `'`, `\"` or `\'` in front of or after the URL).
Avoid long-runners when matching the regex due to heavy back-tracking.
---
.../java/org/archive/resource/html/ExtractingParseObserver.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index e1f57b55..df3742fa 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -21,7 +21,7 @@ public class ExtractingParseObserver implements ParseObserver {
boolean inTitle = false;
protected static String cssUrlPatString =
- "url\\s*\\(\\s*([\\\\\"']*.+?[\\\\\"']*)\\s*\\)";
+ "url\\s*\\(\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\\)";
protected static String cssImportNoUrlPatString =
"@import\\s+(('[^']+')|(\"[^\"]+\")|(\\('[^']+'\\))|(\\(\"[^\"]+\"\\))|(\\([^)]+\\))|([a-z0-9_.:/\\\\-]+))\\s*;";
From 9d7abed43aef409e19842f875914c50a0b58ccf8 Mon Sep 17 00:00:00 2001
From: David Portabella
Date: Wed, 21 Sep 2016 11:54:18 +0200
Subject: [PATCH 041/240] fix: last header was lost if LF LF (intead of CRLF
CRLF)
---
.../archive/format/http/HttpHeaderParser.java | 1 +
.../format/http/HttpResponseParserTest.java | 19 +++++++++++++++++++
2 files changed, 20 insertions(+)
diff --git a/src/main/java/org/archive/format/http/HttpHeaderParser.java b/src/main/java/org/archive/format/http/HttpHeaderParser.java
index fdec62f2..d63ec405 100755
--- a/src/main/java/org/archive/format/http/HttpHeaderParser.java
+++ b/src/main/java/org/archive/format/http/HttpHeaderParser.java
@@ -231,6 +231,7 @@ public ParseState handleByte(byte b, HttpHeaderParser parser)
if(b == LF) {
// TODO: this is lax, is LFLF an OK terminator?
// that's all folks!
+ parser.headerFinished();
parser.parseFinished();
return parser.endState;
}
diff --git a/src/test/java/org/archive/format/http/HttpResponseParserTest.java b/src/test/java/org/archive/format/http/HttpResponseParserTest.java
index 2850fe44..c0d13230 100644
--- a/src/test/java/org/archive/format/http/HttpResponseParserTest.java
+++ b/src/test/java/org/archive/format/http/HttpResponseParserTest.java
@@ -38,4 +38,23 @@ public void testParse() throws IOException {
}
+ public void testParseWithLf() throws IOException {
+
+ HttpResponseParser parser = new HttpResponseParser();
+ String message = "200 OK\nContent-Type: text/plain\n\nHi there";
+ try {
+ HttpResponse response =
+ parser.parse(new ByteArrayInputStream(message.getBytes(IAUtils.UTF8)));
+ assertNotNull(response);
+ HttpHeaders headers = response.getHeaders();
+ assertNotNull(headers);
+ assertEquals(1,headers.size());
+
+ } catch (HttpParseException e) {
+ e.printStackTrace();
+ fail();
+ }
+
+ }
+
}
From 5f223d60c365a53533b2ad7217deaa65b3a91667 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 24 Nov 2016 11:51:10 +0100
Subject: [PATCH 042/240] Use CharsetDetector to guess encoding of HTML
document
---
.../resource/html/HTMLResourceFactory.java | 32 +++++++++++++++++--
1 file changed, 30 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java
index 935843f1..34062ed9 100644
--- a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java
+++ b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java
@@ -1,9 +1,14 @@
package org.archive.resource.html;
+import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
+import org.archive.format.http.HttpHeaders;
+import org.archive.format.json.JSONUtils;
+import org.archive.format.text.charset.CharsetDetector;
+import org.archive.format.text.charset.StandardCharsetDetector;
import org.archive.format.text.html.CDATALexer;
import org.archive.format.text.html.LexParser;
import org.archive.resource.MetaData;
@@ -13,17 +18,40 @@
import org.archive.resource.ResourceParseException;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;
+import org.json.JSONException;
+import org.json.JSONObject;
public class HTMLResourceFactory implements ResourceFactory {
+ protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192;
+ protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers";
+
+ protected CharsetDetector charSetDetector = new StandardCharsetDetector();
+
+
public Resource getResource(InputStream is, MetaData parentMetaData,
ResourceContainer container) throws ResourceParseException, IOException {
HTMLMetaData hmd = new HTMLMetaData(parentMetaData);
ExtractingParseObserver epo = new ExtractingParseObserver(hmd);
LexParser parser = new LexParser(epo);
CDATALexer lex = new CDATALexer();
- // TODO: figure out charset:
- String charset = "UTF-8";
+
+ // guess charset based on HTTP header and sniffed content chunk
+ is = new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE);
+ byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE];
+ is.mark(0);
+ int chunkSize = is.read(chunk, 0, CHARSET_GUESS_CHUNK_SIZE);
+ is.reset();
+ JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH);
+ HttpHeaders httpHeaders = new HttpHeaders();
+ if (headers.has("Content-Type")) {
+ try {
+ httpHeaders.add("Content-Type", headers.getString("Content-Type"));
+ } catch (JSONException e) { }
+ }
+
+ String charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders);
+
Page page;
try {
page = new Page(is, charset);
From 607acaa734183b72c816359c588bbf157485d5ba Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 24 Nov 2016 12:44:53 +0100
Subject: [PATCH 043/240] HTML encoding detection: fix errors with empty
content or empty charset values
---
.../format/text/charset/CharsetDetector.java | 2 ++
.../resource/html/HTMLResourceFactory.java | 24 +++++++++++++------
2 files changed, 19 insertions(+), 7 deletions(-)
diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java
index ae71b5fa..0534ff85 100644
--- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java
+++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java
@@ -182,6 +182,8 @@ private static String trimAttrValue(String value) {
return value;
}
String result = value;
+ if (result.isEmpty())
+ return result;
if (result.charAt(0) == '"') {
result = result.substring(1, result.length() - 1);
} else if (result.charAt(0) == '\'') {
diff --git a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java
index 34062ed9..afb1c850 100644
--- a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java
+++ b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java
@@ -5,6 +5,8 @@
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.archive.format.http.HttpHeaders;
import org.archive.format.json.JSONUtils;
import org.archive.format.text.charset.CharsetDetector;
@@ -23,6 +25,8 @@
public class HTMLResourceFactory implements ResourceFactory {
+ public static final Log LOG = LogFactory.getLog(HTMLResourceFactory.class);
+
protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192;
protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers";
@@ -37,21 +41,27 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
CDATALexer lex = new CDATALexer();
// guess charset based on HTTP header and sniffed content chunk
+ String charset = "UTF-8";
is = new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE);
byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE];
is.mark(0);
int chunkSize = is.read(chunk, 0, CHARSET_GUESS_CHUNK_SIZE);
is.reset();
- JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH);
- HttpHeaders httpHeaders = new HttpHeaders();
- if (headers.has("Content-Type")) {
+ if (chunkSize > 0) {
+ JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH);
+ HttpHeaders httpHeaders = new HttpHeaders();
+ if (headers.has("Content-Type")) {
+ try {
+ httpHeaders.add("Content-Type", headers.getString("Content-Type"));
+ } catch (JSONException e) { }
+ }
try {
- httpHeaders.add("Content-Type", headers.getString("Content-Type"));
- } catch (JSONException e) { }
+ charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders);
+ } catch (Exception e) {
+ LOG.error("Failed to guess charset: " + e.getMessage());
+ }
}
- String charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders);
-
Page page;
try {
page = new Page(is, charset);
From 824dd82f5f9c9e60392ece498f8e5d44a7e431b9 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 24 Nov 2016 14:05:55 +0100
Subject: [PATCH 044/240] Match http-equiv meta elements with unquoted
attribute values, e.g.
---
.../org/archive/format/text/charset/CharsetDetector.java | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java
index 0534ff85..9b4c8523 100644
--- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java
+++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java
@@ -60,7 +60,8 @@ public abstract class CharsetDetector {
private final static String META_CONTENT_ATTR_PATTERN_STRING = "\\b" +
META_CONTENT_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?";
private final static String META_HTTP_EQUIV_ATTR_PATTERN_STRING = "\\b" +
- META_HTTP_EQUIV_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?";
+ META_HTTP_EQUIV_ATTRIBUTE + "\\s*=\\s*(" + META_CONTENT_TYPE + "|" +
+ ANY_ATTR_VALUE + ")(?:\\s|>)?";
@@ -183,7 +184,7 @@ private static String trimAttrValue(String value) {
}
String result = value;
if (result.isEmpty())
- return result;
+ return result;
if (result.charAt(0) == '"') {
result = result.substring(1, result.length() - 1);
} else if (result.charAt(0) == '\'') {
@@ -232,7 +233,6 @@ public static String findMetaContentType(String pageSample) {
protected String getCharsetFromBytes(byte buffer[], int len)
throws IOException {
String charsetName = null;
-
UniversalDetector detector = new UniversalDetector(null);
detector.handleData(buffer, 0, len);
detector.dataEnd();
From 9e41abcb36c585dd1cd9622f0eeeaddb0faae111 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Fri, 9 Dec 2016 15:35:10 +0100
Subject: [PATCH 045/240] Strip empty port, do not fail
---
src/main/java/org/archive/url/URLParser.java | 24 +++++++++++--------
.../archive/url/IAURLCanonicalizerTest.java | 1 +
.../archive/url/WaybackURLKeyMakerTest.java | 1 +
3 files changed, 16 insertions(+), 10 deletions(-)
diff --git a/src/main/java/org/archive/url/URLParser.java b/src/main/java/org/archive/url/URLParser.java
index 98e4c1aa..83d3c386 100644
--- a/src/main/java/org/archive/url/URLParser.java
+++ b/src/main/java/org/archive/url/URLParser.java
@@ -246,16 +246,20 @@ public static HandyURL parse(String urlString) throws URISyntaxException {
colonPort = uriAuthority.substring(portColonIndex);
}
if(colonPort != null) {
- if(colonPort.startsWith(":")) {
- try {
- port = Integer.parseInt(colonPort.substring(1));
- } catch(NumberFormatException e) {
- throw new URISyntaxException(urlString, "bad port "
- + colonPort.substring(1));
- }
- } else {
- // XXX: what's happened?!
- }
+ if(colonPort.startsWith(":")) {
+ if (colonPort.length() == 1) {
+ // a bare colon (http://example.com:/), use default port
+ } else {
+ try {
+ port = Integer.parseInt(colonPort.substring(1));
+ } catch(NumberFormatException e) {
+ throw new URISyntaxException(urlString, "bad port "
+ + colonPort.substring(1));
+ }
+ }
+ } else {
+ // XXX: what's happened?!
+ }
}
if(userInfo != null) {
int passColonIndex = userInfo.indexOf(COLON);
diff --git a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java
index 91751b4a..e2c46258 100644
--- a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java
+++ b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java
@@ -12,6 +12,7 @@ public void testFull() throws URISyntaxException {
compCan(iaC,"https://www.archive.org:80/","https://archive.org:80/");
compCan(iaC,"http://www.archive.org:443/","http://archive.org:443/");
compCan(iaC,"https://www.archive.org:443/","https://archive.org/");
+ compCan(iaC,"http://www.archive.org:/","http://archive.org/");
compCan(iaC,"http://www.archive.org/big/","http://archive.org/big");
compCan(iaC,"dns:www.archive.org","dns:www.archive.org");
diff --git a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
index 34bfe625..26161456 100644
--- a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
+++ b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
@@ -22,6 +22,7 @@ public void testMakeKey() throws URISyntaxException {
assertEquals("org,archive)/goo", km.makeKey("http://archive.org/goo/?"));
assertEquals("org,archive)/goo?a&b", km.makeKey("http://archive.org/goo/?b&a"));
assertEquals("org,archive)/goo?a=1&a=2&b", km.makeKey("http://archive.org/goo/?a=2&b&a=1"));
+ assertEquals("org,archive)/", km.makeKey("http://archive.org:/"));
}
}
From b918f7f18e94c58a4a74d97e98f3c19465466595 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 4 Jan 2017 18:21:22 +0100
Subject: [PATCH 046/240] Improve clipping of quotation marks in CSS link
extraction - clip multiple quotation marks Fix
StringIndexOutOfBoundsException in patternCSSExtract - correct check for min.
required URL lenght when stripping 4 characters (2 at each end) - simplified
code, use non-capturing groups in regular expression
---
.../html/ExtractingParseObserver.java | 79 ++++++++++---------
.../html/ExtractingParseObserverTest.java | 48 ++++++-----
2 files changed, 70 insertions(+), 57 deletions(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index df3742fa..45a48808 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -23,7 +23,7 @@ public class ExtractingParseObserver implements ParseObserver {
protected static String cssUrlPatString =
"url\\s*\\(\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\\)";
protected static String cssImportNoUrlPatString =
- "@import\\s+(('[^']+')|(\"[^\"]+\")|(\\('[^']+'\\))|(\\(\"[^\"]+\"\\))|(\\([^)]+\\))|([a-z0-9_.:/\\\\-]+))\\s*;";
+ "@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\\('[^']+'\\))|(?:\\(\"[^\"]+\"\\))|(?:\\([^)]+\\))|(?:[a-z0-9_.:/\\\\-]+))\\s*;";
protected static Pattern cssImportNoUrlPattern = Pattern
.compile(cssImportNoUrlPatString);
@@ -368,40 +368,45 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
}
- private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) {
- Matcher m = pattern.matcher(content);
- int idx = 0;
- int contentLen = content.length();
- while((idx < contentLen) && m.find(idx)) {
- String url = m.group(1);
- int origUrlLength = url.length();
- int urlStart = m.start(1);
- int urlEnd = m.end(1);
- idx = urlEnd;
- if(url.length() < 2) {
- continue;
- }
- if ((url.charAt(0) == '(')
- && (url.charAt(origUrlLength-1) == ')')) {
- url = url.substring(1, origUrlLength - 1);
- urlStart += 1;
- origUrlLength -= 2;
- }
- if (url.charAt(0) == '"') {
- url = url.substring(1, origUrlLength - 1);
- urlStart += 1;
- } else if (url.charAt(0) == '\'') {
- url = url.substring(1, origUrlLength - 1);
- urlStart += 1;
- } else if (url.charAt(0) == '\\') {
- if(url.length() == 2)
- continue;
- url = url.substring(2, origUrlLength - 2);
- urlStart += 2;
- }
- int urlLength = url.length();
- data.addHref("path","STYLE/#text","href",url);
- idx += urlLength;
- }
- }
+ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) {
+ Matcher m = pattern.matcher(content);
+ int idx = 0;
+ int contentLen = content.length();
+ if (contentLen > 100000)
+ // extract URLs only from the first 100 kB
+ contentLen = 100000;
+ FIND:
+ while((idx < contentLen) && m.find()) {
+ idx = m.end();
+ String url = m.group(1);
+ if(url.length() < 2) {
+ continue;
+ }
+ if ((url.charAt(0) == '(')
+ && (url.charAt(url.length()-1) == ')')) {
+ url = url.substring(1, url.length() - 1);
+ }
+ CLIP:
+ while (url.length() > 1) {
+ if ((url.charAt(0) == '"' || url.charAt(0) == '\'')
+ && (url.charAt(url.length() - 1) == '"'
+ || url.charAt(url.length() - 1) == '\'')) {
+ if(url.length() <= 2) {
+ // empty URL
+ continue FIND;
+ }
+ url = url.substring(1, url.length() - 1);
+ } else if (url.charAt(0) == '\\') {
+ if(url.length() <= 4) {
+ // empty URL
+ continue FIND;
+ }
+ url = url.substring(2, url.length() - 2);
+ } else {
+ break CLIP;
+ }
+ }
+ data.addHref("path","STYLE/#text","href",url);
+ }
+ }
}
diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
index 24b6c18a..236b964b 100644
--- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
+++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -19,7 +19,9 @@ public void testHandleStyleNodeExceptions() throws Exception {
"url (' ')",
"url('\")",
"url(')",
- "url('\"')"
+ "url('\"')",
+ "url('\\\"\"')",
+ "url(''''')"
};
boolean except = false;
HTMLMetaData md = new HTMLMetaData(new MetaData());
@@ -37,6 +39,7 @@ public void testHandleStyleNodeExceptions() throws Exception {
assertFalse(except);
}
}
+
public void testHandleStyleNode() throws Exception {
String[][] tests = {
{""},
@@ -45,31 +48,35 @@ public void testHandleStyleNode() throws Exception {
{"url(\"foo.gif\")","foo.gif"},
{"url(\\\"foo.gif\\\")","foo.gif"},
{"url(\\'foo.gif\\')","foo.gif"},
-
- };
+ {"url(''foo.gif'')","foo.gif"},
+ {"url( foo.gif )","foo.gif"},
+ {"url('''')"}
+ };
for(String[] testa : tests) {
checkExtract(testa);
}
- // boolean except = false;
-// HTMLMetaData md = new HTMLMetaData(new MetaData());
-// ExtractingParseObserver epo = new ExtractingParseObserver(md);
-// for(String css : tests) {
-// try {
-// TextNode tn = new TextNode(css);
-// epo.handleStyleNode(tn);
-// } catch(Exception e) {
-// System.err.format("And the winner is....(%s)\n", css);
-// e.printStackTrace();
-// except = true;
-// throw e;
-// }
-// assertFalse(except);
-// }
}
+
+ /**
+ * Test whether the pattern matcher does extract nothing and also does not
+ * not hang-up if an overlong CSS link is truncated.
+ */
+ public void testHandleStyleNodeNoHangupTruncated() throws Exception {
+ StringBuilder sb = new StringBuilder();
+ sb.append("url(");
+ for (int i = 0; i < 500000; i++)
+ sb.append('\'');
+ sb.append("foo.gif");
+ for (int i = 0; i < 499000; i++)
+ sb.append('\'');
+ String[] test = new String[1];
+ test[0] = sb.toString();
+ checkExtract(test);
+ }
+
private void checkExtract(String[] data) throws JSONException {
// System.err.format("CSS(%s) want[0](%s)\n",css,want[0]);
String css = data[0];
- boolean except = false;
HTMLMetaData md = new HTMLMetaData(new MetaData());
ExtractingParseObserver epo = new ExtractingParseObserver(md);
try {
@@ -87,7 +94,8 @@ private void checkExtract(String[] data) throws JSONException {
assertTrue(o instanceof JSONObject);
JSONObject jo = (JSONObject) o;
- assertEquals(data[i],jo.getString("href"));
+ assertEquals("CSS link extraction failed for <" + css + ">",
+ data[i], jo.getString("href"));
}
} else {
assertNull(a);
From 194a1faecf30905c840d71d0bc22b6ea5d6a61fe Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 18 Jan 2017 12:29:43 +0100
Subject: [PATCH 047/240] CSS link extraction: clip also unpaired leading and
trailing quotation marks
---
.../html/ExtractingParseObserver.java | 64 +++++++------------
.../html/ExtractingParseObserverTest.java | 9 +--
2 files changed, 27 insertions(+), 46 deletions(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index 45a48808..deb8c8c0 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -22,13 +22,18 @@ public class ExtractingParseObserver implements ParseObserver {
protected static String cssUrlPatString =
"url\\s*\\(\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\\)";
+ protected static String cssUrlTrimPatString =
+ "^(?:\\\\?[\"'])+|(?:\\\\?[\"'])+$";
protected static String cssImportNoUrlPatString =
- "@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\\('[^']+'\\))|(?:\\(\"[^\"]+\"\\))|(?:\\([^)]+\\))|(?:[a-z0-9_.:/\\\\-]+))\\s*;";
+ "@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\\('[^']+'\\))|(?:\\(\"[^\"]+\"\\))|(?:\\([^)]+\\))|(?:[a-z0-9_.:/\\\\-]+))\\s*;";
protected static Pattern cssImportNoUrlPattern = Pattern
.compile(cssImportNoUrlPatString);
protected static Pattern cssUrlPattern = Pattern.compile(cssUrlPatString);
+
+ protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString);
+
private final static int MAX_TEXT_LEN = 100;
// private static String GLOBAL_ATTR[] = {"background"};
@@ -368,45 +373,20 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
}
- private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) {
- Matcher m = pattern.matcher(content);
- int idx = 0;
- int contentLen = content.length();
- if (contentLen > 100000)
- // extract URLs only from the first 100 kB
- contentLen = 100000;
- FIND:
- while((idx < contentLen) && m.find()) {
- idx = m.end();
- String url = m.group(1);
- if(url.length() < 2) {
- continue;
- }
- if ((url.charAt(0) == '(')
- && (url.charAt(url.length()-1) == ')')) {
- url = url.substring(1, url.length() - 1);
- }
- CLIP:
- while (url.length() > 1) {
- if ((url.charAt(0) == '"' || url.charAt(0) == '\'')
- && (url.charAt(url.length() - 1) == '"'
- || url.charAt(url.length() - 1) == '\'')) {
- if(url.length() <= 2) {
- // empty URL
- continue FIND;
- }
- url = url.substring(1, url.length() - 1);
- } else if (url.charAt(0) == '\\') {
- if(url.length() <= 4) {
- // empty URL
- continue FIND;
- }
- url = url.substring(2, url.length() - 2);
- } else {
- break CLIP;
- }
- }
- data.addHref("path","STYLE/#text","href",url);
- }
- }
+ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) {
+ Matcher m = pattern.matcher(content);
+ int idx = 0;
+ int contentLen = content.length();
+ if (contentLen > 100000)
+ // extract URLs only from the first 100 kB
+ contentLen = 100000;
+ while((idx < contentLen) && m.find()) {
+ idx = m.end();
+ String url = m.group(1);
+ url = cssUrlTrimPattern.matcher(url).replaceAll("");
+ if (!url.isEmpty()) {
+ data.addHref("path","STYLE/#text","href", url);
+ }
+ }
+ }
}
diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
index 236b964b..bfbd6f02 100644
--- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
+++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -20,8 +20,8 @@ public void testHandleStyleNodeExceptions() throws Exception {
"url('\")",
"url(')",
"url('\"')",
- "url('\\\"\"')",
- "url(''''')"
+ "url('\\\"\"')",
+ "url(''''')"
};
boolean except = false;
HTMLMetaData md = new HTMLMetaData(new MetaData());
@@ -50,7 +50,8 @@ public void testHandleStyleNode() throws Exception {
{"url(\\'foo.gif\\')","foo.gif"},
{"url(''foo.gif'')","foo.gif"},
{"url( foo.gif )","foo.gif"},
- {"url('''')"}
+ {"url('''')"},
+ {"url('foo.gif'')","foo.gif"},
};
for(String[] testa : tests) {
checkExtract(testa);
@@ -98,7 +99,7 @@ private void checkExtract(String[] data) throws JSONException {
data[i], jo.getString("href"));
}
} else {
- assertNull(a);
+ assertNull("Expected no extracted link for <" + css + ">", a);
}
}
From 038402885f85a426601d5f85936e210e4f55636f Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Fri, 27 Jan 2017 08:59:25 +0100
Subject: [PATCH 048/240] CharsetDetector: remove unnecessary check for empty
string (contributed by @ldko)
---
.../java/org/archive/format/text/charset/CharsetDetector.java | 2 --
1 file changed, 2 deletions(-)
diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java
index 9b4c8523..690f8b99 100644
--- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java
+++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java
@@ -183,8 +183,6 @@ private static String trimAttrValue(String value) {
return value;
}
String result = value;
- if (result.isEmpty())
- return result;
if (result.charAt(0) == '"') {
result = result.substring(1, result.length() - 1);
} else if (result.charAt(0) == '\'') {
From 1364716a83911369de7256aa1718a236acb75973 Mon Sep 17 00:00:00 2001
From: Lauren Ko
Date: Tue, 14 Feb 2017 17:07:36 -0600
Subject: [PATCH 049/240] Logging changes for next release.
---
CHANGES.md | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/CHANGES.md b/CHANGES.md
index 52c40f42..fee29e16 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,11 @@
+1.1.8
+-----
+* [Strip empty port via URLParser](https://github.com/iipc/webarchive-commons/pull/69/)
+* [Use CharsetDetector to guess encoding of HTML documents](https://github.com/iipc/webarchive-commons/pull/68/)
+* [Fix last header was lost if LF LF](https://github.com/iipc/webarchive-commons/pull/65/)
+* [Make regular expression to extract URLs from CSS more restrictive](https://github.com/iipc/webarchive-commons/pull/63)
+* [Remove invalid constant `PROFILE_REVISIT_URI_AGNOSTIC_IDENTICAL_DIGEST`](https://github.com/iipc/webarchive-commons/pull/62)
+
1.1.7
-----
* [Make canonicalizer be able to strip session id params even if they are the first params in the query string](https://github.com/iipc/webarchive-commons/pull/54)
@@ -36,10 +44,10 @@
1.1.2
-----
-* Fixed support for reading uncompressed WARCs, along with some unit testing. (https://github.com/iipc/webarchive-commons/pull/12)
+* [Fixed support for reading uncompressed WARCs, along with some unit testing.](https://github.com/iipc/webarchive-commons/pull/12)
1.1.1
-----
-* Renamed from commons-webarchive to webarchive-commons (https://github.com/iipc/webarchive-commons/pull/8)
-* Cope with malformed GZip extra fields as produced by wget 1.14 (https://github.com/iipc/webarchive-commons/pull/10)
-* Switch to httpcomponents, and add IA deployment information. (https://github.com/iipc/webarchive-commons/pull/11)
+* [Renamed from commons-webarchive to webarchive-commons](https://github.com/iipc/webarchive-commons/pull/8)
+* [Cope with malformed GZip extra fields as produced by wget 1.14](https://github.com/iipc/webarchive-commons/pull/10)
+* [Switch to httpcomponents, and add IA deployment information.](https://github.com/iipc/webarchive-commons/pull/11)
From 11579c2baab0db08f14341f70b848353eed17269 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 22 Feb 2017 13:11:13 +0100
Subject: [PATCH 050/240] Improve HTML link extraction - add extractors for
more elements which can take URLs as attribute values, add missing
attributes - generalize extraction of "global" attributes (`background`) -
add custom data attributes frequently used for linking (`data-href`,
`data-uri`) - add unit test to cover link extraction
---
.../html/ExtractingParseObserver.java | 79 ++++-
.../html/ExtractingParseObserverTest.java | 161 +++++++++
.../resource/html/link-extraction-test.warc | 320 ++++++++++++++++++
3 files changed, 551 insertions(+), 9 deletions(-)
create mode 100644 src/test/resources/org/archive/resource/html/link-extraction-test.warc
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index deb8c8c0..826851e0 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -2,12 +2,17 @@
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Locale;
import java.util.Map;
+import java.util.Set;
import java.util.Stack;
+import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.archive.format.text.html.ParseObserver;
+import org.htmlparser.Attribute;
import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
@@ -36,11 +41,10 @@ public class ExtractingParseObserver implements ParseObserver {
private final static int MAX_TEXT_LEN = 100;
-// private static String GLOBAL_ATTR[] = {"background"};
-
private static final String PATH = "path";
private static final String PATH_SEPARATOR = "@/";
- private final static Map extractors;
+ private static final Map extractors;
+ private static final Set globalHrefAttributes;
static {
extractors = new HashMap();
extractors.put("A", new AnchorTagExtractor());
@@ -57,6 +61,22 @@ public class ExtractingParseObserver implements ParseObserver {
extractors.put("META", new MetaTagExtractor());
extractors.put("OBJECT", new ObjectTagExtractor());
extractors.put("SCRIPT", new ScriptTagExtractor());
+ extractors.put("Q", new QuotationLinkTagExtractor());
+ extractors.put("BLOCKQUOTE", new QuotationLinkTagExtractor());
+ extractors.put("DEL", new QuotationLinkTagExtractor());
+ extractors.put("INS", new QuotationLinkTagExtractor());
+ // HTML5:
+ extractors.put("BUTTON", new ButtonTagExtractor());
+ extractors.put("MENUITEM", new MenuitemTagExtractor());
+ extractors.put("VIDEO", new EmbedVideoTagExtractor());
+ extractors.put("AUDIO", new EmbedTagExtractor());
+ extractors.put("TRACK", new EmbedTagExtractor());
+ extractors.put("SOURCE", new EmbedTagExtractor());
+
+ globalHrefAttributes = new HashSet();
+ globalHrefAttributes.add("background");
+ globalHrefAttributes.add("data-href");
+ globalHrefAttributes.add("data-uri");
}
@@ -84,11 +104,19 @@ public void handleTagOpen(TagNode tag) {
inTitle = !tag.isEmptyXmlTag();
return;
}
+
// first the global attributes:
- // background
- String v = tag.getAttribute("background");
- if(v != null) {
- data.addHref(PATH,makePath(name,"background"),"url",v);
+ Vector attributes = tag.getAttributesEx();
+ for (Attribute a : attributes) {
+ String attrName = a.getName();
+ String attrValue = a.getValue();
+ if (attrName == null || attrValue == null) {
+ continue;
+ }
+ attrName = attrName.toLowerCase(Locale.ROOT);
+ if (globalHrefAttributes.contains(attrName)) {
+ data.addHref(PATH,makePath(name,attrName),"url",attrValue);
+ }
}
// TODO: style attribute, BASE(href) tag, Resolve URLs
@@ -296,12 +324,24 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
+ private static class ButtonTagExtractor implements TagExtractor {
+ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+ addBasicHrefs(data,node,"formaction");
+ }
+ }
+
private static class EmbedTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src");
}
}
+ private static class EmbedVideoTagExtractor implements TagExtractor {
+ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+ addBasicHrefs(data,node,"src","poster");
+ }
+ }
+
private static class FormTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = new ArrayList();
@@ -329,21 +369,26 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
addBasicHrefs(data,node,"src");
}
}
+
private static class IFrameTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src");
}
}
+
private static class ImgTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addHrefWithAttrs(data,node,"src","alt","title");
+ addBasicHrefs(data,node,"longdesc");
}
}
+
private static class InputTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
- addBasicHrefs(data,node,"src");
+ addBasicHrefs(data,node,"src","formaction");
}
}
+
private static class LinkTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = getAttrListUrl(node,"href","rel","type");
@@ -352,6 +397,13 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
}
+
+ private static class MenuitemTagExtractor implements TagExtractor {
+ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+ addBasicHrefs(data,node,"icon");
+ }
+ }
+
private static class MetaTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = getAttrList(node,"name","rel","content","http-equiv");
@@ -360,11 +412,19 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
}
+
private static class ObjectTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
- addBasicHrefs(data,node,"codebase","cdata");
+ addBasicHrefs(data,node,"codebase","cdata","data");
}
}
+
+ private static class QuotationLinkTagExtractor implements TagExtractor {
+ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+ addBasicHrefs(data,node,"cite");
+ }
+ }
+
private static class ScriptTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = getAttrListUrl(node,"src","type");
@@ -373,6 +433,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
}
+
private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) {
Matcher m = pattern.matcher(content);
int idx = 0;
diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
index bfbd6f02..8f690a06 100644
--- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
+++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -1,15 +1,33 @@
package org.archive.resource.html;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import org.archive.extract.ExtractingResourceFactoryMapper;
+import org.archive.extract.ExtractingResourceProducer;
+import org.archive.extract.ProducerUtils;
+import org.archive.extract.ResourceFactoryMapper;
import org.archive.resource.MetaData;
+import org.archive.resource.Resource;
+import org.archive.resource.ResourceParseException;
+import org.archive.resource.ResourceProducer;
import org.htmlparser.nodes.TextNode;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.Multimap;
+
import junit.framework.TestCase;
public class ExtractingParseObserverTest extends TestCase {
+ private static final Logger LOG =
+ Logger.getLogger(ExtractingParseObserverTest.class.getName());
+
public void testHandleStyleNodeExceptions() throws Exception {
String[] tests = {
"some css",
@@ -103,5 +121,148 @@ private void checkExtract(String[] data) throws JSONException {
}
}
+ private void checkLink(Multimap links, String url, String path) {
+ assertTrue("Link with URL " + url + " not found", links.containsKey(url));
+ assertTrue("Wrong path " + path + " for " + url, links.get(url).contains(path));
+ }
+
+ private void checkLinks(Resource resource, String[][] expectedLinks) {
+ assertNotNull(resource);
+ assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
+ MetaData md = resource.getMetaData();
+ LOG.info(md.toString());
+ Multimap links = ArrayListMultimap.create();
+ JSONObject head = md.optJSONObject("Head");
+ if (head != null) {
+ //
+ String baseUrl = (String) head.opt("Base");
+ if (baseUrl != null) {
+ links.put(baseUrl, "__base__");
+ }
+ //
+ JSONArray metas = head.optJSONArray("Metas");
+ if (metas != null) {
+ for (int i = 0; i < metas.length(); i++) {
+ JSONObject o = (JSONObject) metas.optJSONObject(i);
+ String httpEquiv = o.optString("http-equiv");
+ if (httpEquiv != null && httpEquiv.equalsIgnoreCase("Refresh")) {
+ String metaRefreshTarget = o.optString("content");
+ if (metaRefreshTarget != null) {
+ metaRefreshTarget = metaRefreshTarget.replaceFirst("(?i)(?:^\\d+\\s*;)?\\s*url=", "");
+ links.put(metaRefreshTarget, "__meta_refresh__");
+ }
+ }
+ }
+ }
+ }
+ // extract outlinks
+ List linkArrays = new ArrayList();
+ if (md.optJSONArray("Links") != null) {
+ linkArrays.add(md.optJSONArray("Links"));
+ }
+ try {
+ if (md.getJSONObject("Head") != null && md.getJSONObject("Head").getJSONArray("Link") != null) {
+ linkArrays.add(md.getJSONObject("Head").getJSONArray("Link"));
+ }
+ } catch (JSONException e1) {
+ }
+ for (JSONArray ldata : linkArrays) {
+ for (int i = 0; i < ldata.length(); i++) {
+ JSONObject o = (JSONObject) ldata.optJSONObject(i);
+ try {
+ String url = o.getString("url");
+ links.put(url, o.getString("path"));
+ LOG.info(" found link: " + o.getString("url") + " " + o.getString("path"));
+ } catch (JSONException e) {
+ fail("Failed to extract URL from link: " + e.getMessage());
+ }
+ }
+ }
+ assertEquals("Unexpected number of links", expectedLinks.length, links.size());
+ for (String[] l : expectedLinks) {
+ checkLink(links, l[0], l[1]);
+ }
+ }
+
+ public void testLinkExtraction() throws ResourceParseException, IOException {
+ String testFileName = "link-extraction-test.warc";
+ ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
+ ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
+ ExtractingResourceProducer extractor =
+ new ExtractingResourceProducer(producer, mapper);
+ extractor.getNext(); // skip warcinfo record
+ String[][] html4links = {
+ {"http://www.example.com/", "__base__"},
+ {"http://www.example.com/redirected.html", "__meta_refresh__"},
+ {"background.jpg", "BODY@/background"},
+ {"http://www.example.com/a-href.html", "A@/href"},
+ {"#anchor", "A@/href"},
+ {"image.png", "IMG@/src"},
+ {"image.gif", "IMG@/src"},
+ {"http://example.com/image-description.html#image.gif", "IMG@/longdesc"},
+ {"helloworld.swf", "OBJECT@/data"},
+ {"http://www.example.com/shakespeare.html", "Q@/cite"},
+ {"http://www.example.com/shakespeare-long.html", "BLOCKQUOTE@/cite"}
+ };
+ checkLinks(extractor.getNext(), html4links);
+ String[][] html5links = {
+ {"http:///www.example.com/video.html", "LINK@/href", "canonical"},
+ {"video.rss", "LINK@/href", "alternate"},
+ {"https://archive.org/download/WebmVp8Vorbis/webmvp8.gif", "VIDEO@/poster"},
+ {"https://archive.org/download/WebmVp8Vorbis/webmvp8.webm", "SOURCE@/src"},
+ {"https://archive.org/download/WebmVp8Vorbis/webmvp8_512kb.mp4", "SOURCE@/src"},
+ {"https://archive.org/download/WebmVp8Vorbis/webmvp8.ogv", "SOURCE@/src"}
+ };
+ checkLinks(extractor.getNext(), html5links);
+ String[][] html5links2 = {
+ {"http://www.example.com/", "A@/href"},
+ };
+ checkLinks(extractor.getNext(), html5links2);
+ String[][] fbVideoLinks = {
+ {"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
+ {"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"},
+ {"https://www.facebook.com/facebook/", "A@/href"},
+ {"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"}
+ };
+ checkLinks(extractor.getNext(), fbVideoLinks);
+ String[][] dataHrefLinks = {
+ {"standard.css", "LINK@/href", "stylesheet"},
+ {"https://www.facebook.com/elegantthemes/videos/10153760379211923/", "DIV@/data-href"},
+ {"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"},
+ {"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
+ {"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"},
+ {"https://www.facebook.com/facebook/", "A@/href"},
+ {"//edge.flowplayer.org/bauhaus.webm", "SOURCE@/src"},
+ {"//edge.flowplayer.org/bauhaus.mp4", "SOURCE@/src"},
+ {"//edge.flowplayer.org/functional.webm", "BUTTON@/data-href"},
+ {"/content-page", "ARTICLE@/data-href"},
+ {"/content-page", "A@/href"},
+ {"/tags/content","A@/href"},
+ {"/tags/headlines", "A@/href"},
+ {"http://grabaperch.com", "DIV@/data-href"},
+ {"green.css", "LINK@/data-href"},
+ {"blue.css", "LINK@/data-href"},
+ {"http://codecanyon.net/user/CodingJack", "A@/data-href"},
+ {"jackbox/img/thumbs/4.jpg", "IMG@/src"},
+ {"//venobox-destination", "A@/data-href"},
+ {"#", "A@/href"},
+ {"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0&autoplay=1", "DIV@/data-href"},
+ {"#", "A@/href"},
+ {"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0", "IFRAME@/src"}
+ };
+ checkLinks(extractor.getNext(), dataHrefLinks);
+ String[][] fbSocialLinks = {
+ {"http://www.your-domain.com/your-page.html", "DIV@/data-uri"},
+ {"https://developers.facebook.com/docs/plugins/comments#configurator", "DIV@/data-href"},
+ {"https://www.facebook.com/zuck/posts/10102735452532991?comment_id=1070233703036185", "DIV@/data-href"},
+ {"https://www.facebook.com/zuck", "DIV@/data-href"},
+ {"https://developers.facebook.com/docs/plugins/", "DIV@/data-href"},
+ {"https://www.facebook.com/facebook", "DIV@/data-href"},
+ {"https://www.facebook.com/facebook", "BLOCKQUOTE@/cite"},
+ {"https://www.facebook.com/facebook", "A@/href"},
+ {"http://www.your-domain.com/your-page.html", "DIV@/data-href"}
+ };
+ checkLinks(extractor.getNext(), fbSocialLinks);
+ }
}
diff --git a/src/test/resources/org/archive/resource/html/link-extraction-test.warc b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
new file mode 100644
index 00000000..ab0e54c8
--- /dev/null
+++ b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
@@ -0,0 +1,320 @@
+WARC/1.0
+WARC-Type: warcinfo
+Content-Type: application/warc-fields
+WARC-Date: 2017-02-20T14:00:56Z
+Content-Length: 128
+
+format: WARC File Format 1.0
+conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf
+robots: classic
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2017-02-20T14:00:56Z
+WARC-Target-URI: http://www.example.com/html4.html
+Content-Type: application/http; msgtype=response
+Content-Length: 1243
+
+HTTP/1.1 200 OK
+Date: Mon, 20 Feb 2017 14:00:56 GMT
+Content-Length: 1125
+Content-Type: application/xhtml+xml
+
+
+
+
+
+
+
+Test XHTML Link Extraction
+
+
+A@/href
+
+
+
From 6aa43f83a2cbc2acd0feb7f2c81d66f4ef1b13c5 Mon Sep 17 00:00:00 2001
From: Mohamed Elsayed
Date: Thu, 2 Mar 2017 15:28:16 +0200
Subject: [PATCH 051/240] Fix #25: move missing unit tests over from Heritrix3
---
.../archive/io/ArchiveReaderFactoryTest.java | 94 +++
.../io/BufferedSeekInputStreamTest.java | 67 ++
.../archive/io/HeaderedArchiveRecordTest.java | 209 ++++++
.../archive/io/RecordingInputStreamTest.java | 132 ++++
.../archive/io/ReplayCharSequenceTest.java | 391 ++++++++++
.../io/RepositionableInputStreamTest.java | 70 ++
.../org/archive/io/arc/ARCWriterPoolTest.java | 122 +++
.../org/archive/io/arc/ARCWriterTest.java | 699 ++++++++++++++++++
.../org/archive/io/warc/WARCWriterTest.java | 512 +++++++++++++
.../org/archive/uid/UUIDGeneratorTest.java | 44 ++
.../java/org/archive/util/FileUtilsTest.java | 271 +++++++
.../org/archive/util/MimetypeUtilsTest.java | 63 ++
.../org/archive/util/PropertyUtilsTest.java | 45 ++
.../org/archive/util/anvl/ANVLRecordTest.java | 128 ++++
14 files changed, 2847 insertions(+)
create mode 100644 src/test/java/org/archive/io/ArchiveReaderFactoryTest.java
create mode 100644 src/test/java/org/archive/io/BufferedSeekInputStreamTest.java
create mode 100644 src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
create mode 100644 src/test/java/org/archive/io/RecordingInputStreamTest.java
create mode 100644 src/test/java/org/archive/io/ReplayCharSequenceTest.java
create mode 100644 src/test/java/org/archive/io/RepositionableInputStreamTest.java
create mode 100644 src/test/java/org/archive/io/arc/ARCWriterPoolTest.java
create mode 100644 src/test/java/org/archive/io/arc/ARCWriterTest.java
create mode 100644 src/test/java/org/archive/io/warc/WARCWriterTest.java
create mode 100644 src/test/java/org/archive/uid/UUIDGeneratorTest.java
create mode 100644 src/test/java/org/archive/util/FileUtilsTest.java
create mode 100644 src/test/java/org/archive/util/MimetypeUtilsTest.java
create mode 100644 src/test/java/org/archive/util/PropertyUtilsTest.java
create mode 100644 src/test/java/org/archive/util/anvl/ANVLRecordTest.java
diff --git a/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java b/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java
new file mode 100644
index 00000000..2313868c
--- /dev/null
+++ b/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java
@@ -0,0 +1,94 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Iterator;
+
+import org.apache.commons.lang.StringUtils;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.arc.ARCWriterTest;
+import org.archive.util.TmpDirTestCase;
+
+public class ArchiveReaderFactoryTest extends TmpDirTestCase {
+ /**
+ * Test local file as URL
+ * @throws IOException
+ */
+ public void testGetFileURL() throws IOException {
+ File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
+ ArchiveReader reader = null;
+ try {
+ reader = ArchiveReaderFactory.
+ get(new URL("file:////" + arc.getAbsolutePath()));
+ for (Iterator i = reader.iterator(); i.hasNext();) {
+ ArchiveRecord r = (ArchiveRecord)i.next();
+ assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype()));
+ }
+ } finally {
+ if (reader != null) {
+ reader.close();
+ }
+ }
+ }
+
+ /**
+ * Test local file as File
+ * @throws IOException
+ */
+ public void testGetFile() throws IOException {
+ File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
+ ArchiveReader reader = null;
+ try {
+ reader = ArchiveReaderFactory.get(arc.getAbsoluteFile());
+ for (Iterator i = reader.iterator(); i.hasNext();) {
+ ArchiveRecord r = (ArchiveRecord)i.next();
+ assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype()));
+ }
+ } finally {
+ if (reader != null) {
+ reader.close();
+ }
+ }
+ }
+
+ /**
+ * Test local file as String path
+ * @throws IOException
+ */
+ public void testGetPath() throws IOException {
+ File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
+ ArchiveReader reader = null;
+ try {
+ reader = ArchiveReaderFactory.get(arc.getAbsoluteFile().getAbsolutePath());
+ for (Iterator i = reader.iterator(); i.hasNext();) {
+ ArchiveRecord r = (ArchiveRecord)i.next();
+ assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype()));
+ }
+ } finally {
+ if (reader != null) {
+ reader.close();
+ }
+ }
+ }
+}
diff --git a/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java b/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java
new file mode 100644
index 00000000..270e45e0
--- /dev/null
+++ b/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java
@@ -0,0 +1,67 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.util.Random;
+
+import junit.framework.TestCase;
+
+
+/**
+ * Unit test for BufferedSeekInputStream. The tests do some random
+ * repositioning in the stream to make sure the buffer is always valid.
+ *
+ * @author pjack
+ */
+public class BufferedSeekInputStreamTest extends TestCase {
+
+
+ private static byte[] TEST_DATA = makeTestData();
+
+ public void testPosition() throws Exception {
+ Random random = new Random();
+ ArraySeekInputStream asis = new ArraySeekInputStream(TEST_DATA);
+ BufferedSeekInputStream bsis = new BufferedSeekInputStream(asis, 11);
+ for (int i = 0; i < TEST_DATA.length; i++) {
+ byte b = (byte)bsis.read();
+ assertEquals(TEST_DATA[i], b);
+ }
+ for (int i = 0; i < 1000; i++) {
+ int index = random.nextInt(TEST_DATA.length);
+ bsis.position(index);
+ char expected = (char)((int)TEST_DATA[index] & 0xFF);
+ char read = (char)(bsis.read() & 0xFF);
+ assertEquals(expected, read);
+ }
+ }
+
+
+ private static byte[] makeTestData() {
+ String s = "If the dull substance of my flesh were thought\n"
+ + "Injurious distance could not stop my way\n"
+ + "For then, despite of space, I would be brought\n"
+ + "From limits far remote where thou dost stay.\n";
+ byte[] r = new byte[s.length()];
+ for (int i = 0; i < r.length; i++) {
+ r[i] = (byte)s.charAt(i);
+// r[i] = (byte)s.charAt(i);
+ }
+ return r;
+ }
+}
diff --git a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
new file mode 100644
index 00000000..9f7e2a15
--- /dev/null
+++ b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
@@ -0,0 +1,209 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+import org.apache.commons.httpclient.Header;
+import org.archive.io.arc.ARCRecord;
+import org.archive.io.warc.WARCRecord;
+
+public class HeaderedArchiveRecordTest extends TestCase {
+ private static final String HTTPHEADER = "HTTP/1.1 200 OK\r\n"
+ + "Last-Modified: Sun, 28 Aug 2005 14:10:55 GMT\r\n"
+ + "Content-Length: 108\r\n" + "Connection: close\r\n"
+ + "Content-Type: text/html\r\n" + "\r\n";
+ private static final String BODY = "\r\n" + " \r\n"
+ + " Neue Seite 1\r\n" + " \r\n"
+ + " \r\n" + " \r\n" + "";
+
+ public void testParseHttpHeadersInWARC() throws IOException {
+ final String url = "http://foo.maths.uq.edu.au/index.html";
+ // final String warcHeader = "WARC/0.10 000000000486 response " +
+ // url + " 20070315152520 " +
+ // "urn:uuid:d8b342a8-dba4-4d7f-a551-1d8184f2ff58 " +
+ // "application/http; msgtype=response\r\n" +
+ // "Checksum: sha1:IT6YEX5WHKK57GOEHV2YHTTXEP5KPM6A\r\n" +
+ // "IP-Address: 80.150.6.184\r\n" +
+ // "\r\n";
+
+ final String warcHeader = "WARC/0.12\r\n"
+ + "MIME-Version: 1.0\r\n"
+ + "WARC-Record-Type: response\r\n"
+ + "WARC-Target-URI: http://foo.maths.uq.edu.au/index.html\r\n"
+ + "WARC-Date: 2006-09-19T17:20:24Z\r\n"
+ + "WARC-Digest: sha1:IT6YEX5WHKK57GOEHV2YHTTXEP5KPM6A\r\n"
+ + "WARC-IP-Address: 80.150.6.184\r\n"
+ + "Content-ID: \r\n"
+ + "Content-Type: application/http; msgtype=response\r\n"
+ + "Content-Length: " + (HTTPHEADER.length() + BODY.length()) + "\r\n"
+ + "\r\n";
+
+ final String hdr = warcHeader + HTTPHEADER + BODY;
+
+ WARCRecord r = new WARCRecord(new ByteArrayInputStream(hdr.getBytes()),
+ "READER_IDENTIFIER", 0, false, true);
+ HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
+
+ har.skipHttpHeader();
+
+ byte[] b = new byte[BODY.length()];
+ har.read(b);
+ String bodyRead = new String(b);
+ assertEquals(BODY, bodyRead);
+ assertHeaderCorrectlyParsed(har.getContentHeaders());
+ assertEquals("failed to retrieve Url from metadata", har.getHeader()
+ .getUrl(), url);
+ }
+
+ public void testParseHttpHeadersInARC() throws IOException {
+ final int len = HTTPHEADER.length() + BODY.length();
+ final int contentLength = BODY.length();
+ final String url = "http://www.ly.gov.tw:80/accpart.htm";
+ final String hdr = HTTPHEADER + BODY;
+ // Interesting difference between ARCRecord and WARCRecord is that the
+ // stream passed the ARCRecord is supposed to be just past the
+ // ARCRecord metadata line where as stream passed WARCRecord is at
+ // record start. TODO: Add to ARCRecord constructor that doesn't
+ // take an ArchiveRecordHeader but rather parses it from the stream.
+ ArchiveRecordHeader arh = new ArchiveRecordHeader() {
+ public int getContentBegin() {
+ // TODO: In ARCs, this is where http headers end and
+ // the content begins. Need to reconcile for generic
+ // HeaderedArchiveRecord processing. In this context, it
+ // makes sense setting it to zero -- HeaderedArchiveRecord
+ // will then figure it out.
+ return 0;
+ }
+
+ public String getDate() {
+ return null;
+ }
+
+ public String getDigest() {
+ return null;
+ }
+
+ public Set getHeaderFieldKeys() {
+ return null;
+ }
+
+ public Map getHeaderFields() {
+ return null;
+ }
+
+ public Object getHeaderValue(String key) {
+ return null;
+ }
+
+ public long getLength() {
+ return len;
+ }
+
+ public long getContentLength() {
+ return contentLength;
+ }
+
+ public String getMimetype() {
+ return null;
+ }
+
+ public long getOffset() {
+ return 0;
+ }
+
+ public String getReaderIdentifier() {
+ return null;
+ }
+
+ public String getRecordIdentifier() {
+ return null;
+ }
+
+ public String getUrl() {
+ return url;
+ }
+
+ public String getVersion() {
+ return null;
+ }
+
+ };
+ ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes()),
+ arh, 0, false, true, false);
+
+ HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
+ har.skipHttpHeader();
+ byte[] b = new byte[BODY.length()];
+ har.read(b);
+ String bodyRead = new String(b);
+ assertEquals(BODY, bodyRead);
+ assertHeaderCorrectlyParsed(har.getContentHeaders());
+ }
+
+ public void testEasierParseHttpHeadersInARC() throws IOException {
+ final String url = "http://www.archive.org/index.htm";
+ final String arcHeader = url
+ + " 192.168.0.1 20070515111004 text/html 167568\n";
+ final String hdr = arcHeader + HTTPHEADER + BODY;
+
+ ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes()),
+ "READER_IDENTIFIER", 0, false, true, false);
+
+ HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
+ har.skipHttpHeader();
+ byte[] b = new byte[BODY.length()];
+ har.read(b);
+ String bodyRead = new String(b);
+ assertEquals(BODY, bodyRead);
+ assertHeaderCorrectlyParsed(har.getContentHeaders());
+ assertEquals("failed to retrieve Url from metadata", har.getHeader()
+ .getUrl(), url);
+ }
+
+ private void assertHeaderCorrectlyParsed(Header[] headers) {
+ final List orgHeaders = Arrays.asList(HTTPHEADER.split("\r\n"));
+ assertEquals("not all HTTP header entries have been retrieved",
+ orgHeaders.size(), headers.length + 1);
+
+ for (Header header : headers) {
+ assertTrue(orgHeaders.contains(header.getName() + ": "
+ + header.getValue()));
+ }
+ }
+
+ public void testNoheaderWARC() throws IOException {
+ String b = "hello world";
+ String c = "WARC/0.12\r\nContent-Type: text/plain\r\n"
+ + "Content-Length: " + b.length() + "\r\n\r\n" + b;
+ org.archive.io.warc.WARCRecord r = new org.archive.io.warc.WARCRecord(
+ new ByteArrayInputStream(c.getBytes()), "READER_IDENTIFIER", 0,
+ false, true);
+ HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
+ assertTrue(har.isStrict());
+ }
+}
diff --git a/src/test/java/org/archive/io/RecordingInputStreamTest.java b/src/test/java/org/archive/io/RecordingInputStreamTest.java
new file mode 100644
index 00000000..20a8b8b3
--- /dev/null
+++ b/src/test/java/org/archive/io/RecordingInputStreamTest.java
@@ -0,0 +1,132 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.PipedInputStream;
+import java.io.PipedOutputStream;
+
+import org.archive.util.TmpDirTestCase;
+
+
+/**
+ * Test cases for RecordingInputStream.
+ *
+ * @author gojomo
+ */
+public class RecordingInputStreamTest extends TmpDirTestCase
+{
+
+
+ /*
+ * @see TmpDirTestCase#setUp()
+ */
+ protected void setUp() throws Exception
+ {
+ super.setUp();
+ }
+
+ /**
+ * Test readFullyOrUntil soft (no exception) and hard (exception)
+ * length cutoffs, timeout, and rate-throttling.
+ *
+ * @throws IOException
+ * @throws InterruptedException
+ * @throws RecorderTimeoutException
+ */
+ public void testReadFullyOrUntil() throws RecorderTimeoutException, IOException, InterruptedException
+ {
+ RecordingInputStream ris = new RecordingInputStream(16384, (new File(
+ getTmpDir(), "testReadFullyOrUntil").getAbsolutePath()));
+ ByteArrayInputStream bais = new ByteArrayInputStream(
+ "abcdefghijklmnopqrstuvwxyz".getBytes());
+ // test soft max
+ ris.open(bais);
+ ris.setLimits(10,0,0);
+ ris.readFullyOrUntil(7);
+ ris.close();
+ ReplayInputStream res = ris.getReplayInputStream();
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ res.readFullyTo(baos);
+ assertEquals("soft max cutoff","abcdefg",new String(baos.toByteArray()));
+ // test hard max
+ bais.reset();
+ baos.reset();
+ ris.open(bais);
+ boolean exceptionThrown = false;
+ try {
+ ris.setLimits(10,0,0);
+ ris.readFullyOrUntil(13);
+ } catch (RecorderLengthExceededException ex) {
+ exceptionThrown = true;
+ }
+ assertTrue("hard max exception",exceptionThrown);
+ ris.close();
+ res = ris.getReplayInputStream();
+ res.readFullyTo(baos);
+ assertEquals("hard max cutoff","abcdefghijk",
+ new String(baos.toByteArray()));
+ // test timeout
+ PipedInputStream pin = new PipedInputStream();
+ PipedOutputStream pout = new PipedOutputStream(pin);
+ ris.open(pin);
+ exceptionThrown = false;
+ trickle("abcdefghijklmnopqrstuvwxyz".getBytes(),pout);
+ try {
+ ris.setLimits(0,5000,0);
+ ris.readFullyOrUntil(0);
+ } catch (RecorderTimeoutException ex) {
+ exceptionThrown = true;
+ }
+ assertTrue("timeout exception",exceptionThrown);
+ ris.close();
+ // test rate limit
+ bais = new ByteArrayInputStream(new byte[1024*2*5]);
+ ris.open(bais);
+ long startTime = System.currentTimeMillis();
+ ris.setLimits(0,0,2);
+ ris.readFullyOrUntil(0);
+ long endTime = System.currentTimeMillis();
+ long duration = endTime - startTime;
+ assertTrue("read too fast: "+duration,duration>=5000);
+ ris.close();
+ }
+
+ protected void trickle(final byte[] bytes, final PipedOutputStream pout) {
+ new Thread() {
+ public void run() {
+ try {
+ for (int i = 0; i < bytes.length; i++) {
+ Thread.sleep(1000);
+ pout.write(bytes[i]);
+ }
+ pout.close();
+ } catch (IOException e) {
+ // do nothing
+ } catch (Exception e) {
+ System.err.print(e);
+ }
+ }
+ }.start();
+
+ }
+}
diff --git a/src/test/java/org/archive/io/ReplayCharSequenceTest.java b/src/test/java/org/archive/io/ReplayCharSequenceTest.java
new file mode 100644
index 00000000..9208594a
--- /dev/null
+++ b/src/test/java/org/archive/io/ReplayCharSequenceTest.java
@@ -0,0 +1,391 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.text.NumberFormat;
+import java.util.Date;
+import java.util.Random;
+import java.util.logging.Logger;
+
+import org.archive.util.FileUtils;
+import org.archive.util.TmpDirTestCase;
+
+import com.google.common.base.Charsets;
+
+/**
+ * Test ReplayCharSequences.
+ *
+ * @author stack, gojomo
+ * @version $Revision$, $Date$
+ */
+public class ReplayCharSequenceTest extends TmpDirTestCase
+{
+ /**
+ * Logger.
+ */
+ private static Logger logger =
+ Logger.getLogger("org.archive.io.ReplayCharSequenceFactoryTest");
+
+
+ private static final int SEQUENCE_LENGTH = 127;
+ private static final int MULTIPLIER = 3;
+ private static final int BUFFER_SIZE = SEQUENCE_LENGTH * MULTIPLIER;
+ private static final int INCREMENT = 1;
+
+ /**
+ * Buffer of regular content.
+ */
+ private byte [] regularBuffer = null;
+
+ /*
+ * @see TestCase#setUp()
+ */
+ protected void setUp() throws Exception
+ {
+ super.setUp();
+ this.regularBuffer =
+ fillBufferWithRegularContent(new byte [BUFFER_SIZE]);
+ }
+
+ public void testShiftjis() throws IOException {
+
+ // Here's the bytes for the JIS encoding of the Japanese form of Nihongo
+ byte[] bytes_nihongo = {
+ (byte) 0x1B, (byte) 0x24, (byte) 0x42, (byte) 0x46,
+ (byte) 0x7C, (byte) 0x4B, (byte) 0x5C, (byte) 0x38,
+ (byte) 0x6C, (byte) 0x1B, (byte) 0x28, (byte) 0x42,
+ (byte) 0x1B, (byte) 0x28, (byte) 0x42 };
+ final String ENCODING = "SJIS";
+ // Here is nihongo converted to JVM encoding.
+ String nihongo = new String(bytes_nihongo, ENCODING);
+
+ RecordingOutputStream ros = writeTestStream(
+ bytes_nihongo,MULTIPLIER,
+ "testShiftjis",MULTIPLIER);
+ // TODO: check for existence of overflow file?
+ ReplayCharSequence rcs = getReplayCharSequence(ros,Charset.forName(ENCODING));
+
+ // Now check that start of the rcs comes back in as nihongo string.
+ String rcsStr = rcs.subSequence(0, nihongo.length()).toString();
+ assertTrue("Nihongo " + nihongo + " does not equal converted string" +
+ " from rcs " + rcsStr,
+ nihongo.equals(rcsStr));
+ // And assert next string is also properly nihongo.
+ if (rcs.length() >= (nihongo.length() * 2)) {
+ rcsStr = rcs.subSequence(nihongo.length(),
+ nihongo.length() + nihongo.length()).toString();
+ assertTrue("Nihongo " + nihongo + " does not equal converted " +
+ " string from rcs (2nd time)" + rcsStr,
+ nihongo.equals(rcsStr));
+ }
+ }
+
+ public void testGetReplayCharSequenceByteZeroOffset() throws IOException {
+
+ RecordingOutputStream ros = writeTestStream(
+ regularBuffer,MULTIPLIER,
+ "testGetReplayCharSequenceByteZeroOffset",MULTIPLIER);
+ ReplayCharSequence rcs = getReplayCharSequence(ros);
+
+ for (int i = 0; i < MULTIPLIER; i++) {
+ accessingCharacters(rcs);
+ }
+ }
+
+ private ReplayCharSequence getReplayCharSequence(RecordingOutputStream ros) throws IOException {
+ return getReplayCharSequence(ros,null);
+ }
+
+ private ReplayCharSequence getReplayCharSequence(RecordingOutputStream ros, Charset charset) throws IOException {
+ return new GenericReplayCharSequence(ros.getReplayInputStream(),
+ ros.getBufferLength()/2, ros.backingFilename, charset);
+ }
+
+
+ public void testGetReplayCharSequenceMultiByteZeroOffset()
+ throws IOException {
+
+ RecordingOutputStream ros = writeTestStream(
+ regularBuffer,MULTIPLIER,
+ "testGetReplayCharSequenceMultiByteZeroOffset",MULTIPLIER);
+ ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.UTF_8);
+
+ for (int i = 0; i < MULTIPLIER; i++) {
+ accessingCharacters(rcs);
+ }
+ }
+
+ public void testReplayCharSequenceByteToString() throws IOException {
+ String fileContent = "Some file content";
+ byte [] buffer = fileContent.getBytes();
+ RecordingOutputStream ros = writeTestStream(
+ buffer,1,
+ "testReplayCharSequenceByteToString.txt",0);
+ ReplayCharSequence rcs = getReplayCharSequence(ros);
+ String result = rcs.toString();
+ assertEquals("Strings don't match",result,fileContent);
+ }
+
+ private String toHexString(String str)
+ {
+ if (str != null) {
+ StringBuilder buf = new StringBuilder("{ ");
+ buf.append(Integer.toString(str.charAt(0), 16));
+ for (int i = 1; i < str.length(); i++) {
+ buf.append(", ");
+ buf.append(Integer.toString(str.charAt(i), 16));
+ }
+ buf.append(" }");
+ return buf.toString();
+ }
+ else
+ return "null";
+ }
+
+ public void testSingleByteEncodings() throws IOException {
+ byte[] bytes = {
+ (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64,
+ (byte) 0x7d, (byte) 0x7e, (byte) 0x7f, (byte) 0x80,
+ (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84,
+ (byte) 0xfc, (byte) 0xfd, (byte) 0xfe, (byte) 0xff };
+
+ String latin1String = new String(bytes, "latin1");
+ RecordingOutputStream ros = writeTestStream(
+ bytes, 1, "testSingleByteEncodings-latin1.txt", 0);
+ ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.ISO_8859_1);
+ String result = rcs.toString();
+ logger.fine("latin1[0] " + toHexString(latin1String));
+ logger.fine("latin1[1] " + toHexString(result));
+ assertEquals("latin1 strings don't match", result, latin1String);
+
+ String w1252String = new String(bytes, "windows-1252");
+ ros = writeTestStream(
+ bytes, 1, "testSingleByteEncodings-windows-1252.txt", 0);
+ rcs = getReplayCharSequence(ros,Charset.forName("windows-1252"));
+ result = rcs.toString();
+ logger.fine("windows-1252[0] " + toHexString(w1252String));
+ logger.fine("windows-1252[1] " + toHexString(result));
+ assertEquals("windows-1252 strings don't match", result, w1252String);
+
+ String asciiString = new String(bytes, "ascii");
+ ros = writeTestStream(
+ bytes, 1, "testSingleByteEncodings-ascii.txt", 0);
+ rcs = getReplayCharSequence(ros,Charset.forName("ascii"));
+ result = rcs.toString();
+ logger.fine("ascii[0] " + toHexString(asciiString));
+ logger.fine("ascii[1] " + toHexString(result));
+ assertEquals("ascii strings don't match", result, asciiString);
+ }
+
+ public void testReplayCharSequenceByteToStringOverflow() throws IOException {
+ String fileContent = "Some file content. "; // ascii
+ byte [] buffer = fileContent.getBytes();
+ RecordingOutputStream ros = writeTestStream(
+ buffer,1,
+ "testReplayCharSequenceByteToStringOverflow.txt",1);
+ String expectedContent = fileContent+fileContent;
+
+ // The string is ascii which is a subset of both these encodings. Use
+ // both encodings because they exercise different code paths. UTF-8 is
+ // decoded to UTF-16 while windows-1252 is memory mapped directly. See
+ // GenericReplayCharSequence
+ ReplayCharSequence rcsUtf8 = getReplayCharSequence(ros,Charsets.UTF_8);
+ ReplayCharSequence rcs1252 = getReplayCharSequence(ros,Charset.forName("windows-1252"));
+
+ String result = rcsUtf8.toString();
+ assertEquals("Strings don't match", expectedContent, result);
+
+ result = rcs1252.toString();
+ assertEquals("Strings don't match", expectedContent, result);
+ }
+
+ public void testReplayCharSequenceByteToStringMulti() throws IOException {
+ String fileContent = "Some file content";
+ byte [] buffer = fileContent.getBytes("UTF-8");
+ final int MULTIPLICAND = 10;
+ StringBuilder sb =
+ new StringBuilder(MULTIPLICAND * fileContent.length());
+ for (int i = 0; i < MULTIPLICAND; i++) {
+ sb.append(fileContent);
+ }
+ String expectedResult = sb.toString();
+ RecordingOutputStream ros = writeTestStream(
+ buffer,1,
+ "testReplayCharSequenceByteToStringMulti.txt",MULTIPLICAND-1);
+ for (int i = 0; i < 3; i++) {
+ ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.UTF_8);
+ String result = rcs.toString();
+ assertEquals("Strings don't match", result, expectedResult);
+ rcs.close();
+ System.gc();
+ System.runFinalization();
+ }
+ }
+
+ public void xestHugeReplayCharSequence() throws IOException {
+ String fileContent = "01234567890123456789";
+ String characterEncoding = "ascii";
+ byte[] buffer = fileContent.getBytes(characterEncoding);
+
+ long reps = (long) Integer.MAX_VALUE / (long) buffer.length + 1000000l;
+
+ logger.info("writing " + (reps * buffer.length)
+ + " bytes to testHugeReplayCharSequence.txt");
+ RecordingOutputStream ros = writeTestStream(buffer, 0,
+ "testHugeReplayCharSequence.txt", reps);
+ ReplayCharSequence rcs = getReplayCharSequence(ros,Charset.forName(characterEncoding));
+
+ if (reps * fileContent.length() > (long) Integer.MAX_VALUE) {
+ assertTrue("ReplayCharSequence has wrong length (length()="
+ + rcs.length() + ") (should be " + Integer.MAX_VALUE + ")",
+ rcs.length() == Integer.MAX_VALUE);
+ } else {
+ assertEquals("ReplayCharSequence has wrong length (length()="
+ + rcs.length() + ") (should be "
+ + (reps * fileContent.length()) + ")", (long) rcs.length(),
+ reps * (long) fileContent.length());
+ }
+
+ // boundary cases or something
+ for (int index : new int[] { 0, rcs.length() / 4, rcs.length() / 2,
+ rcs.length() - 1, rcs.length() / 4 }) {
+ // logger.info("testing char at index=" +
+ // NumberFormat.getInstance().format(index));
+ assertEquals("Characters don't match (index="
+ + NumberFormat.getInstance().format(index) + ")",
+ fileContent.charAt(index % fileContent.length()), rcs
+ .charAt(index));
+ }
+
+ // check that out of bounds indices throw exception
+ for (int n : new int[] { -1, Integer.MIN_VALUE, rcs.length() + 1 }) {
+ try {
+ String message = "rcs.charAt(" + n + ")=" + rcs.charAt(n)
+ + " ?!? -- expected IndexOutOfBoundsException";
+ logger.severe(message);
+ fail(message);
+ } catch (IndexOutOfBoundsException e) {
+ logger.info("got expected exception: " + e);
+ }
+ }
+
+ // check some characters at random spots & kinda stress test the
+ // system's memory mapping facility
+ Random rand = new Random(0); // seed so we get the same ones each time
+ for (int i = 0; i < 5000; i++) {
+ int index = rand.nextInt(rcs.length());
+ // logger.info(i + ". testing char at index=" +
+ // NumberFormat.getInstance().format(index));
+ assertEquals("Characters don't match (index="
+ + NumberFormat.getInstance().format(index) + ")",
+ fileContent.charAt(index % fileContent.length()), rcs
+ .charAt(index));
+ }
+ }
+
+ /**
+ * Accessing characters test.
+ *
+ * Checks that characters in the rcs are in sequence.
+ *
+ * @param rcs The ReplayCharSequence to try out.
+ */
+ private void accessingCharacters(CharSequence rcs) {
+ long timestamp = (new Date()).getTime();
+ int seeks = 0;
+ for (int i = (INCREMENT * 2); (i + INCREMENT) < rcs.length();
+ i += INCREMENT) {
+ checkCharacter(rcs, i);
+ seeks++;
+ for (int j = i - INCREMENT; j < i; j++) {
+ checkCharacter(rcs, j);
+ seeks++;
+ }
+ }
+ // Note that printing out below breaks cruisecontrols drawing
+ // of the xml unit test results because it outputs disallowed
+ // xml characters.
+ logger.fine(rcs + " seeks count " + seeks + " in " +
+ ((new Date().getTime()) - timestamp) + " milliseconds.");
+ }
+
+ /**
+ * Check the character read.
+ *
+ * Throws assertion if not expected result.
+ *
+ * @param rcs ReplayCharSequence to read from.
+ * @param i Character offset.
+ */
+ private void checkCharacter(CharSequence rcs, int i) {
+ int c = rcs.charAt(i);
+ assertTrue("Character " + Integer.toString(c) + " at offset " + i +
+ " unexpected.", (c % SEQUENCE_LENGTH) == (i % SEQUENCE_LENGTH));
+ }
+
+ /**
+ * @param baseName
+ * @return RecordingOutputStream
+ * @throws IOException
+ */
+ private RecordingOutputStream writeTestStream(byte[] content,
+ int memReps, String baseName, long fileReps) throws IOException {
+ String backingFilename = FileUtils.maybeRelative(getTmpDir(),baseName).getAbsolutePath();
+ RecordingOutputStream ros = new RecordingOutputStream(
+ content.length * memReps,
+ backingFilename);
+ ros.open();
+ ros.markMessageBodyBegin();
+ for(long i = 0; i < (memReps+fileReps); i++) {
+ // fill buffer (repeat MULTIPLIER times) and
+ // overflow to disk (also MULTIPLIER times)
+ ros.write(content);
+ }
+ ros.close();
+ return ros;
+ }
+
+
+ /**
+ * Fill a buffer w/ regular progression of single-byte
+ * (and <= 127) characters.
+ * @param buffer Buffer to fill.
+ * @return The buffer we filled.
+ */
+ private byte [] fillBufferWithRegularContent(byte [] buffer) {
+ int index = 0;
+ for (int i = 0; i < buffer.length; i++) {
+ buffer[i] = (byte) (index & 0x00ff);
+ index++;
+ if (index >= SEQUENCE_LENGTH) {
+ // Reset the index.
+ index = 0;
+ }
+ }
+ return buffer;
+ }
+
+ public void testCheckParameters()
+ {
+ // TODO.
+ }
+}
diff --git a/src/test/java/org/archive/io/RepositionableInputStreamTest.java b/src/test/java/org/archive/io/RepositionableInputStreamTest.java
new file mode 100644
index 00000000..1c7cc74c
--- /dev/null
+++ b/src/test/java/org/archive/io/RepositionableInputStreamTest.java
@@ -0,0 +1,70 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.PrintWriter;
+
+import org.archive.util.TmpDirTestCase;
+
+public class RepositionableInputStreamTest extends TmpDirTestCase {
+ private File testFile;
+ private static final String LINE = "0123456789abcdefghijklmnopqrstuv";
+ protected void setUp() throws Exception {
+ super.setUp();
+ this.testFile = new File(getTmpDir(), this.getClass().getName());
+ PrintWriter pw = new PrintWriter(new FileOutputStream(testFile));
+ for (int i = 0; i < 100; i++) {
+ pw.print(LINE);
+ }
+ pw.close();
+ }
+ protected void tearDown() throws Exception {
+ super.tearDown();
+ }
+ public void testname() throws Exception {
+ // Make buffer awkward size so we run into buffers spanning issues.
+ RepositionableInputStream ris =
+ new RepositionableInputStream(new FileInputStream(this.testFile),
+ 57);
+ int c = ris.read();
+ assertEquals(1, ris.position());
+ ris.read();
+ ris.position(0);
+ assertEquals(0, ris.position());
+ int c1 = ris.read();
+ assertEquals(c, c1);
+ ris.position(0);
+ byte [] bytes = new byte[LINE.length()];
+ long offset = 0;
+ for (int i = 0; i < 10; i++) {
+ ris.read(bytes, 0, LINE.length());
+ assertEquals(LINE, new String(bytes));
+ offset += LINE.length();
+ assertEquals(offset, ris.position());
+ }
+ long p = ris.position();
+ ris.position(p - LINE.length());
+ assertEquals(p - LINE.length(), ris.position());
+ c = ris.read();
+ assertEquals(c, c1);
+ }
+}
diff --git a/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java b/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java
new file mode 100644
index 00000000..f0be6506
--- /dev/null
+++ b/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java
@@ -0,0 +1,122 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.arc;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.util.Arrays;
+
+import org.archive.io.WriterPool;
+import org.archive.io.WriterPoolMember;
+import org.archive.io.WriterPoolSettings;
+import org.archive.util.TmpDirTestCase;
+
+
+/**
+ * Test ARCWriterPool
+ */
+@SuppressWarnings("deprecation")
+public class ARCWriterPoolTest extends TmpDirTestCase {
+ private static final String PREFIX = "TEST";
+
+ public void testARCWriterPool()
+ throws Exception {
+ final int MAX_ACTIVE = 3;
+ final int MAX_WAIT_MILLISECONDS = 100;
+ cleanUpOldFiles(PREFIX);
+ WriterPool pool = new ARCWriterPool(getSettings(true),
+ MAX_ACTIVE, MAX_WAIT_MILLISECONDS);
+ WriterPoolMember [] writers = new WriterPoolMember[MAX_ACTIVE];
+ final String CONTENT = "Any old content";
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ baos.write(CONTENT.getBytes());
+ for (int i = 0; i < MAX_ACTIVE; i++) {
+ writers[i] = pool.borrowFile();
+ assertEquals("Number active", i + 1, pool.getNumActive());
+ ((ARCWriter)writers[i]).write("http://one.two.three", "no-type",
+ "0.0.0.0", 1234567890, CONTENT.length(), baos);
+ }
+
+ // Pool is maxed out. New behavior is that additional requests
+ // block as long as necessary -- so no longer testing for timeout/
+ // exception
+
+ for (int i = (MAX_ACTIVE - 1); i >= 0; i--) {
+ pool.returnFile(writers[i]);
+ assertEquals("Number active", i, pool.getNumActive());
+ assertEquals("Number idle", MAX_ACTIVE - pool.getNumActive(),
+ pool.getNumIdle());
+ }
+ pool.close();
+ }
+
+ public void testInvalidate() throws Exception {
+ final int MAX_ACTIVE = 3;
+ final int MAX_WAIT_MILLISECONDS = 100;
+ cleanUpOldFiles(PREFIX);
+ WriterPool pool = new ARCWriterPool(getSettings(true),
+ MAX_ACTIVE, MAX_WAIT_MILLISECONDS);
+ WriterPoolMember [] writers = new WriterPoolMember[MAX_ACTIVE];
+ final String CONTENT = "Any old content";
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ baos.write(CONTENT.getBytes());
+ for (int i = 0; i < MAX_ACTIVE; i++) {
+ writers[i] = pool.borrowFile();
+ assertEquals("Number active", i + 1, pool.getNumActive());
+ ((ARCWriter)writers[i]).write("http://one.two.three", "no-type",
+ "0.0.0.0", 1234567890, CONTENT.length(), baos);
+ }
+
+ WriterPoolMember writer2Invalidate = writers[pool.getNumActive() - 1];
+ writers[pool.getNumActive() - 1] = null;
+ pool.invalidateFile(writer2Invalidate);
+ for (int i = 0; i < (MAX_ACTIVE - 1); i++) {
+ if (writers[i] == null) {
+ continue;
+ }
+ pool.returnFile(writers[i]);
+ }
+
+ for (int i = 0; i < MAX_ACTIVE; i++) {
+ writers[i] = pool.borrowFile();
+ assertEquals("Number active", i + 1, pool.getNumActive());
+ ((ARCWriter)writers[i]).write("http://one.two.three", "no-type",
+ "0.0.0.0", 1234567890, CONTENT.length(), baos);
+ }
+ for (int i = (MAX_ACTIVE - 1); i >= 0; i--) {
+ pool.returnFile(writers[i]);
+ assertEquals("Number active", i, pool.getNumActive());
+ assertEquals("Number idle", MAX_ACTIVE - pool.getNumActive(),
+ pool.getNumIdle());
+ }
+ pool.close();
+ }
+
+ private WriterPoolSettings getSettings(final boolean isCompressed) {
+ File [] files = {getTmpDir()};
+ return new WriterPoolSettingsData(
+ PREFIX,
+ "${prefix}-${timestamp17}-${serialno}-${heritrix.hostname}",
+ ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE,
+ isCompressed,
+ Arrays.asList(files),
+ null);
+ }
+}
\ No newline at end of file
diff --git a/src/test/java/org/archive/io/arc/ARCWriterTest.java b/src/test/java/org/archive/io/arc/ARCWriterTest.java
new file mode 100644
index 00000000..f6e2bf6a
--- /dev/null
+++ b/src/test/java/org/archive/io/arc/ARCWriterTest.java
@@ -0,0 +1,699 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.arc;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.NullInputStream;
+import org.apache.commons.io.output.NullOutputStream;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.ArchiveRecordHeader;
+import org.archive.io.ReplayInputStream;
+import org.archive.io.WriterPoolMember;
+import org.archive.io.WriterPoolSettings;
+import org.archive.util.ArchiveUtils;
+import org.archive.util.TmpDirTestCase;
+
+import com.google.common.io.Closeables;
+
+
+/**
+ * Test ARCWriter class.
+ *
+ * This code exercises ARCWriter AND ARCReader. First it writes ARCs w/
+ * ARCWriter. Then it validates what was written w/ ARCReader.
+ *
+ * @author stack
+ */
+public class ARCWriterTest
+extends TmpDirTestCase implements ARCConstants {
+ /**
+ * Utility class for writing bad ARCs (with trailing junk)
+ */
+ public class CorruptibleARCWriter extends ARCWriter {
+ byte[] endJunk = null;
+
+ public CorruptibleARCWriter(AtomicInteger serial_no, WriterPoolSettings settings) {
+ super(serial_no, settings);
+ }
+
+ @Override
+ protected void postWriteRecordTasks() throws IOException {
+ if (endJunk != null) {
+ this.write(endJunk);
+ }
+ super.postWriteRecordTasks();
+ }
+
+ public void setEndJunk(byte[] b) throws IOException {
+ this.endJunk = b;
+ }
+ }
+
+ /**
+ * Suffix to use for ARC files made by JUNIT.
+ */
+ private static final String SUFFIX = "JUNIT";
+
+ private static final String SOME_URL = "http://www.archive.org/test/";
+
+
+ private static final AtomicInteger SERIAL_NO = new AtomicInteger();
+
+ /*
+ * @see TestCase#setUp()
+ */
+ protected void setUp() throws Exception {
+ super.setUp();
+ }
+
+ /*
+ * @see TestCase#tearDown()
+ */
+ protected void tearDown() throws Exception {
+ super.tearDown();
+ }
+
+ protected static String getContent() {
+ return getContent(null);
+ }
+
+ protected static String getContent(String indexStr) {
+ String page = (indexStr != null)? "Page #" + indexStr: "Some Page";
+ return "HTTP/1.1 200 OK\r\n" +
+ "Content-Type: text/html\r\n\r\n" +
+ "" + page +
+ "" +
+ "" + page +
+ "";
+ }
+
+ @SuppressWarnings("deprecation")
+ protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index)
+ throws IOException {
+ String indexStr = Integer.toString(index);
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ // Start the record with an arbitrary 14-digit date per RFC2540
+ String now = ArchiveUtils.get14DigitDate();
+ int recordLength = 0;
+ byte[] record = (getContent(indexStr)).getBytes();
+ recordLength += record.length;
+ baos.write(record);
+ // Add the newline between records back in
+ baos.write("\n".getBytes());
+ recordLength += 1;
+ arcWriter.write("http://www.one.net/id=" + indexStr, "text/html",
+ "0.1.2.3", Long.parseLong(now), recordLength, baos);
+ return recordLength;
+ }
+
+ private File writeRecords(String baseName, boolean compress,
+ long maxSize, int recordCount)
+ throws IOException {
+ cleanUpOldFiles(baseName);
+ File [] files = {getTmpDir()};
+ ARCWriter arcWriter =
+ new ARCWriter(
+ SERIAL_NO,
+ new WriterPoolSettingsData(
+ baseName,
+ "${prefix}-"+SUFFIX,
+ maxSize,
+ compress,
+ Arrays.asList(files),
+ null));
+ assertNotNull(arcWriter);
+ for (int i = 0; i < recordCount; i++) {
+ writeRandomHTTPRecord(arcWriter, i);
+ }
+ arcWriter.close();
+ assertTrue("Doesn't exist: " +
+ arcWriter.getFile().getAbsolutePath(),
+ arcWriter.getFile().exists());
+ return arcWriter.getFile();
+ }
+
+ private void validate(File arcFile, int recordCount)
+ throws FileNotFoundException, IOException {
+ ARCReader reader = ARCReaderFactory.get(arcFile);
+ assertNotNull(reader);
+ List metaDatas = null;
+ if (recordCount == -1) {
+ metaDatas = reader.validate();
+ } else {
+ metaDatas = reader.validate(recordCount);
+ }
+ reader.close();
+ // Now, run through each of the records doing absolute get going from
+ // the end to start. Reopen the arc so no context between this test
+ // and the previous.
+
+ for (int i = metaDatas.size() - 1; i >= 0; i--) {
+ reader = ARCReaderFactory.get(arcFile);
+ ARCRecordMetaData meta = (ARCRecordMetaData)metaDatas.get(i);
+ ArchiveRecord r = reader.get(meta.getOffset());
+ String mimeType = r.getHeader().getMimetype();
+ assertTrue("Record is bogus",
+ mimeType != null && mimeType.length() > 0);
+ reader.close();
+ }
+ assertEquals("Metadata count not as expected",recordCount, metaDatas.size());
+ for (Iterator i = metaDatas.iterator(); i.hasNext();) {
+ ARCRecordMetaData r = (ARCRecordMetaData)i.next();
+ assertTrue("Record is empty", r.getLength() > 0);
+ }
+ }
+
+ public void testCheckARCFileSize()
+ throws IOException {
+ runCheckARCFileSizeTest("checkARCFileSize", false);
+ }
+
+ public void testCheckARCFileSizeCompressed()
+ throws IOException {
+ runCheckARCFileSizeTest("checkARCFileSize", true);
+ }
+
+ public void testWriteRecord() throws IOException {
+ final int recordCount = 2;
+ File arcFile = writeRecords("writeRecord", false,
+ DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
+ validate(arcFile, recordCount + 1); // Header record.
+ }
+
+ public void testRandomAccess() throws IOException {
+ final int recordCount = 3;
+ File arcFile = writeRecords("writeRecord", true,
+ DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
+ ARCReader reader = ARCReaderFactory.get(arcFile);
+ // Get to second record. Get its offset for later use.
+ boolean readFirst = false;
+ String url = null;
+ long offset = -1;
+ long totalRecords = 0;
+ boolean readSecond = false;
+ for (final Iterator i = reader.iterator(); i.hasNext(); totalRecords++) {
+ ARCRecord ar = (ARCRecord)i.next();
+ if (!readFirst) {
+ readFirst = true;
+ continue;
+ }
+ if (!readSecond) {
+ url = ar.getMetaData().getUrl();
+ offset = ar.getMetaData().getOffset();
+ readSecond = true;
+ }
+ }
+ reader.close();
+
+ reader = ARCReaderFactory.get(arcFile, offset);
+ ArchiveRecord ar = reader.get();
+ assertEquals(ar.getHeader().getUrl(), url);
+ ar.close();
+ reader.close();
+
+ // Get reader again. See how iterator works with offset
+ reader = ARCReaderFactory.get(arcFile, offset);
+ int count = 0;
+ for (final Iterator i = reader.iterator(); i.hasNext(); i.next()) {
+ count++;
+ }
+ reader.close();
+ assertEquals(totalRecords - 1, count);
+ }
+
+ public void testWriteRecordCompressed() throws IOException {
+ final int recordCount = 2;
+ File arcFile = writeRecords("writeRecordCompressed", true,
+ DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
+ validate(arcFile, recordCount + 1 /*Header record*/);
+ }
+
+ public void testWriteGiantRecord() throws IOException {
+ PrintStream dummyStream = new PrintStream(new NullOutputStream());
+ ARCWriter arcWriter =
+ new ARCWriter(
+ SERIAL_NO,
+ dummyStream,
+ new File("dummy"),
+ new WriterPoolSettingsData(
+ "",
+ "",
+ -1,
+ false,
+ null,
+ null));
+ assertNotNull(arcWriter);
+
+ // Start the record with an arbitrary 14-digit date per RFC2540
+ long now = System.currentTimeMillis();
+ long recordLength = org.apache.commons.io.FileUtils.ONE_GB * 3;
+
+ arcWriter.write("dummy:uri", "application/octet-stream",
+ "0.1.2.3", now, recordLength, new NullInputStream(recordLength));
+ arcWriter.close();
+ }
+
+ private void runCheckARCFileSizeTest(String baseName, boolean compress)
+ throws FileNotFoundException, IOException {
+ File f = writeRecords(baseName, compress, 1024, 15);
+ validate(f, 15+1);
+ }
+
+ protected CorruptibleARCWriter createARCWriter(String name, boolean compress) {
+ File [] files = {getTmpDir()};
+ return new CorruptibleARCWriter(
+ SERIAL_NO,
+ new WriterPoolSettingsData(
+ name,
+ "${prefix}-"+SUFFIX,
+ DEFAULT_MAX_ARC_FILE_SIZE,
+ compress,
+ Arrays.asList(files),
+ null));
+ }
+
+ protected static ByteArrayInputStream getBais(String str)
+ throws IOException {
+ return new ByteArrayInputStream(str.getBytes());
+ }
+
+ /**
+ * Writes a record, suppressing normal length-checks (so that
+ * intentionally malformed records may be written).
+ */
+ protected static void writeRecord(ARCWriter writer, String url,
+ String type, int len, ByteArrayInputStream bais)
+ throws IOException {
+ writer.write(url, type, "192.168.1.1", (new Date()).getTime(), len,
+ bais, false);
+ }
+
+ protected int iterateRecords(ARCReader r)
+ throws IOException {
+ int count = 0;
+ for (Iterator i = r.iterator(); i.hasNext();) {
+ ARCRecord rec = (ARCRecord)i.next();
+ rec.close();
+ if (count != 0) {
+ assertTrue("Unexpected URL " + rec.getMetaData().getUrl(),
+ rec.getMetaData().getUrl().startsWith(SOME_URL));
+ }
+ count++;
+ }
+ return count;
+ }
+
+ protected CorruptibleARCWriter createArcWithOneRecord(String name,
+ boolean compressed)
+ throws IOException {
+ CorruptibleARCWriter writer = createARCWriter(name, compressed);
+ String content = getContent();
+ writeRecord(writer, SOME_URL, "text/html",
+ content.length(), getBais(content));
+ return writer;
+ }
+
+ public void testSpaceInURL() {
+ String eMessage = null;
+ try {
+ holeyUrl("testSpaceInURL", false, " ");
+ } catch (IOException e) {
+ eMessage = e.getMessage();
+ }
+ assertTrue("Didn't get expected exception: " + eMessage,
+ eMessage.startsWith("Metadata line doesn't match"));
+ }
+
+ public void testTabInURL() {
+ String eMessage = null;
+ try {
+ holeyUrl("testTabInURL", false, "\t");
+ } catch (IOException e) {
+ eMessage = e.getMessage();
+ }
+ assertTrue("Didn't get expected exception: " + eMessage,
+ eMessage.startsWith("Metadata line doesn't match"));
+ }
+
+ protected void holeyUrl(String name, boolean compress, String urlInsert)
+ throws IOException {
+ ARCWriter writer = null;
+ try {
+ writer = createArcWithOneRecord(name, compress);
+ // Add some bytes on the end to mess up the record.
+ String content = getContent();
+ writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html",
+ content.length(), getBais(content));
+ } finally {
+ Closeables.close(writer, true);
+ }
+ }
+
+// If uncompressed, length has to be right or parse will fail.
+//
+// public void testLengthTooShort() throws IOException {
+// lengthTooShort("testLengthTooShort-" + PREFIX, false);
+// }
+
+ public void testLengthTooShortCompressed() throws IOException {
+ lengthTooShort("testLengthTooShortCompressed", true, false);
+ }
+
+ public void testLengthTooShortCompressedStrict()
+ throws IOException {
+ String eMessage = null;
+ try {
+ lengthTooShort("testLengthTooShortCompressedStrict",
+ true, true);
+ } catch (RuntimeException e) {
+ eMessage = e.getMessage();
+ }
+ assertTrue("Didn't get expected exception: " + eMessage,
+ eMessage.startsWith("java.io.IOException: Record STARTING at"));
+ }
+
+ protected void lengthTooShort(String name, boolean compress, boolean strict)
+ throws IOException {
+ CorruptibleARCWriter writer = null;
+ try {
+ writer = createArcWithOneRecord(name, compress);
+ // Add some bytes on the end to mess up the record.
+ String content = getContent();
+ ByteArrayInputStream bais = getBais(content+"SOME TRAILING BYTES");
+ writeRecord(writer, SOME_URL, "text/html",
+ content.length(), bais);
+ writer.setEndJunk("SOME TRAILING BYTES".getBytes());
+ writeRecord(writer, SOME_URL, "text/html",
+ content.length(), getBais(content));
+ } finally {
+ Closeables.close(writer, true);
+ }
+
+ // Catch System.err into a byte stream.
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
+ PrintStream origErr = System.err;
+ ARCReader r = null;
+ try {
+ System.setErr(new PrintStream(os));
+
+ r = ARCReaderFactory.get(writer.getFile());
+ r.setStrict(strict);
+ int count = iterateRecords(r);
+ assertTrue("Count wrong " + count, count == 4);
+
+ // Make sure we get the warning string which complains about the
+ // trailing bytes.
+ String err = os.toString();
+ assertTrue("No message " + err, err.startsWith("WARNING") &&
+ (err.indexOf("Record STARTING at") > 0));
+ r.close();
+ } finally {
+ Closeables.close(r, true);
+ System.setErr(origErr);
+ }
+ }
+
+// If uncompressed, length has to be right or parse will fail.
+//
+// public void testLengthTooLong()
+// throws IOException {
+// lengthTooLong("testLengthTooLongCompressed-" + PREFIX,
+// false, false);
+// }
+
+ public void testLengthTooLongCompressed()
+ throws IOException {
+ lengthTooLong("testLengthTooLongCompressed",
+ true, false);
+ }
+
+ public void testLengthTooLongCompressedStrict() {
+ String eMessage = null;
+ try {
+ lengthTooLong("testLengthTooLongCompressed",
+ true, true);
+ } catch (IOException e) {
+ eMessage = e.getMessage();
+ }
+ assertTrue("Didn't get expected exception: " + eMessage,
+ eMessage.startsWith("Premature EOF before end-of-record"));
+ }
+
+ protected void lengthTooLong(String name, boolean compress,
+ boolean strict)
+ throws IOException {
+ ARCWriter writer = createArcWithOneRecord(name, compress);
+ // Add a record with a length that is too long.
+ String content = getContent();
+ writeRecord(writer, SOME_URL+"2", "text/html",
+ content.length() + 10, getBais(content));
+ writeRecord(writer, SOME_URL+"3", "text/html",
+ content.length(), getBais(content));
+ writer.close();
+
+ // Catch System.err.
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
+
+ PrintStream origErr = System.err;
+ ARCReader r = null;
+ try {
+ System.setErr(new PrintStream(os));
+
+ r = ARCReaderFactory.get(writer.getFile());
+ r.setStrict(strict);
+ int count = iterateRecords(r);
+ assertTrue("Count wrong " + count, count == 4);
+
+ // Make sure we get the warning string which complains about the
+ // trailing bytes.
+ String err = os.toString();
+ assertTrue("No message " + err,
+ err.startsWith("WARNING Premature EOF before end-of-record"));
+ } finally {
+ Closeables.close(r, true);
+ System.setErr(origErr);
+ }
+ }
+
+ public void testGapError() throws IOException {
+ ARCWriter writer = createArcWithOneRecord("testGapError", true);
+ String content = getContent();
+ // Make a 'weird' RIS that returns bad 'remaining' length
+ // awhen remaining should be 0
+ ReplayInputStream ris = new ReplayInputStream(content.getBytes(),
+ content.length(), null) {
+ public long remaining() {
+ return (super.remaining()==0) ? -1 : super.remaining();
+ }
+ };
+ String message = null;
+ try {
+ writer.write(SOME_URL, "text/html", "192.168.1.1",
+ (new Date()).getTime(), content.length(), ris);
+ } catch (IOException e) {
+ message = e.getMessage();
+ } finally {
+ IOUtils.closeQuietly(ris);
+ }
+ writer.close();
+ assertTrue("No gap when should be",
+ message != null &&
+ message.indexOf("Gap between expected and actual") >= 0);
+ }
+
+ /**
+ * Write an arc file for other tests to use.
+ * @param arcdir Directory to write to.
+ * @param compress True if file should be compressed.
+ * @return ARC written.
+ * @throws IOException
+ */
+ public static File createARCFile(File arcdir, boolean compress)
+ throws IOException {
+ File [] files = {arcdir};
+ ARCWriter writer = new ARCWriter(SERIAL_NO,
+ new WriterPoolSettingsData(
+ "",
+ "test",
+ DEFAULT_MAX_ARC_FILE_SIZE,
+ compress,
+ Arrays.asList(files),
+ null));
+ String content = getContent();
+ writeRecord(writer, SOME_URL, "text/html", content.length(),
+ getBais(content));
+ writer.close();
+ return writer.getFile();
+ }
+
+// public void testSpeed() throws IOException {
+// ARCWriter writer = createArcWithOneRecord("speed", true);
+// // Add a record with a length that is too long.
+// String content = getContent();
+// final int count = 100000;
+// logger.info("Starting speed write of " + count + " records.");
+// for (int i = 0; i < count; i++) {
+// writeRecord(writer, SOME_URL, "text/html", content.length(),
+// getBaos(content));
+// }
+// writer.close();
+// logger.info("Finished speed write test.");
+// }
+
+
+ public void testValidateMetaLine() throws Exception {
+ final String line = "http://www.aandw.net/images/walden2.png " +
+ "128.197.34.86 20060111174224 image/png 2160";
+ ARCWriter w = createARCWriter("testValidateMetaLine", true);
+ try {
+ w.validateMetaLine(line);
+ w.validateMetaLine(line + LINE_SEPARATOR);
+ w.validateMetaLine(line + "\\r\\n");
+ } finally {
+ w.close();
+ }
+ }
+
+ public void testArcRecordOffsetReads() throws Exception {
+ ARCReader r = getSingleRecordReader("testArcRecordInBufferStream");
+ ARCRecord ar = getSingleRecord(r);
+ // Now try getting some random set of bytes out of it
+ // at an odd offset (used to fail because we were
+ // doing bad math to find where in buffer to read).
+ final byte[] buffer = new byte[17];
+ final int maxRead = 4;
+ int totalRead = 0;
+ while (totalRead < maxRead) {
+ totalRead = totalRead
+ + ar.read(buffer, 13 + totalRead, maxRead - totalRead);
+ assertTrue(totalRead > 0);
+ }
+ r.close();
+ }
+
+ // available should always be >= 0; extra read()s should all give EOF
+ public void testArchiveRecordAvailableConsistent() throws Exception {
+ // first test reading byte-at-a-time via no-param read()
+ ARCReader r = getSingleRecordReader("testArchiveRecordAvailableConsistent");
+ ARCRecord record = getSingleRecord(r);
+ int c = record.read();
+ while(c>=0) {
+ c = record.read();
+ }
+ // consecutive reads after EOR should always give -1, still show zero available()
+ for (int i=0; i<5; i++) {
+ assertTrue("available negative:"+record.available(), record.available()>=0);
+ assertEquals(-1, record.read());
+ }
+ r.close();
+ }
+
+ // should always give -1 on repeated reads past EOR
+ public void testArchiveRecordEORConsistent() throws Exception {
+ ARCReader r = getSingleRecordReader("testArchiveRecordEORConsistent");
+ ARCRecord record = getSingleRecord(r);
+ this.readToEOS(record);
+ // consecutive reads after EOR should always give -1
+ for (int i=0; i<5; i++) {
+ assertEquals(-1, record.read(new byte[1]));
+ }
+ r.close();
+ }
+
+ // should not throw premature EOF when wrapped with BufferedInputStream
+ // [HER-1450] showed this was the case using Apache Tika
+ public void testArchiveRecordMarkSupport() throws Exception {
+ ARCReader r = getSingleRecordReader("testArchiveRecordMarkSupport");
+ ARCRecord record = getSingleRecord(r);
+ record.setStrict(true);
+ // ensure mark support
+ InputStream stream = new BufferedInputStream(record);
+ if (stream.markSupported()) {
+ for (int i=0; i<3; i++) {
+ this.readToEOS(stream);
+ stream.mark(stream.available());
+ stream.reset();
+ }
+ stream.close();
+ }
+ r.close();
+ }
+
+ /**
+ * Test a particular style of using the reader iterator. (Should
+ * possibly be on a reader-centric test class, but the best setup
+ * functionality is here.)
+ *
+ * @throws IOException
+ */
+ public void testReadIterator() throws IOException {
+ final int recordCount = 3;
+ File arcFile = writeRecords("writeRecord", true,
+ DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
+ ARCReader reader = ARCReaderFactory.get(arcFile);
+ Iterator it = reader.iterator();
+ while (it.hasNext()) {
+ ArchiveRecord next = it.next();
+ next.close();
+ }
+ reader.close();
+ }
+
+ protected void readToEOS(InputStream in) throws Exception {
+ byte [] buf = new byte[1024];
+ int read = 0;
+ while (read >= 0) {
+ read = in.read(buf);
+ // System.out.println("readToEOS read " + read + " bytes");
+ }
+ }
+
+ protected ARCReader getSingleRecordReader(String name) throws Exception {
+ // Get an ARC with one record.
+ WriterPoolMember w = createArcWithOneRecord(name, true);
+ w.close();
+ // Get reader on said ARC.
+ ARCReader r = ARCReaderFactory.get(w.getFile());
+ return r;
+ }
+
+ protected ARCRecord getSingleRecord(ARCReader r) {
+ final Iterator i = r.iterator();
+ // Skip first ARC meta record.
+ i.next();
+ i.hasNext();
+ // Now we're at first and only record in ARC.
+ return (ARCRecord) i.next();
+ }
+}
diff --git a/src/test/java/org/archive/io/warc/WARCWriterTest.java b/src/test/java/org/archive/io/warc/WARCWriterTest.java
new file mode 100644
index 00000000..35c68714
--- /dev/null
+++ b/src/test/java/org/archive/io/warc/WARCWriterTest.java
@@ -0,0 +1,512 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.warc;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.net.URI;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.archive.io.ArchiveRecord;
+import org.archive.io.ArchiveRecordHeader;
+import org.archive.io.UTF8Bytes;
+import org.archive.io.WriterPoolMember;
+import org.archive.uid.RecordIDGenerator;
+import org.archive.uid.UUIDGenerator;
+import org.archive.util.ArchiveUtils;
+import org.archive.util.TmpDirTestCase;
+import org.archive.util.anvl.ANVLRecord;
+
+/**
+ * Test Writer and Reader.
+ * @author stack
+ * @version $Date: 2006-08-29 19:35:48 -0700 (Tue, 29 Aug 2006) $ $Version$
+ */
+public class WARCWriterTest
+extends TmpDirTestCase implements WARCConstants {
+
+ private static final AtomicInteger SERIAL_NO = new AtomicInteger();
+
+ RecordIDGenerator generator = new UUIDGenerator();
+
+ /**
+ * Prefix to use for ARC files made by JUNIT.
+ */
+ private static final String SUFFIX = "JUNIT";
+
+ private static final String SOME_URL = "http://www.archive.org/test/";
+
+ @SuppressWarnings("unchecked")
+ public void testCheckHeaderLineValue() throws Exception {
+ WARCWriter writer = new WARCWriter(
+ SERIAL_NO,
+ new WARCWriterPoolSettingsData(
+ "","test",1,false,Collections.EMPTY_LIST,Collections.EMPTY_LIST,generator));
+ writer.checkHeaderValue("one");
+ IllegalArgumentException exception = null;
+ try {
+ writer.checkHeaderValue("with space");
+ } catch(IllegalArgumentException e) {
+ exception = e;
+ }
+ assertNotNull(exception);
+ exception = null;
+ try {
+ writer.checkHeaderValue("with\0x0000controlcharacter");
+ } catch(IllegalArgumentException e) {
+ exception = e;
+ }
+ writer.close();
+ assertNotNull(exception);
+ }
+
+ @SuppressWarnings("unchecked")
+ public void testMimetypes() throws IOException {
+ WARCWriter writer = new WARCWriter(SERIAL_NO,
+ new WARCWriterPoolSettingsData(
+ "m","testM",1,false,Collections.EMPTY_LIST,Collections.EMPTY_LIST,generator));
+ writer.checkHeaderLineMimetypeParameter("text/xml");
+ writer.checkHeaderLineMimetypeParameter("text/xml+rdf");
+ assertEquals(writer.checkHeaderLineMimetypeParameter(
+ "text/plain; charset=SHIFT-JIS"), "text/plain; charset=SHIFT-JIS");
+ assertEquals(writer.checkHeaderLineMimetypeParameter(
+ "multipart/mixed; \r\n boundary=\"simple boundary\""),
+ "multipart/mixed; boundary=\"simple boundary\"");
+ }
+
+ public void testWriteRecord() throws IOException {
+ File [] files = {getTmpDir()};
+
+ // Write uncompressed.
+ WARCWriter writer =
+ new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData(
+ this.getClass().getName(), "templateWR1", -1, false, Arrays.asList(files), null, generator));
+
+ writeFile(writer);
+ writer.close();
+
+ // Write compressed.
+ writer = new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData(
+ this.getClass().getName(), "templateWR2", -1, true, Arrays.asList(files), null, generator));
+
+ writeFile(writer);
+ writer.close();
+ }
+
+ private void writeFile(final WARCWriter writer)
+ throws IOException {
+ try {
+ writeWarcinfoRecord(writer);
+ writeBasicRecords(writer);
+ } finally {
+ writer.close();
+ writer.getFile().delete();
+ }
+ }
+
+ private void writeWarcinfoRecord(WARCWriter writer)
+ throws IOException {
+ WARCRecordInfo recordInfo = new WARCRecordInfo();
+ recordInfo.setType(WARCRecordType.warcinfo);
+ recordInfo.setUrl(null);
+ recordInfo.setCreate14DigitDate(ArchiveUtils.getLog14Date());
+ recordInfo.setMimetype(ANVLRecord.MIMETYPE);
+ recordInfo.setExtraHeaders(null);
+ recordInfo.setEnforceLength(true);
+
+ ANVLRecord meta = new ANVLRecord();
+ meta.addLabelValue("size", "1G");
+ meta.addLabelValue("operator", "igor");
+ byte [] bytes = meta.getUTF8Bytes();
+ recordInfo.setContentStream(new ByteArrayInputStream(bytes));
+ recordInfo.setContentLength((long) bytes.length);
+
+ final URI recordid = writer.generateRecordId(WARCWriter.TYPE, WARCRecordType.warcinfo.toString());
+ recordInfo.setRecordId(recordid);
+
+ writer.writeRecord(recordInfo);
+ }
+
+ protected void writeBasicRecords(final WARCWriter writer)
+ throws IOException {
+ WARCRecordInfo recordInfo = new WARCRecordInfo();
+ recordInfo.setType(WARCRecordType.metadata);
+ recordInfo.setUrl("http://www.archive.org/");
+ recordInfo.setCreate14DigitDate(ArchiveUtils.get14DigitDate());
+ recordInfo.setMimetype("no/type");
+ recordInfo.setEnforceLength(true);
+
+ ANVLRecord headerFields = new ANVLRecord();
+ headerFields.addLabelValue("x", "y");
+ headerFields.addLabelValue("a", "b");
+ recordInfo.setExtraHeaders(headerFields);
+
+ URI rid = (new UUIDGenerator()).getQualifiedRecordID(TYPE, WARCRecordType.metadata.toString());
+ recordInfo.setRecordId(rid);
+
+ final String content = "Any old content.";
+ for (int i = 0; i < 10; i++) {
+ String body = i + ". " + content;
+ byte [] bodyBytes = body.getBytes(UTF8Bytes.UTF8);
+ recordInfo.setContentStream(new ByteArrayInputStream(bodyBytes));
+ recordInfo.setContentLength((long)bodyBytes.length);
+ writer.writeRecord(recordInfo);
+ }
+ }
+
+ /**
+ * @return Generic HTML Content.
+ */
+ protected static String getContent() {
+ return getContent(null);
+ }
+
+ /**
+ * @return Generic HTML Content with mention of passed indexStr
+ * in title and body.
+ */
+ protected static String getContent(String indexStr) {
+ String page = (indexStr != null)? "Page #" + indexStr: "Some Page";
+ return "HTTP/1.1 200 OK\r\n" +
+ "Content-Type: text/html\r\n\r\n" +
+ "" + page +
+ "" +
+ "" + page +
+ "";
+ }
+
+ /**
+ * Write random HTML Record.
+ * @param w Where to write.
+ * @param index An index to put into content.
+ * @return Length of record written.
+ * @throws IOException
+ */
+ protected int writeRandomHTTPRecord(WARCWriter w, int index)
+ throws IOException {
+ WARCRecordInfo recordInfo = new WARCRecordInfo();
+ recordInfo.setType(WARCRecordType.resource);
+ recordInfo.setCreate14DigitDate(ArchiveUtils.get14DigitDate());
+ recordInfo.setMimetype("text/html; charset=UTF-8");
+ recordInfo.setRecordId(w.generateRecordId(null));
+ recordInfo.setEnforceLength(true);
+
+ String indexStr = Integer.toString(index);
+ recordInfo.setUrl("http://www.one.net/id=" + indexStr);
+
+ byte[] record = (getContent(indexStr)).getBytes();
+ recordInfo.setContentLength((long) record.length);
+
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ baos.write(record);
+ recordInfo.setContentStream(new ByteArrayInputStream(baos.toByteArray()));
+
+ // Add named fields for ip, checksum, and relate the metadata
+ // and request to the resource field.
+ recordInfo.addExtraHeader(NAMED_FIELD_IP_LABEL, "127.0.0.1");
+
+ w.writeRecord(recordInfo);
+ return record.length;
+ }
+
+ /**
+ * Fill a WARC with HTML Records.
+ * @param baseName WARC basename.
+ * @param compress Whether to compress or not.
+ * @param maxSize Maximum WARC size.
+ * @param recordCount How many records.
+ * @return The written file.
+ * @throws IOException
+ */
+ private File writeRecords(String baseName, boolean compress,
+ int maxSize, int recordCount)
+ throws IOException {
+ cleanUpOldFiles(baseName);
+ File [] files = {getTmpDir()};
+ WARCWriter w = new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData(
+ baseName + '-' + SUFFIX, "${prefix}", maxSize, compress, Arrays.asList(files), null, generator));
+
+ assertNotNull(w);
+ for (int i = 0; i < recordCount; i++) {
+ writeRandomHTTPRecord(w, i);
+ }
+ w.close();
+ assertTrue("Doesn't exist: " + w.getFile().getAbsolutePath(),
+ w.getFile().exists());
+ return w.getFile();
+ }
+
+ /**
+ * Run validation of passed file.
+ * @param f File to validate.
+ * @param recordCount Expected count of records.
+ * @throws FileNotFoundException
+ * @throws IOException
+ */
+ private void validate(File f, int recordCount)
+ throws FileNotFoundException, IOException {
+ WARCReader reader = WARCReaderFactory.get(f);
+ assertNotNull(reader);
+ List headers = null;
+ if (recordCount == -1) {
+ headers = reader.validate();
+ } else {
+ headers = reader.validate(recordCount);
+ }
+ reader.close();
+
+ // Now, run through each of the records doing absolute get going from
+ // the end to start. Reopen the arc so no context between this test
+ // and the previous.
+
+ for (int i = headers.size() - 1; i >= 0; i--) {
+ reader = WARCReaderFactory.get(f);
+ ArchiveRecordHeader h = (ArchiveRecordHeader)headers.get(i);
+ ArchiveRecord r = reader.get(h.getOffset());
+ String mimeType = r.getHeader().getMimetype();
+ assertTrue("Record is bogus",
+ mimeType != null && mimeType.length() > 0);
+ reader.close();
+ }
+
+ assertTrue("Metadatas not equal", headers.size() == recordCount);
+ for (Iterator i = headers.iterator(); i.hasNext();) {
+ ArchiveRecordHeader r = (ArchiveRecordHeader)i.next();
+ assertTrue("Record is empty", r.getLength() > 0);
+ }
+ }
+
+ public void testWriteRecords() throws IOException {
+ final int recordCount = 2;
+ File f = writeRecords("writeRecords", false, DEFAULT_MAX_WARC_FILE_SIZE,
+ recordCount);
+ validate(f, recordCount + 1); // Header record.
+ }
+
+ public void testRandomAccess() throws IOException {
+ final int recordCount = 3;
+ File f = writeRecords("randomAccess", true, DEFAULT_MAX_WARC_FILE_SIZE,
+ recordCount);
+ WARCReader reader = WARCReaderFactory.get(f);
+ // Get to second record. Get its offset for later use.
+ boolean readFirst = false;
+ String url = null;
+ long offset = -1;
+ long totalRecords = 0;
+ boolean readSecond = false;
+ for (final Iterator i = reader.iterator(); i.hasNext();
+ totalRecords++) {
+ WARCRecord ar = (WARCRecord)i.next();
+ if (!readFirst) {
+ readFirst = true;
+ continue;
+ }
+ if (!readSecond) {
+ url = ar.getHeader().getUrl();
+ offset = ar.getHeader().getOffset();
+ readSecond = true;
+ }
+ }
+ reader.close();
+
+ reader = WARCReaderFactory.get(f, offset);
+ ArchiveRecord ar = reader.get();
+ assertEquals(ar.getHeader().getUrl(), url);
+ ar.close();
+ reader.close();
+
+ // Get reader again. See how iterator works with offset
+ reader = WARCReaderFactory.get(f, offset);
+ int count = 0;
+ for (final Iterator i = reader.iterator(); i.hasNext(); i.next()) {
+ count++;
+ }
+ reader.close();
+ assertEquals(totalRecords - 1, count);
+ }
+
+ public void testWriteRecordCompressed() throws IOException {
+ final int recordCount = 2;
+ File arcFile = writeRecords("writeRecordCompressed", true,
+ DEFAULT_MAX_WARC_FILE_SIZE, recordCount);
+ validate(arcFile, recordCount + 1 /*Header record*/);
+ }
+
+ protected WARCWriter createWARCWriter(String name,
+ boolean compress) {
+ File [] files = {getTmpDir()};
+ return new WARCWriter(SERIAL_NO,
+ new WARCWriterPoolSettingsData(
+ name,
+ "${prefix}-"+SUFFIX,
+ DEFAULT_MAX_WARC_FILE_SIZE,
+ compress,
+ Arrays.asList(files),
+ null,
+ generator));
+ }
+
+ protected static ByteArrayOutputStream getBaos(String str)
+ throws IOException {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ baos.write(str.getBytes());
+ return baos;
+ }
+
+ protected static void writeRecord(WARCWriter w, String url,
+ String mimetype, int len, ByteArrayOutputStream baos)
+ throws IOException {
+ WARCRecordInfo recordInfo = new WARCRecordInfo();
+ recordInfo.setType(WARCRecordType.resource);
+ recordInfo.setUrl(url);
+ recordInfo.setCreate14DigitDate(ArchiveUtils.get14DigitDate());
+ recordInfo.setMimetype(mimetype);
+ recordInfo.setRecordId(w.generateRecordId(null));
+ recordInfo.setExtraHeaders(null);
+ recordInfo.setContentStream(new ByteArrayInputStream(baos.toByteArray()));
+ recordInfo.setContentLength((long) len);
+ recordInfo.setEnforceLength(true);
+
+ w.writeRecord(recordInfo);
+ }
+
+ protected int iterateRecords(WARCReader r)
+ throws IOException {
+ int count = 0;
+ for (Iterator i = r.iterator(); i.hasNext();) {
+ ArchiveRecord ar = i.next();
+ ar.close();
+ if (count != 0) {
+ assertTrue("Unexpected URL " + ar.getHeader().getUrl(),
+ ar.getHeader().getUrl().equals(SOME_URL));
+ }
+ count++;
+ }
+ return count;
+ }
+
+ protected WARCWriter createWithOneRecord(String name,
+ boolean compressed)
+ throws IOException {
+ WARCWriter writer = createWARCWriter(name, compressed);
+ String content = getContent();
+ writeRecord(writer, SOME_URL, "text/html",
+ content.length(), getBaos(content));
+ return writer;
+ }
+
+ public void testSpaceInURL() throws IOException {
+ long bytesWritten = holeyUrl("testSpaceInURL", false, " ");
+ assertEquals("Unexpected successful writing occurred",0,bytesWritten);
+ }
+
+ public void testTabInURL() throws IOException {
+ long bytesWritten = holeyUrl("testTabInURL", false, "\t");
+ assertEquals("Unexpected successful writing occurred",0,bytesWritten);
+ }
+
+ protected long holeyUrl(String name, boolean compress, String urlInsert)
+ throws IOException {
+ WARCWriter writer = createWithOneRecord(name, compress);
+ // Add some bytes on the end to mess up the record.
+ long startPos = writer.getPosition();
+ String content = getContent();
+ ByteArrayOutputStream baos = getBaos(content);
+ writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html",
+ content.length(), baos);
+ long endPos = writer.getPosition();
+ writer.close();
+ return endPos-startPos;
+ }
+
+ /**
+ * Write an arc file for other tests to use.
+ * @param arcdir Directory to write to.
+ * @param compress True if file should be compressed.
+ * @return ARC written.
+ * @throws IOException
+ */
+ public static File createWARCFile(File arcdir, boolean compress)
+ throws IOException {
+ File [] files = {arcdir};
+ WARCWriter writer =
+ new WARCWriter(SERIAL_NO,
+ new WARCWriterPoolSettingsData(
+ "",
+ "test",
+ DEFAULT_MAX_WARC_FILE_SIZE,
+ compress,
+ Arrays.asList(files),
+ null,
+ new UUIDGenerator()));
+ String content = getContent();
+ writeRecord(writer, SOME_URL, "text/html", content.length(),
+ getBaos(content));
+ writer.close();
+ return writer.getFile();
+ }
+
+// public void testSpeed() throws IOException {
+// ARCWriter writer = createArcWithOneRecord("speed", true);
+// // Add a record with a length that is too long.
+// String content = getContent();
+// final int count = 100000;
+// logger.info("Starting speed write of " + count + " records.");
+// for (int i = 0; i < count; i++) {
+// writeRecord(writer, SOME_URL, "text/html", content.length(),
+// getBaos(content));
+// }
+// writer.close();
+// logger.info("Finished speed write test.");
+// }
+
+ public void testArcRecordOffsetReads() throws Exception {
+ // Get an ARC with one record.
+ WriterPoolMember w =
+ createWithOneRecord("testArcRecordInBufferStream", true);
+ w.close();
+ // Get reader on said ARC.
+ WARCReader r = WARCReaderFactory.get(w.getFile());
+ final Iterator i = r.iterator();
+ // Skip first ARC meta record.
+ ArchiveRecord ar = i.next();
+ i.hasNext();
+ // Now we're at first and only record in ARC.
+ ar = (WARCRecord) i.next();
+ // Now try getting some random set of bytes out of it
+ // at an odd offset (used to fail because we were
+ // doing bad math to find where in buffer to read).
+ final byte[] buffer = new byte[17];
+ final int maxRead = 4;
+ int totalRead = 0;
+ while (totalRead < maxRead) {
+ totalRead = totalRead
+ + ar.read(buffer, 13 + totalRead, maxRead - totalRead);
+ assertTrue(totalRead > 0);
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/test/java/org/archive/uid/UUIDGeneratorTest.java b/src/test/java/org/archive/uid/UUIDGeneratorTest.java
new file mode 100644
index 00000000..79e98fb6
--- /dev/null
+++ b/src/test/java/org/archive/uid/UUIDGeneratorTest.java
@@ -0,0 +1,44 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.uid;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.HashMap;
+import java.util.Map;
+
+import junit.framework.TestCase;
+
+/**
+ * @author stack
+ * @version $Revision$ $Date$
+ */
+public class UUIDGeneratorTest extends TestCase {
+ public void testQualifyRecordID() throws URISyntaxException {
+ RecordIDGenerator g = new UUIDGenerator();
+ URI uri = g.getRecordID();
+ Map qualifiers = new HashMap();
+ qualifiers.put("a", "b");
+ URI nuURI = g.qualifyRecordID(uri, qualifiers);
+ assertNotSame(uri, nuURI);
+ qualifiers.put("c", "d");
+ nuURI = g.qualifyRecordID(nuURI, qualifiers);
+ assertNotSame(uri, nuURI);
+ }
+}
diff --git a/src/test/java/org/archive/util/FileUtilsTest.java b/src/test/java/org/archive/util/FileUtilsTest.java
new file mode 100644
index 00000000..19271435
--- /dev/null
+++ b/src/test/java/org/archive/util/FileUtilsTest.java
@@ -0,0 +1,271 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.util;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.math.LongRange;
+
+
+/**
+ * FileUtils tests.
+ *
+ * @contributor stack
+ * @contributor gojomo
+ * @version $Date$, $Revision$
+ */
+public class FileUtilsTest extends TmpDirTestCase {
+ private String srcDirName = FileUtilsTest.class.getName() + ".srcdir";
+ private File srcDirFile = null;
+ private String tgtDirName = FileUtilsTest.class.getName() + ".tgtdir";
+ private File tgtDirFile = null;
+
+ protected File zeroLengthLinesUnix;
+ protected File zeroLengthLinesWindows;
+
+ protected File smallLinesUnix;
+ protected File smallLinesWindows;
+ protected File largeLinesUnix;
+ protected File largeLinesWindows;
+ protected File nakedLastLineUnix;
+ protected File nakedLastLineWindows;
+
+
+ protected void setUp() throws Exception {
+ super.setUp();
+ this.srcDirFile = new File(getTmpDir(), srcDirName);
+ FileUtils.ensureWriteableDirectory(srcDirFile);
+ this.tgtDirFile = new File(getTmpDir(), tgtDirName);
+ FileUtils.ensureWriteableDirectory(tgtDirFile);
+ addFiles();
+
+ zeroLengthLinesUnix = setUpLinesFile("zeroLengthLinesUnix",0,0,400,IOUtils.LINE_SEPARATOR_UNIX);
+ zeroLengthLinesWindows = setUpLinesFile("zeroLengthLinesUnix",0,0,400,IOUtils.LINE_SEPARATOR_WINDOWS);
+
+ smallLinesUnix = setUpLinesFile("smallLinesUnix", 0, 25, 400, IOUtils.LINE_SEPARATOR_UNIX);
+ smallLinesWindows = setUpLinesFile("smallLinesWindows", 0, 25, 400, IOUtils.LINE_SEPARATOR_WINDOWS);
+ largeLinesUnix = setUpLinesFile("largeLinesUnix", 128, 256, 5, IOUtils.LINE_SEPARATOR_UNIX);
+ largeLinesWindows = setUpLinesFile("largeLinesWindows", 128, 256, 4096, IOUtils.LINE_SEPARATOR_WINDOWS);
+
+ nakedLastLineUnix = setUpLinesFile("nakedLastLineUnix", 0, 50, 401, IOUtils.LINE_SEPARATOR_UNIX);
+ org.apache.commons.io.FileUtils.writeStringToFile(nakedLastLineUnix,"a");
+ nakedLastLineWindows = setUpLinesFile("nakedLastLineWindows", 0, 50, 401, IOUtils.LINE_SEPARATOR_WINDOWS);
+ org.apache.commons.io.FileUtils.writeStringToFile(nakedLastLineWindows,"a");
+ }
+
+ private void addFiles() throws IOException {
+ addFiles(3, this.getName());
+ }
+
+ private void addFiles(final int howMany, final String baseName)
+ throws IOException {
+ for (int i = 0; i < howMany; i++) {
+ File.createTempFile(baseName, null, this.srcDirFile);
+ }
+ }
+
+ private File setUpLinesFile(String name, int minLineSize, int maxLineSize, int lineCount, String lineEnding) throws IOException {
+ List lines = new LinkedList();
+ StringBuilder sb = new StringBuilder(maxLineSize);
+ for(int i = 0; i< lineSize; j++) {
+ sb.append("-");
+ }
+ lines.add(sb.toString());
+ }
+ File file = File.createTempFile(name, null);
+ org.apache.commons.io.FileUtils.writeLines(file, lines, lineEnding);
+ return file;
+
+ }
+
+ protected void tearDown() throws Exception {
+ super.tearDown();
+ org.apache.commons.io.FileUtils.deleteQuietly(this.srcDirFile);
+ org.apache.commons.io.FileUtils.deleteQuietly(this.tgtDirFile);
+ org.apache.commons.io.FileUtils.deleteQuietly(zeroLengthLinesUnix);
+ org.apache.commons.io.FileUtils.deleteQuietly(zeroLengthLinesWindows);
+ org.apache.commons.io.FileUtils.deleteQuietly(smallLinesUnix);
+ org.apache.commons.io.FileUtils.deleteQuietly(smallLinesWindows);
+ org.apache.commons.io.FileUtils.deleteQuietly(largeLinesUnix);
+ org.apache.commons.io.FileUtils.deleteQuietly(largeLinesWindows);
+ org.apache.commons.io.FileUtils.deleteQuietly(nakedLastLineUnix);
+ org.apache.commons.io.FileUtils.deleteQuietly(nakedLastLineWindows);
+
+ }
+
+ public void testCopyFile() {
+ // Test exception copying nonexistent file.
+ File [] srcFiles = this.srcDirFile.listFiles();
+ srcFiles[0].delete();
+ IOException e = null;
+ try {
+ FileUtils.copyFile(srcFiles[0],
+ new File(this.tgtDirFile, srcFiles[0].getName()));
+ } catch (IOException ioe) {
+ e = ioe;
+ }
+ assertNotNull("Didn't get expected IOE", e);
+ }
+
+ public void testTailLinesZeroLengthUnix() throws IOException {
+ verifyTailLines(zeroLengthLinesUnix);
+ }
+
+ public void testTailLinesZeroLengthWindows() throws IOException {
+ verifyTailLines(zeroLengthLinesWindows);
+ }
+
+ public void testTailLinesSmallUnix() throws IOException {
+ verifyTailLines(smallLinesUnix);
+ }
+
+ public void testTailLinesLargeUnix() throws IOException {
+ verifyTailLines(largeLinesUnix);
+ }
+
+ public void testTailLinesSmallWindows() throws IOException {
+ verifyTailLines(smallLinesWindows);
+ }
+
+ public void testTailLinesLargeWindows() throws IOException {
+ verifyTailLines(largeLinesWindows);
+ }
+
+ public void testTailLinesNakedUnix() throws IOException {
+ verifyTailLines(nakedLastLineUnix);
+ }
+
+ public void testTailLinesNakedWindows() throws IOException {
+ verifyTailLines(nakedLastLineWindows);
+ }
+
+ @SuppressWarnings("unchecked")
+ private void verifyTailLines(File file) throws IOException {
+ List lines = org.apache.commons.io.FileUtils.readLines(file);
+ verifyTailLines(file, lines, 1, 80);
+ verifyTailLines(file, lines, 5, 80);
+ verifyTailLines(file, lines, 10, 80);
+ verifyTailLines(file, lines, 20, 80);
+ verifyTailLines(file, lines, 100, 80);
+ verifyTailLines(file, lines, 1, 1);
+ verifyTailLines(file, lines, 5, 1);
+ verifyTailLines(file, lines, 10, 1);
+ verifyTailLines(file, lines, 20, 1);
+ verifyTailLines(file, lines, 100, 1);
+ }
+
+
+ private void verifyTailLines(File file, List lines, int count, int estimate) throws IOException {
+ List testLines;
+ testLines = getTestTailLines(file,count,estimate);
+ assertEquals("line counts not equal:"+file.getName()+" "+count+" "+estimate,lines.size(),testLines.size());
+ assertEquals("lines not equal: "+file.getName()+" "+count+" "+estimate,lines,testLines);
+ }
+
+ private List getTestTailLines(File file, int count, int estimate) throws IOException {
+ long pos = -1;
+ List testLines = new LinkedList();
+ do {
+ List returnedLines = new LinkedList();
+ LongRange range = FileUtils.pagedLines(file,pos,-count,returnedLines,estimate);
+ Collections.reverse(returnedLines);
+ testLines.addAll(returnedLines);
+ pos = range.getMinimumLong()-1;
+ } while (pos>=0);
+ Collections.reverse(testLines);
+ return testLines;
+ }
+
+ public void testHeadLinesZeroLengthUnix() throws IOException {
+ verifyHeadLines(zeroLengthLinesUnix);
+ }
+
+ public void testHeadLinesZeroLengthWindows() throws IOException {
+ verifyHeadLines(zeroLengthLinesWindows);
+ }
+
+ public void testHeadLinesSmallUnix() throws IOException {
+ verifyHeadLines(smallLinesUnix);
+ }
+
+ public void testHeadLinesLargeUnix() throws IOException {
+ verifyHeadLines(largeLinesUnix);
+ }
+
+ public void testHeadLinesSmallWindows() throws IOException {
+ verifyHeadLines(smallLinesWindows);
+ }
+
+ public void testHeadLinesLargeWindows() throws IOException {
+ verifyHeadLines(largeLinesWindows);
+ }
+
+ public void testHeadLinesNakedUnix() throws IOException {
+ verifyHeadLines(nakedLastLineUnix);
+ }
+
+ public void testHeadLinesNakedWindows() throws IOException {
+ verifyHeadLines(nakedLastLineWindows);
+ }
+
+
+ @SuppressWarnings("unchecked")
+ private void verifyHeadLines(File file) throws IOException {
+ List lines = org.apache.commons.io.FileUtils.readLines(file);
+ verifyHeadLines(file, lines, 1, 80);
+ verifyHeadLines(file, lines, 5, 80);
+ verifyHeadLines(file, lines, 10, 80);
+ verifyHeadLines(file, lines, 20, 80);
+ verifyHeadLines(file, lines, 100, 80);
+ verifyHeadLines(file, lines, 1, 1);
+ verifyHeadLines(file, lines, 5, 1);
+ verifyHeadLines(file, lines, 10, 1);
+ verifyHeadLines(file, lines, 20, 1);
+ verifyHeadLines(file, lines, 100, 1);
+ }
+
+
+ private void verifyHeadLines(File file, List lines, int count, int estimate) throws IOException {
+ List testLines;
+ testLines = getTestHeadLines(file,count,estimate);
+ assertEquals("line counts not equal:"+file.getName()+" "+count+" "+estimate,lines.size(),testLines.size());
+ assertEquals("lines not equal: "+file.getName()+" "+count+" "+estimate,lines,testLines);
+ }
+
+ private List getTestHeadLines(File file, int count, int estimate) throws IOException {
+ long pos = 0;
+ List testLines = new LinkedList();
+ do {
+ LongRange range = FileUtils.pagedLines(file,pos,count,testLines,estimate);
+ pos = range.getMaximumLong();
+ } while (pos m = am.asMap();
+ logger.fine(m.toString());
+ }
+
+ public void testEmptyRecord() throws Exception {
+ byte [] b = ANVLRecord.EMPTY_ANVL_RECORD.getUTF8Bytes();
+ assertEquals(b.length, 2);
+ assertEquals(b[0], '\r');
+ assertEquals(b[1], '\n');
+ }
+
+ public void testFolding() throws Exception {
+ ANVLRecord am = new ANVLRecord();
+ Exception e = null;
+ try {
+ am.addLabel("Label with \n in it");
+ } catch (IllegalArgumentException iae) {
+ e = iae;
+ }
+ assertTrue(e != null && e instanceof IllegalArgumentException);
+ am.addLabelValue("label", "value with \n in it");
+ }
+
+ public void testParse() throws UnsupportedEncodingException, IOException {
+ String record = " a: b\r\n#c#\r\nc:d\r\n \t\t\r\t\n\te" +
+ "\r\nx:\r\n # z\r\n\r\n";
+ ANVLRecord r = ANVLRecord.load(new ByteArrayInputStream(
+ record.getBytes("ISO-8859-1")));
+ logger.fine(r.toString());
+ assertEquals(r.get(0).toString(), "a: b");
+ record = " a: b\r\n\r\nsdfsdsdfds";
+ r = ANVLRecord.load(new ByteArrayInputStream(
+ record.getBytes("ISO-8859-1")));
+ logger.fine(r.toString());
+ record = "x:\r\n # z\r\ny:\r\n\r\n";
+ r = ANVLRecord.load(new ByteArrayInputStream(
+ record.getBytes("ISO-8859-1")));
+ logger.fine(r.toString());
+ assertEquals(r.get(0).toString(), "x:");
+ }
+
+ public void testExampleParse()
+ throws UnsupportedEncodingException, IOException {
+ final String sample = "entry:\t\t\r\n# first ###draft\r\n" +
+ "who:\tGilbert, W.S. | Sullivan, Arthur\r\n" +
+ "what:\tThe Yeoman of\r\n" +
+ "\t\tthe Guard\r\n" +
+ "when/created:\t 1888\r\n\r\n";
+ ANVLRecord r = ANVLRecord.load(new ByteArrayInputStream(
+ sample.getBytes("ISO-8859-1")));
+ logger.fine(r.toString());
+ }
+
+ public void testPoundLabel()
+ throws UnsupportedEncodingException, IOException {
+ final String sample = "ent#ry:\t\t\r\n# first ###draft\r\n" +
+ "who:\tGilbert, W.S. | Sullivan, Arthur\r\n" +
+ "what:\tThe Yeoman of\r\n" +
+ "\t\tthe Guard\r\n" +
+ "when/created:\t 1888\r\n\r\n";
+ ANVLRecord r = ANVLRecord.load(sample);
+ logger.fine(r.toString());
+ }
+
+ public void testNewlineLabel()
+ throws UnsupportedEncodingException, IOException {
+ final String sample = "ent\nry:\t\t\r\n# first ###draft\r\n" +
+ "who:\tGilbert, W.S. | Sullivan, Arthur\r\n" +
+ "what:\tThe Yeoman of\r\n" +
+ "\t\tthe Guard\r\n" +
+ "when/created:\t 1888\r\n\r\n";
+ IllegalArgumentException iae = null;
+ try {
+ ANVLRecord.load(sample);
+ } catch(IllegalArgumentException e) {
+ iae = e;
+ }
+ assertTrue(iae != null);
+ }
+}
From b04f5d82604245461b6a802f1962d86e3d899e98 Mon Sep 17 00:00:00 2001
From: Lauren Ko
Date: Thu, 9 Mar 2017 11:32:03 -0600
Subject: [PATCH 052/240] Updating CHANGES.md
---
CHANGES.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/CHANGES.md b/CHANGES.md
index fee29e16..767881ec 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,6 @@
1.1.8
-----
+* [Move unit tests over from heritrix3 to webarchive-commons](https://github.com/iipc/webarchive-commons/issues/25)
* [Strip empty port via URLParser](https://github.com/iipc/webarchive-commons/pull/69/)
* [Use CharsetDetector to guess encoding of HTML documents](https://github.com/iipc/webarchive-commons/pull/68/)
* [Fix last header was lost if LF LF](https://github.com/iipc/webarchive-commons/pull/65/)
From b655796770eb967c931d656b1c80d4967f91e7fc Mon Sep 17 00:00:00 2001
From: Lauren Ko
Date: Tue, 21 Mar 2017 14:20:54 -0500
Subject: [PATCH 053/240] Updating change log.
---
CHANGES.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/CHANGES.md b/CHANGES.md
index 767881ec..ccdc1ce7 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,6 @@
1.1.8
-----
+* [Improve HTML link extraction](https://github.com/iipc/webarchive-commons/pull/72)
* [Move unit tests over from heritrix3 to webarchive-commons](https://github.com/iipc/webarchive-commons/issues/25)
* [Strip empty port via URLParser](https://github.com/iipc/webarchive-commons/pull/69/)
* [Use CharsetDetector to guess encoding of HTML documents](https://github.com/iipc/webarchive-commons/pull/68/)
From aee6ff55bfcaa5a9e15092f8c3b1e40ec9faaf87 Mon Sep 17 00:00:00 2001
From: John Erik Halse
Date: Tue, 2 May 2017 12:25:28 +0200
Subject: [PATCH 054/240] [maven-release-plugin] prepare release
webarchive-commons-1.1.8
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 24780063..63909b90 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.8-SNAPSHOT
+ 1.1.8jarwebarchive-commons
From dfe1f62e416f6a881fe15a2544449fff44dd1e51 Mon Sep 17 00:00:00 2001
From: John Erik Halse
Date: Tue, 2 May 2017 12:25:35 +0200
Subject: [PATCH 055/240] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 63909b90..23953c06 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.8
+ 1.1.9-SNAPSHOTjarwebarchive-commons
From cf34a3e13c09cfa4a1412492cfcf3503df698931 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Fri, 28 Apr 2017 22:41:56 +0200
Subject: [PATCH 056/240] Do not add value of preceding HTTP header field if
there is no value (or only white space)
---
.../archive/format/http/HttpHeaderParser.java | 4 ++--
.../format/http/HttpResponseParserTest.java | 24 +++++++++++++++++++
2 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/format/http/HttpHeaderParser.java b/src/main/java/org/archive/format/http/HttpHeaderParser.java
index d63ec405..bee3c28b 100755
--- a/src/main/java/org/archive/format/http/HttpHeaderParser.java
+++ b/src/main/java/org/archive/format/http/HttpHeaderParser.java
@@ -301,8 +301,9 @@ public ParseState handleByte(byte b, HttpHeaderParser parser) throws HttpParseEx
if(isLWSP(b)) {
return parser.postColonState;
}
+ // reset previous value also in case the header value is empty
+ parser.setValueStartIdx();
if(b == CR) {
- // TODO: THINK more...
parser.valuePreCRState = parser.postColonState;
return parser.valuePostCRState;
}
@@ -310,7 +311,6 @@ public ParseState handleByte(byte b, HttpHeaderParser parser) throws HttpParseEx
// TODO: this is lax, is LFLF an OK terminator?
return parser.lineStartState;
}
- parser.setValueStartIdx();
parser.addValueByte(b);
return parser.valueState;
}
diff --git a/src/test/java/org/archive/format/http/HttpResponseParserTest.java b/src/test/java/org/archive/format/http/HttpResponseParserTest.java
index c0d13230..ea076a69 100644
--- a/src/test/java/org/archive/format/http/HttpResponseParserTest.java
+++ b/src/test/java/org/archive/format/http/HttpResponseParserTest.java
@@ -57,4 +57,28 @@ public void testParseWithLf() throws IOException {
}
+ public void testParseEmptyHeaderField() throws IOException {
+
+ HttpResponseParser parser = new HttpResponseParser();
+ String message = "200 OK\r\nContent-Type: text/plain\r\nServer: \r\n\r\nHi there";
+ try {
+ HttpResponse response =
+ parser.parse(new ByteArrayInputStream(message.getBytes(IAUtils.UTF8)));
+ assertNotNull(response);
+ HttpHeaders headers = response.getHeaders();
+ assertNotNull(headers);
+ assertEquals(2, headers.size());
+ HttpHeader header = headers.get(1);
+ assertEquals("Server",header.getName());
+ System.err.println(header.getValue());
+ assertFalse("text/plain".equals(header.getValue()));
+ TestUtils.assertStreamEquals(response, "Hi there".getBytes(IAUtils.UTF8));
+
+ } catch (HttpParseException e) {
+ e.printStackTrace();
+ fail();
+ }
+
+ }
+
}
From bd08143577ea35cb48047a08b2bb67e806992cc2 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 29 Sep 2016 11:44:18 +0200
Subject: [PATCH 057/240] Extract also `property` attributes of HTML meta
elements, this fixes #67
---
.../java/org/archive/resource/html/ExtractingParseObserver.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index 826851e0..52989455 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -406,7 +406,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
private static class MetaTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
- ArrayList l = getAttrList(node,"name","rel","content","http-equiv");
+ ArrayList l = getAttrList(node,"name","rel","content","http-equiv","property");
if(l != null) {
data.addMeta(l);
}
From 4077670acca3f0d2958d926692cdb3a6b29428ca Mon Sep 17 00:00:00 2001
From: Lauren Ko
Date: Tue, 2 May 2017 15:15:06 -0500
Subject: [PATCH 058/240] Fix HTTP-Response-Metadata for wget WARCs. Changes
came from
https://github.com/commoncrawl/ia-web-commons/commit/58e85a60d75707da55ed499f836e57d49347484a
---
.../org/archive/extract/ExtractingResourceFactoryMapper.java | 5 ++++-
src/main/java/org/archive/format/warc/WARCConstants.java | 4 +++-
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java
index ad10be40..0afe16fb 100644
--- a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java
+++ b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java
@@ -153,7 +153,10 @@ private boolean isWARCInfoResource(MetaData envelope) {
private boolean isHTTPResponseWARCResource(MetaData envelope) {
return childFieldEquals(envelope,WARC_HEADER_METADATA,
WARCConstants.CONTENT_TYPE,
- WARCConstants.HTTP_RESPONSE_MIMETYPE);
+ WARCConstants.HTTP_RESPONSE_MIMETYPE)
+ || childFieldEquals(envelope,WARC_HEADER_METADATA,
+ WARCConstants.CONTENT_TYPE,
+ WARCConstants.HTTP_RESPONSE_MIMETYPE_NS);
}
private boolean isWARCJSONResource(MetaData envelope) {
return childFieldEquals(envelope,WARC_HEADER_METADATA,
diff --git a/src/main/java/org/archive/format/warc/WARCConstants.java b/src/main/java/org/archive/format/warc/WARCConstants.java
index 93a81f96..504dc380 100644
--- a/src/main/java/org/archive/format/warc/WARCConstants.java
+++ b/src/main/java/org/archive/format/warc/WARCConstants.java
@@ -209,7 +209,9 @@ enum WARCRecordType {
"application/http; msgtype=request";
public static final String HTTP_RESPONSE_MIMETYPE =
"application/http; msgtype=response";
-
+ public static final String HTTP_RESPONSE_MIMETYPE_NS =
+ "application/http;msgtype=response"; // wget does this
+
public static final String FTP_CONTROL_CONVERSATION_MIMETYPE =
"text/x-ftp-control-conversation";
From 3bba7e489b7d946eea83344e2150faebe0b35ed2 Mon Sep 17 00:00:00 2001
From: Lauren Ko
Date: Tue, 2 May 2017 15:41:23 -0500
Subject: [PATCH 059/240] Update with fixes for 1.1.9
---
CHANGES.md | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/CHANGES.md b/CHANGES.md
index ccdc1ce7..1ba5c1de 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,9 @@
+1.1.9
+-----
+* [Extract `property` attributes of HTML meta elements](https://github.com/iipc/webarchive-commons/pull/75)
+* [Do not add value of preceding HTTP header field if there is no value](https://github.com/iipc/webarchive-commons/pull/74)
+* [Fix WAT records corresponding to response records of Wget generated WARCs](https://github.com/iipc/webarchive-commons/pull/74)
+
1.1.8
-----
* [Improve HTML link extraction](https://github.com/iipc/webarchive-commons/pull/72)
From 4101f7e39cbdcc508a936faf8b519e68258b9639 Mon Sep 17 00:00:00 2001
From: Naomi Dushay
Date: Tue, 8 Aug 2017 16:08:43 -0700
Subject: [PATCH 060/240] use commons-collections v3.2.2 to avoid v3.2.1
vulnerability
---
pom.xml | 29 +++++++++++++++++++++--------
1 file changed, 21 insertions(+), 8 deletions(-)
diff --git a/pom.xml b/pom.xml
index 23953c06..8373cdad 100644
--- a/pom.xml
+++ b/pom.xml
@@ -72,7 +72,7 @@
guava17.0
-
+
org.jsonjson
@@ -89,12 +89,12 @@
juniversalchardet1.0.3
-
+
commons-httpclientcommons-httpclient3.1
-
+
org.apache.hadoop
@@ -128,12 +128,12 @@
tomcatjasper-compiler
-
+
hsqldbhsqldb
-
-
+
+
@@ -160,7 +160,7 @@
libidn1.15
-
+ it.unimi.dsidsiutils2.0.12
@@ -170,13 +170,26 @@
ch.qos.logbacklogback-classic
+
+
+ commons-collections
+ commons-collections
+
+
+
+
+ commons-collections
+ commons-collections
+ 3.2.2
+
+
org.apache.httpcomponentshttpcore4.3
-
+ joda-timejoda-time
From 988bec707c27a01333becfc3bd502af4441ea1e1 Mon Sep 17 00:00:00 2001
From: Lauren Ko
Date: Wed, 9 Aug 2017 10:57:28 -0500
Subject: [PATCH 061/240] Update CHANGES.md for PR 77
---
CHANGES.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/CHANGES.md b/CHANGES.md
index 1ba5c1de..dcb598d9 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,6 @@
1.1.9
-----
+* [Use commons-collections v3.2.2 to avoid v3.2.1 vulnerability](https://github.com/iipc/webarchive-commons/pull/77)
* [Extract `property` attributes of HTML meta elements](https://github.com/iipc/webarchive-commons/pull/75)
* [Do not add value of preceding HTTP header field if there is no value](https://github.com/iipc/webarchive-commons/pull/74)
* [Fix WAT records corresponding to response records of Wget generated WARCs](https://github.com/iipc/webarchive-commons/pull/74)
From 2e8cdea3d245c11e1ea3a2a6153c0038479aef12 Mon Sep 17 00:00:00 2001
From: nruest
Date: Tue, 7 May 2019 13:23:28 -0400
Subject: [PATCH 062/240] [maven-release-plugin] prepare release
webarchive-commons-1.1.9
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 8373cdad..833f42c3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.9-SNAPSHOT
+ 1.1.9jarwebarchive-commons
From da029db2ba89205b93a5291ed18b9c69155271bb Mon Sep 17 00:00:00 2001
From: nruest
Date: Tue, 7 May 2019 13:23:34 -0400
Subject: [PATCH 063/240] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 833f42c3..1cbeb99a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.9
+ 1.1.10-SNAPSHOTjarwebarchive-commons
From 723b18a35f8be786cb073282b5ea88b5d8c643ce Mon Sep 17 00:00:00 2001
From: nruest
Date: Tue, 7 May 2019 13:56:18 -0400
Subject: [PATCH 064/240] Update TravisCI config; resolves #82.
- Test Oracle Java 8
- Test OpenJDK Java 8
- Use trusty
- Require sudo for OpenJDK7
- Remove Oracle Java 7 (it's gone!)
- Remove mvn site from the build process since there is no javadoc site
(at least that I can tell)
---
.travis.yml | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index 0dfd3f7f..54daf83b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,13 @@
+dist: trusty
language: java
+# sudo required for OpenJDK7 support per:
+# https://github.com/travis-ci/travis-ci/issues/7884#issuecomment-309689557
+sudo: required
jdk:
- - oraclejdk7
+ - openjdk7
+ - oraclejdk8
+ - openjdk8
before_install:
- "git clone https://github.com/iipc/travis.git target/travis"
@@ -11,8 +17,8 @@ before_script:
- "export MAVEN_OPTS=-Xmx512m"
- "ulimit -u 2048"
-script:
- - "target/travis/deploy-if.sh"
+script:
+ - mvn install -B -V
# whitelist in the master branch only
branches:
@@ -23,4 +29,3 @@ env:
global:
- secure: "qDKjVdoe4Qcz4WfXiQydU7tyl51T62FUJrjqu4FUPBcgeQhFQiggwhpaE6xCOzOpxbsuBi2R1c8gMQf5esE5iDL5jZMu+kz++dYbuzMTd13ttvZWMW5wRPH0H8iHk609FP/RDtVKKBr7WO0JvvIAZEhWNHZrLXBrrKgdTey171g="
- secure: "FXGBKJNP9X7ePJfS4eYTZtoFo4RT1sxor34XxncSJr7uV6ggtZb4B4WNd16IlLcDk6E32sx8YoWdltaOGwQ5Vg/kux5Ko/wKZCoccS018Ln1bRT86dD1KoPY34rGoNJVQxe7J/1MPqpBKwmi2XCKfzpsEh3W7bbIqg8w9MEOOZA="
-
From 79aed910b44510294367a4acf4f3e6376b1c62c0 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 23 Aug 2017 17:04:52 +0200
Subject: [PATCH 065/240] ExtractingParseObserver: get links from onClick
attributes - extract links from JavaScript code snippets in onClick
attributes of INPUT and DIV elements
---
.../html/ExtractingParseObserver.java | 40 +++++++++++++++++-
.../html/ExtractingParseObserverTest.java | 10 +++++
.../resource/html/link-extraction-test.warc | 42 +++++++++++++++++++
3 files changed, 91 insertions(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index 52989455..e4fa83c7 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -39,6 +39,15 @@ public class ExtractingParseObserver implements ParseObserver {
protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString);
+ protected static String jsOnClickUrl1PatString =
+ "(?i)^(?:javascript:)?(?:(?:window|top|document|self|parent)\\.)?location(?:\\.href)?\\s*=\\s*('|')([^'\"]{3,256})\\1$";
+ protected static String jsOnClickUrl2PatString =
+ "(?i)^(?:javascript:)?(?:window|parent)\\.open\\((['\"]|')([^\"']{3,256}?)\\1[,)]";
+ protected static Pattern[] jsOnClickUrlPatterns = {
+ Pattern.compile(jsOnClickUrl1PatString),
+ Pattern.compile(jsOnClickUrl2PatString)
+ };
+
private final static int MAX_TEXT_LEN = 100;
private static final String PATH = "path";
@@ -51,6 +60,7 @@ public class ExtractingParseObserver implements ParseObserver {
extractors.put("APPLET", new AppletTagExtractor());
extractors.put("AREA", new AreaTagExtractor());
extractors.put("BASE", new BaseTagExtractor());
+ extractors.put("DIV", new DivTagExtractor());
extractors.put("EMBED", new EmbedTagExtractor());
extractors.put("FORM", new FormTagExtractor());
extractors.put("FRAME", new FrameTagExtractor());
@@ -268,7 +278,20 @@ private static void addHrefWithAttrs(HTMLMetaData data, TagNode node,
if(l != null) {
data.addHref(l);
}
- }
+ }
+
+ private static void addHrefsOnclick(HTMLMetaData data, TagNode node) {
+ String onclick = node.getAttribute("onclick");
+ if (onclick != null) {
+ String path = makePath(node.getTagName(), "onclick");
+ for (Pattern pattern : jsOnClickUrlPatterns) {
+ String url = patternJSExtract(pattern, onclick);
+ if (url != null) {
+ data.addHref(PATH, path, "url", url);
+ }
+ }
+ }
+ }
private interface TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs);
@@ -330,6 +353,12 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
+ private static class DivTagExtractor implements TagExtractor {
+ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+ addHrefsOnclick(data,node);
+ }
+ }
+
private static class EmbedTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src");
@@ -386,6 +415,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
private static class InputTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src","formaction");
+ addHrefsOnclick(data,node);
}
}
@@ -450,4 +480,12 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten
}
}
}
+
+ private static String patternJSExtract(Pattern pattern, String content) {
+ Matcher m = pattern.matcher(content);
+ if (m.find()) {
+ return m.group(2);
+ }
+ return null;
+ }
}
diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
index 8f690a06..4828ad64 100644
--- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
+++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -263,6 +263,16 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
{"http://www.your-domain.com/your-page.html", "DIV@/data-href"}
};
checkLinks(extractor.getNext(), fbSocialLinks);
+ String[][] onClickLinks = {
+ {"webpage.html", "DIV@/onclick"},
+ {"index.html", "INPUT@/onclick"},
+ {"http://www.x.com/", "INPUT@/onclick"},
+ {"button-child.php", "INPUT@/onclick"},
+ {"http://example.com/", "INPUT@/onclick"},
+ {"http://example.com/location/href/1.html", "INPUT@/onclick"},
+ {"http://example.com/location/href/2.html", "INPUT@/onclick"}
+ };
+ checkLinks(extractor.getNext(), onClickLinks);
}
}
diff --git a/src/test/resources/org/archive/resource/html/link-extraction-test.warc b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
index ab0e54c8..1a30598e 100644
--- a/src/test/resources/org/archive/resource/html/link-extraction-test.warc
+++ b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
@@ -318,3 +318,45 @@ Content-Type: text/html
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2017-08-23T13:54:59Z
+Content-Type: application/http;msgtype=response
+Content-Length: 1279
+
+HTTP/1.1 200 OK
+Date: Wed, 23 Aug 2017 13:54:59 GMT
+Server: Apache/2.4.18 (Ubuntu)
+Last-Modified: Wed, 23 Aug 2017 13:54:03 GMT
+ETag: "3ca-5576c0b718ab3"
+Accept-Ranges: bytes
+Content-Length: 971
+Vary: Accept-Encoding
+Keep-Alive: timeout=5, max=100
+Connection: Keep-Alive
+Content-Type: text/html
+
+
+
+Test Extraction of URLs from INPUT onClick Attributes
+
+
+
+
+
Click to load webpage
+
+
+
+
+
+
+
+
From 26b1e7af27abec102ab36faf6a786dfedf9436fd Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 23 Aug 2017 14:48:05 +0200
Subject: [PATCH 066/240] ExtractingParseObserver: extract rel, hreflang and
type attributes - add "rel" attribute to A and AREA links - add attributes
"hreflang" and "type" (MIME type) to A@/href links
---
.../html/ExtractingParseObserver.java | 19 +++++++++++++++++--
1 file changed, 17 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index 52989455..a487fd34 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -284,7 +284,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
l.add(makePath("A","href"));
l.add("url");
l.add(url);
- for(String a : new String[] {"target","alt","title"}) {
+ for(String a : new String[] {"target","alt","title","rel","hreflang","type"}) {
String v = node.getAttribute(a);
if(v != null) {
l.add(a);
@@ -311,7 +311,22 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
private static class AreaTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
- addBasicHrefs(data,node,"href");
+ String url = node.getAttribute("href");
+ if(url != null) {
+ ArrayList l = new ArrayList();
+ l.add(PATH);
+ l.add(makePath("AREA","href"));
+ l.add("url");
+ l.add(url);
+ for(String a : new String[] {"rel"}) {
+ String v = node.getAttribute(a);
+ if(v != null) {
+ l.add(a);
+ l.add(v);
+ }
+ }
+ data.addHref(l);
+ }
}
}
From a2cc42cac2777d06ab40e09811cdc883773775b9 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 11 Jun 2020 14:24:03 +0200
Subject: [PATCH 067/240] WAT extractor: do not fail on missing WARC-Filename
in warcinfo record, fixes #88 - do not throw IOException if there is no
WARC-Filename in warcinfo record - write metadata record (corresponding to
warcinfo) without WARC-Target-URI
---
src/main/java/org/archive/extract/WATExtractorOutput.java | 2 +-
src/main/java/org/archive/format/warc/WARCRecordWriter.java | 5 ++++-
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java
index 3bcfa924..4b5f72ed 100644
--- a/src/main/java/org/archive/extract/WATExtractorOutput.java
+++ b/src/main/java/org/archive/extract/WATExtractorOutput.java
@@ -151,7 +151,7 @@ private void writeWARC(OutputStream recOut, MetaData md) throws IOException {
String warcType = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Type");
String targetURI;
if(warcType.equals("warcinfo")) {
- targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Filename");
+ targetURI = JSONUtils.extractSingle(md, "Envelope.WARC-Header-Metadata.WARC-Filename");
} else {
targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI");
}
diff --git a/src/main/java/org/archive/format/warc/WARCRecordWriter.java b/src/main/java/org/archive/format/warc/WARCRecordWriter.java
index 0aab83b7..3278b289 100644
--- a/src/main/java/org/archive/format/warc/WARCRecordWriter.java
+++ b/src/main/java/org/archive/format/warc/WARCRecordWriter.java
@@ -88,7 +88,10 @@ public void writeJSONMetadataRecord( OutputStream out,
{
HttpHeaders headers = new HttpHeaders();
headers.add(HEADER_KEY_TYPE, WARCRecordType.metadata.name());
- headers.add(HEADER_KEY_URI, targetURI);
+ if (targetURI != null) {
+ // WARC-Target-URI is optional in metadata records
+ headers.add(HEADER_KEY_URI, targetURI);
+ }
headers.add(HEADER_KEY_DATE, DateUtils.getLog14Date(originalDate));
headers.add(HEADER_KEY_ID, makeRecordId());
headers.add(HEADER_KEY_REFERS_TO, origRecordId);
From 04e10397b9137a36812c17276826bc60d1a37ede Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Mon, 15 Jun 2020 13:29:25 +0200
Subject: [PATCH 068/240] Update change log to include #85, #86 and #89
---
CHANGES.md | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/CHANGES.md b/CHANGES.md
index dcb598d9..bf985ada 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,10 @@
+1.1.10
+------
+* [WAT extractor: do not fail on missing WARC-Filename in warcinfo record](https://github.com/iipc/webarchive-commons/pull/89)
+* [ExtractingParseObserver: extract rel, hreflang and type attributes](https://github.com/iipc/webarchive-commons/pull/86)
+* [ExtractingParseObserver: extract links from onClick attributes](https://github.com/iipc/webarchive-commons/pull/85)
+* [Update TravisCI config](https://github.com/iipc/webarchive-commons/pull/83)
+
1.1.9
-----
* [Use commons-collections v3.2.2 to avoid v3.2.1 vulnerability](https://github.com/iipc/webarchive-commons/pull/77)
From 9041ff4e96f6554658742affe490223dc0241d06 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 13 Oct 2020 01:28:48 +0000
Subject: [PATCH 069/240] Bump junit from 3.8.1 to 4.13.1
Bumps [junit](https://github.com/junit-team/junit4) from 3.8.1 to 4.13.1.
- [Release notes](https://github.com/junit-team/junit4/releases)
- [Changelog](https://github.com/junit-team/junit4/blob/main/doc/ReleaseNotes4.13.1.md)
- [Commits](https://github.com/junit-team/junit4/commits/r4.13.1)
Signed-off-by: dependabot[bot]
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 1cbeb99a..5ca7e1a3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -64,7 +64,7 @@
junitjunit
- 3.8.1
+ 4.13.1
From c2530d77b73838c31f4e83f2be941ec61032ebb2 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 16 Mar 2021 11:58:11 +0100
Subject: [PATCH 070/240] Fix InterruptibleCharSequenceTest
(testInterruptibility) to run on JDK 11 - if thread running the regexp
matching is already finished after the initial/current sleeping time, rerun
the test again with a shorter sleeping time until the expected
RuntimeException is hit
---
.../util/InterruptibleCharSequenceTest.java | 26 +++++++++++++------
1 file changed, 18 insertions(+), 8 deletions(-)
diff --git a/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java b/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java
index a3a5f180..8b5c5d1b 100644
--- a/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java
+++ b/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java
@@ -107,14 +107,24 @@ public void testNoninterruptible() throws InterruptedException {
}
public void testInterruptibility() throws InterruptedException {
- BlockingQueue
-
-
- cloudera
- Cloudera Hadoop
- https://repository.cloudera.com/artifactory/cloudera-repos/
- default
-
-
- true
- daily
- warn
-
-
- true
- daily
- warn
-
-
-
-
)
+ * |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more-->)
* +---+---+---+---+---+---+---+---+---+---+
*/
public class GZIPStaticHeader implements GZIPConstants {
diff --git a/src/main/java/org/archive/io/ReplayCharSequence.java b/src/main/java/org/archive/io/ReplayCharSequence.java
index aa9b9587..e456e293 100644
--- a/src/main/java/org/archive/io/ReplayCharSequence.java
+++ b/src/main/java/org/archive/io/ReplayCharSequence.java
@@ -59,7 +59,7 @@ public interface ReplayCharSequence extends CharSequence, Closeable {
public long getDecodeExceptionCount();
/**
- * Return the first coding-exception encountered, if the count > 0.
+ * Return the first coding-exception encountered, if the count > 0.
* @return CharacterCodingException
*/
public CharacterCodingException getCodingException();
diff --git a/src/main/java/org/archive/io/arc/ARCWriter.java b/src/main/java/org/archive/io/arc/ARCWriter.java
index 0bd0ef9b..c7042943 100644
--- a/src/main/java/org/archive/io/arc/ARCWriter.java
+++ b/src/main/java/org/archive/io/arc/ARCWriter.java
@@ -86,7 +86,7 @@
* write our own GZIP*Streams, ones that resettable and consious of gzip
* members.
*
- *
This class will write until we hit >= maxSize. The check is done at
+ *
This class will write until we hit >= maxSize. The check is done at
* record boundary. Records do not span ARC files. We will then close current
* file and open another and then continue writing.
*
@@ -95,9 +95,9 @@
* alexa
* ARC c-tools:
*
* Examine the produced cdx file to make sure it makes sense. Search
* for 'no-type 0'. If found, then we're opening a gzip record w/o data to
diff --git a/src/main/java/org/archive/util/DateUtils.java b/src/main/java/org/archive/util/DateUtils.java
index 0be20e63..7d6a7c98 100755
--- a/src/main/java/org/archive/util/DateUtils.java
+++ b/src/main/java/org/archive/util/DateUtils.java
@@ -557,7 +557,7 @@ private static String doubleToString(double val, int maxFractionDigits, int minF
* Takes a byte size and formats it for display with 'friendly' units.
*
* This involves converting it to the largest unit
- * (of B, KiB, MiB, GiB, TiB) for which the amount will be > 1.
+ * (of B, KiB, MiB, GiB, TiB) for which the amount will be > 1.
*
* Additionally, at least 2 significant digits are always displayed.
*
From 0d881e967daf2a023006032dd0d015b714821b11 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 15 Oct 2024 17:42:23 +0900
Subject: [PATCH 092/240] [maven-release-plugin] prepare release
webarchive-commons-1.1.10
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index f0c6ac73..2dd9223b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.10-SNAPSHOT
+ 1.1.10jarwebarchive-commons
From 76d95ccd75ddc31c5b8c3e9136f9e422ab528898 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 15 Oct 2024 17:42:28 +0900
Subject: [PATCH 093/240] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 2dd9223b..dc3088f0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.10
+ 1.1.11-SNAPSHOTjarwebarchive-commons
From 835f4e115b2cd288bed3f703136a7325c81fa751 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Sat, 9 Nov 2024 20:27:47 +0100
Subject: [PATCH 094/240] Make MetaData multi-valued to preserve values of
repeating WARC and HTTP headers
- code cleanup: fix indentation, remove unneeded return statements
---
src/main/java/org/archive/resource/MetaData.java | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/src/main/java/org/archive/resource/MetaData.java b/src/main/java/org/archive/resource/MetaData.java
index 30ce849b..fb3b24a4 100755
--- a/src/main/java/org/archive/resource/MetaData.java
+++ b/src/main/java/org/archive/resource/MetaData.java
@@ -83,7 +83,6 @@ public int optInt(String key, int defaultValue) {
return super.getInt(key);
} catch(JSONException e) {
LOG.severe(e.getMessage());
- return defaultValue;
}
}
return defaultValue;
@@ -106,7 +105,6 @@ public long optLong(String key, long defaultValue) {
return super.getLong(key);
} catch(JSONException e) {
LOG.severe(e.getMessage());
- return defaultValue;
}
}
return defaultValue;
@@ -167,10 +165,10 @@ public JSONObject put(String key, Object value) {
((JSONArray) super.get(key)).put(value);
return this;
} else {
- JSONArray array = new JSONArray();
- array.put(super.get(key));
- array.put(value);
- super.put(key, array);
+ JSONArray array = new JSONArray();
+ array.put(super.get(key));
+ array.put(value);
+ super.put(key, array);
}
return super.accumulate(key, value);
}
From a4748d9e79abb972a6571f5f4d46951be6049b1a Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 27 Nov 2024 13:24:17 +0100
Subject: [PATCH 095/240] URLParser and WaybackURLKeyMaker fail on URLs with
IPv6 address hostname
---
src/main/java/org/archive/url/URLParser.java | 11 ++++++++++-
.../java/org/archive/url/URLRegexTransformer.java | 4 ++++
src/test/java/org/archive/url/URLParserTest.java | 3 +++
.../java/org/archive/url/WaybackURLKeyMakerTest.java | 3 +++
4 files changed, 20 insertions(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/url/URLParser.java b/src/main/java/org/archive/url/URLParser.java
index a7860b02..bcd0b7fb 100644
--- a/src/main/java/org/archive/url/URLParser.java
+++ b/src/main/java/org/archive/url/URLParser.java
@@ -226,7 +226,16 @@ public static HandyURL parse(String urlString) throws URISyntaxException {
String colonPort = null;
int atIndex = uriAuthority.indexOf(COMMERCIAL_AT);
- int portColonIndex = uriAuthority.indexOf(COLON,(atIndex<0)?0:atIndex);
+ int portColonIndex = -1;
+ int startColonIndex = 0;
+ if (atIndex > -1) {
+ startColonIndex = atIndex;
+ }
+ if (uriAuthority.charAt(startColonIndex) == '[') {
+ // IPv6 address
+ startColonIndex = uriAuthority.indexOf(']', (startColonIndex + 1));
+ }
+ portColonIndex = uriAuthority.indexOf(COLON, startColonIndex);
if(atIndex<0 && portColonIndex<0) {
// most common case: neither userinfo nor port
diff --git a/src/main/java/org/archive/url/URLRegexTransformer.java b/src/main/java/org/archive/url/URLRegexTransformer.java
index 617e0225..5f31c81c 100644
--- a/src/main/java/org/archive/url/URLRegexTransformer.java
+++ b/src/main/java/org/archive/url/URLRegexTransformer.java
@@ -121,6 +121,10 @@ public static String hostToSURT(String host) {
// TODO: ensure we DONT reverse IP addresses!
String parts[] = host.split("\\.",-1);
if(parts.length == 1) {
+ // strip enclosing "[" and "]" from IPv6 hosts
+ if (host.charAt(0) == '[' && host.charAt(host.length() - 1) == ']') {
+ return host.substring(1, host.length() - 1);
+ }
return host;
}
StringBuilder sb = new StringBuilder(host.length());
diff --git a/src/test/java/org/archive/url/URLParserTest.java b/src/test/java/org/archive/url/URLParserTest.java
index b060ffa7..68dfcd23 100644
--- a/src/test/java/org/archive/url/URLParserTest.java
+++ b/src/test/java/org/archive/url/URLParserTest.java
@@ -86,6 +86,9 @@ public void testParse() throws UnsupportedEncodingException, URISyntaxException
checkParse(" \n http://:****@www.archive.org:8080/inde\rx.html?query#foo \r\n \t ",
null, "http", "", "****", "www.archive.org", 8080, "/index.html", "query", "foo",
"http://:****@www.archive.org:8080/index.html?query#foo", "/index.html?query");
+ checkParse("https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/robots.txt", null, "https", null, null,
+ "[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]", -1, "/robots.txt", null, null,
+ "https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/robots.txt", "/robots.txt");
}
private void checkParse(String s, String opaque, String scheme, String authUser,
diff --git a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
index 26161456..1a1403ee 100644
--- a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
+++ b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
@@ -23,6 +23,9 @@ public void testMakeKey() throws URISyntaxException {
assertEquals("org,archive)/goo?a&b", km.makeKey("http://archive.org/goo/?b&a"));
assertEquals("org,archive)/goo?a=1&a=2&b", km.makeKey("http://archive.org/goo/?a=2&b&a=1"));
assertEquals("org,archive)/", km.makeKey("http://archive.org:/"));
+ assertEquals("192,211,203,34)/robots.txt", km.makeKey("https://34.203.211.192/robots.txt"));
+ assertEquals("2600:1f18:200d:fb00:2b74:867c:ab0c:150a)/robots.txt",
+ km.makeKey("https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/robots.txt"));
}
}
From 8e89847d79ea2882bc55e2d00939fd8a2ca21865 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 21:51:58 +0900
Subject: [PATCH 096/240] Update release plugins
---
pom.xml | 110 ++++++++++++++++++++++++++++++++++----------------------
1 file changed, 67 insertions(+), 43 deletions(-)
diff --git a/pom.xml b/pom.xml
index dc3088f0..048787a5 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,12 +1,6 @@
4.0.0
-
- org.sonatype.oss
- oss-parent
- 7
-
-
org.netpreserve.commonswebarchive-commons1.1.11-SNAPSHOT
@@ -45,19 +39,13 @@
scm:git:git@github.com:iipc/webarchive-commons.gitscm:git:git@github.com:iipc/webarchive-commons.git
- git@github.com:iipc/webarchive-commons.git
+ https://github.com/iipc/webarchive-commonsUTF-8${maven.build.timestamp}yyyyMMddhhmmss
-
-
- sonatype-nexus-staging
- https://oss.sonatype.org/service/local/staging/deploy/maven2/
- sonatype-nexus-snapshots
- https://oss.sonatype.org/content/repositories/snapshots/
@@ -201,24 +189,6 @@
8
-
- maven-assembly-plugin
- 2.4
-
-
- jar-with-dependencies
-
- webarchive-commons
-
-
-
- package
-
- single
-
-
-
- org.apache.maven.pluginsmaven-enforcer-plugin
@@ -251,17 +221,71 @@
-
+
+
+ release
+
+
+ ossrh
+ https://oss.sonatype.org/content/repositories/snapshots
+
+
+
+
+
+ org.sonatype.plugins
+ nexus-staging-maven-plugin
+ 1.6.7
+ true
+
+ ossrh
+ https://oss.sonatype.org/
+ true
+
+
+
+ org.apache.maven.plugins
+ maven-source-plugin
+ 2.2.1
+
+
+ attach-sources
+
+ jar-no-fork
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 2.9.1
+
+
+ attach-javadocs
+
+ jar
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-gpg-plugin
+ 1.5
+
+
+ sign-artifacts
+ verify
+
+ sign
+
+
+
+
+
+
+
+
From 829566b1385a8dae6bc9774cd1299469f37e78c3 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 21:53:22 +0900
Subject: [PATCH 097/240] [maven-release-plugin] prepare release
webarchive-commons-1.1.11
---
pom.xml | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 048787a5..28bd9145 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.11-SNAPSHOT
+ 1.1.11jarwebarchive-commons
@@ -40,7 +40,8 @@
scm:git:git@github.com:iipc/webarchive-commons.gitscm:git:git@github.com:iipc/webarchive-commons.githttps://github.com/iipc/webarchive-commons
-
+ webarchive-commons-1.1.11
+
UTF-8
From 9b0bbcfdeea7a9c2ac9a28b245bce2f8e9df5dce Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 21:53:27 +0900
Subject: [PATCH 098/240] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 28bd9145..c86add9f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.11
+ 1.1.12-SNAPSHOTjarwebarchive-commons
@@ -40,7 +40,7 @@
scm:git:git@github.com:iipc/webarchive-commons.gitscm:git:git@github.com:iipc/webarchive-commons.githttps://github.com/iipc/webarchive-commons
- webarchive-commons-1.1.11
+ HEAD
From 9e4723b313a542320a4f09f4b4e2dbccdc0f58ac Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 21:58:55 +0900
Subject: [PATCH 099/240] Update CHANGES.md
---
CHANGES.md | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/CHANGES.md b/CHANGES.md
index 6fe7c4bd..579b659f 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,7 +1,14 @@
+1.1.11
+------
+
+#### Bug fixes
+
+* Fixed URLParser and WaybackURLKeyMaker failing on URLs with IPv6 address hostnames [#100](https://github.com/iipc/webarchive-commons/pull/100)
+
1.1.10
------
-#### Fixes
+#### Bug fixes
* [WAT extractor: do not fail on missing WARC-Filename in warcinfo record](https://github.com/iipc/webarchive-commons/pull/89)
* [ExtractingParseObserver: extract rel, hreflang and type attributes](https://github.com/iipc/webarchive-commons/pull/86)
From cd2da63f1f56d41705e014e2c3290635fcc99099 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 22:00:18 +0900
Subject: [PATCH 100/240] Add description to pom.xml (now mandatory for
central)
---
pom.xml | 1 +
1 file changed, 1 insertion(+)
diff --git a/pom.xml b/pom.xml
index c86add9f..18aca329 100644
--- a/pom.xml
+++ b/pom.xml
@@ -8,6 +8,7 @@
webarchive-commonshttps://github.com/iipc/webarchive-commons
+ Common web archive utility codeThe International Internet Preservation Consortium
From 7b6df0c619899ae70e350fb0d955c00b59ba68e5 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 22:02:31 +0900
Subject: [PATCH 101/240] [maven-release-plugin] prepare release
webarchive-commons-1.1.11
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 18aca329..a57230d9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.12-SNAPSHOT
+ 1.1.11jarwebarchive-commons
@@ -41,7 +41,7 @@
scm:git:git@github.com:iipc/webarchive-commons.gitscm:git:git@github.com:iipc/webarchive-commons.githttps://github.com/iipc/webarchive-commons
- HEAD
+ webarchive-commons-1.1.11
From a70f23e8b654d3a661877641f2fa7e51d696ceeb Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 22:02:36 +0900
Subject: [PATCH 102/240] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index a57230d9..18aca329 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.11
+ 1.1.12-SNAPSHOTjarwebarchive-commons
@@ -41,7 +41,7 @@
scm:git:git@github.com:iipc/webarchive-commons.gitscm:git:git@github.com:iipc/webarchive-commons.githttps://github.com/iipc/webarchive-commons
- webarchive-commons-1.1.11
+ HEAD
From 0514b2387decaf5e40e24bcda0f7c70b438d0997 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 22:08:04 +0900
Subject: [PATCH 103/240] Add Maven Central and Javadoc shields to README
---
README.md | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 72858a52..55be6e68 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,6 @@
IIPC Web Archive Commons
========================
-
-[](https://travis-ci.org/iipc/webarchive-commons/)
+[](https://maven-badges.herokuapp.com/maven-central/org.netpreserve.commons/webarchive-commons) [](https://www.javadoc.io/doc/org.netpreserve.commons/webarchive-commons)
This repository contains common utility code for [OpenWayback][1] and other projects.
From c6095082fdecadd6882456a51c5f91b8a3d4faa5 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 15:42:05 +0900
Subject: [PATCH 104/240] Bump guava from 33.3.0-jre to 33.3.1-jre
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 18aca329..0ac11df9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -60,7 +60,7 @@
com.google.guavaguava
- 33.3.0-jre
+ 33.3.1-jre
From 23c8887c2a3eb4d4d5b0bac0cf805c71fcaeabaf Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 15:42:41 +0900
Subject: [PATCH 105/240] Bump commons-io from 2.14.0 to 2.18.0
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 0ac11df9..84822a4f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -140,7 +140,7 @@
commons-iocommons-io
- 2.14.0
+ 2.18.0
From f13c7b2a3b254a83827ad5a1c27131c6980c79eb Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 15:47:28 +0900
Subject: [PATCH 106/240] Bump commons-lang from 2.5 to 2.6
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 84822a4f..3d5f995f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -134,7 +134,7 @@
commons-langcommons-lang
- 2.5
+ 2.6
From 5528afc05f77189b7ef59dbb9cdcce2bd35656e7 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:24:04 +0900
Subject: [PATCH 107/240] Bump junit from 4.13.1 to 4.13.2
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 3d5f995f..46f26766 100644
--- a/pom.xml
+++ b/pom.xml
@@ -54,7 +54,7 @@
junitjunit
- 4.13.1
+ 4.13.2
From 7426c563310f73a0820a9af729b5f3621cea57f4 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:24:52 +0900
Subject: [PATCH 108/240] Bump hadoop from 3.4.0 to 3.4.1
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 46f26766..c1bc7798 100644
--- a/pom.xml
+++ b/pom.xml
@@ -95,7 +95,7 @@
org.apache.hadoophadoop-common
- 3.4.0
+ 3.4.1true
@@ -108,7 +108,7 @@
org.apache.hadoophadoop-mapreduce-client-core
- 3.4.0
+ 3.4.1true
From 88607b2ed67c8c73e8b199adf85ac1ddf2fcdddb Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:26:16 +0900
Subject: [PATCH 109/240] Bump httpcore from 4.3 to 4.4.16
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index c1bc7798..a993945e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -176,7 +176,7 @@
org.apache.httpcomponentshttpcore
- 4.3
+ 4.4.16
From 0256ae6131e80c49e1ed4a16e5631ccff0d74e36 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:27:04 +0900
Subject: [PATCH 110/240] Bump htmlparser from 1.6 to 2.1
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index a993945e..ce0a2aec 100644
--- a/pom.xml
+++ b/pom.xml
@@ -71,7 +71,7 @@
org.htmlparserhtmlparser
- 1.6
+ 2.1
From e1d458a86a2203ca1cd5cab967fb17f268994082 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:27:31 +0900
Subject: [PATCH 111/240] Bump json from 20231013 to 20240303
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index ce0a2aec..1023560c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -66,7 +66,7 @@
org.jsonjson
- 20231013
+ 20240303org.htmlparser
From c839700d472bac5b4625ea4fe10ef47ee02a5a31 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:35:12 +0900
Subject: [PATCH 112/240] Update CHANGES.md
---
CHANGES.md | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/CHANGES.md b/CHANGES.md
index 579b659f..e3afd137 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,21 @@
+1.2.0
+-----
+
+#### New features
+
+* MetaData is now multivalued to support repeated WARC and HTTP headers. [#98](https://github.com/iipc/webarchive-commons/pull/98/files)
+
+#### Dependency upgrades
+
+* commons-io 2.18.0
+* commons-lang 2.6
+* guava 33.3.1-jre
+* hadoop 3.4.1
+* htmlparser 2.1
+* httpcore 4.4.16
+* json 20240303
+* junit 4.13.2
+
1.1.11
------
From 91c01ddb0561798d204c957fefafa782c0b53921 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:37:15 +0900
Subject: [PATCH 113/240] [maven-release-plugin] prepare release
webarchive-commons-1.2.0
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 1023560c..12dfae9f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.12-SNAPSHOT
+ 1.2.0jarwebarchive-commons
@@ -41,7 +41,7 @@
scm:git:git@github.com:iipc/webarchive-commons.gitscm:git:git@github.com:iipc/webarchive-commons.githttps://github.com/iipc/webarchive-commons
- HEAD
+ webarchive-commons-1.2.0
From f37418d08d8fa7fd4ccad4fbb919cc0fc371f2f2 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:37:20 +0900
Subject: [PATCH 114/240] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 12dfae9f..0d84b0d2 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commonswebarchive-commons
- 1.2.0
+ 1.2.1-SNAPSHOTjarwebarchive-commons
@@ -41,7 +41,7 @@
scm:git:git@github.com:iipc/webarchive-commons.gitscm:git:git@github.com:iipc/webarchive-commons.githttps://github.com/iipc/webarchive-commons
- webarchive-commons-1.2.0
+ HEAD
From 3ae5720ad43e2e80b5ab853078e891ee53641a3c Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 3 Dec 2024 20:22:10 +0900
Subject: [PATCH 115/240] Remove dependency on dsiutils
---
pom.xml | 16 ++--------------
.../java/org/archive/url/UsableURIFactory.java | 5 ++---
2 files changed, 4 insertions(+), 17 deletions(-)
diff --git a/pom.xml b/pom.xml
index 0d84b0d2..da2e14da 100644
--- a/pom.xml
+++ b/pom.xml
@@ -150,20 +150,8 @@
it.unimi.dsi
- dsiutils
- 2.2.8
- compile
-
-
- ch.qos.logback
- logback-classic
-
-
-
- commons-collections
- commons-collections
-
-
+ fastutil
+ 7.0.10
diff --git a/src/main/java/org/archive/url/UsableURIFactory.java b/src/main/java/org/archive/url/UsableURIFactory.java
index d44b5c84..3dfc33a7 100644
--- a/src/main/java/org/archive/url/UsableURIFactory.java
+++ b/src/main/java/org/archive/url/UsableURIFactory.java
@@ -20,7 +20,6 @@
import gnu.inet.encoding.IDNA;
import gnu.inet.encoding.IDNAException;
-import it.unimi.dsi.lang.MutableString;
import java.io.UnsupportedEncodingException;
import java.util.BitSet;
@@ -485,7 +484,7 @@ private String fixup(String uri, final URI base, final String charset)
// Preallocate. The '1's and '2's in below are space for ':',
// '//', etc. URI characters.
- MutableString s = new MutableString(
+ StringBuilder s = new StringBuilder(
((uriScheme != null)? uriScheme.length(): 0)
+ 1 // ';'
+ ((uriAuthority != null)? uriAuthority.length(): 0)
@@ -707,7 +706,7 @@ private String checkPort(String uriAuthority)
* @param substr Suffix or prefix to use if str is not null.
* @param suffix True if substr is a suffix.
*/
- private void appendNonNull(MutableString b, String str, String substr,
+ private void appendNonNull(StringBuilder b, String str, String substr,
boolean suffix) {
if (str != null && str.length() > 0) {
if (!suffix) {
From 33556bf741eaa10421b9214bbbd69f40618d27d1 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 3 Dec 2024 20:38:46 +0900
Subject: [PATCH 116/240] Remove pom-cdh4.xml
---
pom-cdh4.xml | 229 ---------------------------------------------------
1 file changed, 229 deletions(-)
delete mode 100644 pom-cdh4.xml
diff --git a/pom-cdh4.xml b/pom-cdh4.xml
deleted file mode 100644
index de19d8d0..00000000
--- a/pom-cdh4.xml
+++ /dev/null
@@ -1,229 +0,0 @@
-
- 4.0.0
-
- org.archive
- ia-web-commons
- 1.0-SNAPSHOT
- jar
-
- ia-web-commons
- http://maven.apache.org
-
-
- UTF-8
- ${maven.build.timestamp}
- yyyyMMddhhmmss
-
-
-
-
- junit
- junit
- 3.8.1
- test
-
-
-
- com.google.guava
- guava
- 14.0.1
-
-
-
- org.json
- json
- 20090211
-
-
- org.htmlparser
- htmlparser
- 1.6
-
-
-
- org.mozilla
- juniversalchardet
- 1.0.3
-
-
-
- commons-httpclient
- commons-httpclient
- 3.1
-
-
-
- org.apache.hadoop
- hadoop-core
- 2.0.0-mr1-cdh4.2.0
-
-
- commons-httpclient
- commons-httpclient
-
-
- javax.servlet
- servlet-api
-
-
- javax.servlet.jsp
- jsp-api
-
-
- org.mortbay.jetty
- jetty
-
-
- org.mortbay.jetty
- jetty-util
-
-
- tomcat
- jasper-runtime
-
-
- tomcat
- jasper-compiler
-
-
-
-
- org.apache.hadoop
- hadoop-common
- 2.0.0-cdh4.2.0
-
-
- org.apache.hadoop
- hadoop-mapreduce-client-common
- 2.0.0-cdh4.2.0
-
-
- org.apache.hadoop
- hadoop-mapreduce-client-core
- 2.0.0-cdh4.2.0
-
-
-
- org.apache.pig
- pig
- 0.11.1
- provided
-
-
-
- commons-lang
- commons-lang
- 2.5
-
-
-
- commons-io
- commons-io
- 2.4
-
-
-
- org.gnu.inet
- libidn
- 1.15
-
-
- it.unimi.dsi
- mg4j
- 1.0.1
- compile
-
-
- org.apache.httpcomponents
- httpcore
- 4.3
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-compiler-plugin
- 2.3.2
-
- 1.6
- 1.6
-
-
-
- maven-assembly-plugin
- 2.4
-
-
- jar-with-dependencies
-
- ia-web-commons
-
-
-
- package
-
- single
-
-
-
-
-
-
-
- src/main/resources
- true
-
-
-
-
-
-
- internetarchive
- Internet Archive Maven Repository
- http://builds.archive.org:8080/maven2
- default
-
-
- true
- daily
- warn
-
-
- true
- daily
- warn
-
-
-
-
- cloudera
- Cloudera Hadoop
- https://repository.cloudera.com/artifactory/cloudera-repos/
- default
-
-
- true
- daily
- warn
-
-
- true
- daily
- warn
-
-
-
-
-
-
-
- repository
-
- ${repository.url}
-
-
-
-
From 4bb03baec41d90795e312e4a2865abb0395670f3 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 3 Dec 2024 20:42:29 +0900
Subject: [PATCH 117/240] Use Files.createLink instead of shelling out to ln
---
.../io/ObjectPlusFilesOutputStream.java | 19 ++++---------------
1 file changed, 4 insertions(+), 15 deletions(-)
diff --git a/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java b/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java
index 224f24e7..bd5c1eea 100644
--- a/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java
+++ b/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java
@@ -18,10 +18,8 @@
*/
package org.archive.io;
-import java.io.File;
-import java.io.IOException;
-import java.io.ObjectOutputStream;
-import java.io.OutputStream;
+import java.io.*;
+import java.nio.file.Files;
import java.util.LinkedList;
import org.archive.util.FileUtils;
@@ -116,19 +114,10 @@ public void snapshotAppendOnlyFile(File file) throws IOException {
* @throws IOException
*/
private void hardlinkOrCopy(File file, File destination) throws IOException {
- // For Linux/UNIX, try a hard link first.
- Process link = Runtime.getRuntime().exec("ln "+file.getAbsolutePath()+" "+destination.getAbsolutePath());
- // TODO NTFS also supports hard links; add appropriate try
try {
- link.waitFor();
- } catch (InterruptedException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- if(link.exitValue()!=0) {
- // hard link failed
+ Files.createLink(destination.toPath(), file.toPath());
+ } catch (UnsupportedEncodingException e) {
FileUtils.copyFile(file,destination);
}
}
-
}
From 328aef2788313a2abc6123c385f9c31b863d6f1b Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 4 Dec 2024 15:07:23 +0900
Subject: [PATCH 118/240] Remove dependency on fastutil
Fastutil is our largest dependency and consumes a third of the overall Heritrix distribution size. If we update to the latest version it will be even larger. But we're only using two tiny classes from it: the trivial RepositionableStream interface and the unsynchronized FastBufferedOutputStream.
Some downstream users (e.g. lockss-core) actually implement RepositionableStream, so to preserve API compatiblity this change includes a copy of just that interface while keeping the same package name.
Regarding FastBufferedOutputStream, for WARC writing the outer GZIPOutputStream is synchronized anyway. And RecordingOutputStream will typically be doing moderately large writes copying from the network. So in both usages it seems unlikely that there's much practical benefit in using it here over the standard BufferedOutputStream. The JVM JIT has a lot of optimizations for synchronized these days too.
---
pom.xml | 5 ---
.../dsi/fastutil/io/RepositionableStream.java | 42 +++++++++++++++++++
.../org/archive/io/RecordingOutputStream.java | 5 +--
.../java/org/archive/io/WriterPoolMember.java | 5 +--
4 files changed, 46 insertions(+), 11 deletions(-)
create mode 100644 src/main/java/it/unimi/dsi/fastutil/io/RepositionableStream.java
diff --git a/pom.xml b/pom.xml
index da2e14da..5e5fa419 100644
--- a/pom.xml
+++ b/pom.xml
@@ -148,11 +148,6 @@
libidn1.15
-
- it.unimi.dsi
- fastutil
- 7.0.10
-
diff --git a/src/main/java/it/unimi/dsi/fastutil/io/RepositionableStream.java b/src/main/java/it/unimi/dsi/fastutil/io/RepositionableStream.java
new file mode 100644
index 00000000..a81645f0
--- /dev/null
+++ b/src/main/java/it/unimi/dsi/fastutil/io/RepositionableStream.java
@@ -0,0 +1,42 @@
+// copied from fastutil, keeping the original package name to avoid breaking
+// compatibility with existing user code that implements this interface
+package it.unimi.dsi.fastutil.io;
+
+/*
+ * Copyright (C) 2005-2015 Sebastiano Vigna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/** A basic interface specifying positioning methods for a byte stream.
+ *
+ * @author Sebastiano Vigna
+ * @since 4.4
+ */
+
+public interface RepositionableStream {
+
+ /** Sets the current stream position.
+ *
+ * @param newPosition the new stream position.
+ */
+ void position( long newPosition ) throws java.io.IOException;
+
+ /** Returns the current stream position.
+ *
+ * @return the current stream position.
+ */
+ long position() throws java.io.IOException;
+
+}
diff --git a/src/main/java/org/archive/io/RecordingOutputStream.java b/src/main/java/org/archive/io/RecordingOutputStream.java
index 7d2ff212..6c77997b 100644
--- a/src/main/java/org/archive/io/RecordingOutputStream.java
+++ b/src/main/java/org/archive/io/RecordingOutputStream.java
@@ -19,8 +19,7 @@
package org.archive.io;
-import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
-
+import java.io.BufferedOutputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
@@ -207,7 +206,7 @@ public void open(OutputStream wrappedStream) throws IOException {
protected OutputStream ensureDiskStream() throws FileNotFoundException {
if (this.diskStream == null) {
FileOutputStream fis = new FileOutputStream(this.backingFilename);
- this.diskStream = new FastBufferedOutputStream(fis);
+ this.diskStream = new BufferedOutputStream(fis);
}
return this.diskStream;
}
diff --git a/src/main/java/org/archive/io/WriterPoolMember.java b/src/main/java/org/archive/io/WriterPoolMember.java
index 893007ec..e10d443b 100644
--- a/src/main/java/org/archive/io/WriterPoolMember.java
+++ b/src/main/java/org/archive/io/WriterPoolMember.java
@@ -19,8 +19,7 @@
package org.archive.io;
-import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
-
+import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
@@ -200,7 +199,7 @@ protected String createFile(final File file) throws IOException {
close();
this.f = file;
FileOutputStream fos = new FileOutputStream(this.f);
- this.countOut = new MiserOutputStream(new FastBufferedOutputStream(fos),settings.getFrequentFlushes());
+ this.countOut = new MiserOutputStream(new BufferedOutputStream(fos),settings.getFrequentFlushes());
this.out = this.countOut;
logger.fine("Opened " + this.f.getAbsolutePath());
return this.f.getName();
From 8988fbbc3528afcc7f792bcc967189311e8a1286 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 4 Dec 2024 16:54:03 +0900
Subject: [PATCH 119/240] Deprecate some classes specific to HttpClient 3
These are intended to be removed in webarchive-commons 2. #78
---
.../java/org/archive/httpclient/HttpRecorderGetMethod.java | 2 ++
src/main/java/org/archive/httpclient/HttpRecorderMethod.java | 2 ++
.../java/org/archive/httpclient/HttpRecorderPostMethod.java | 2 ++
.../org/archive/httpclient/SingleHttpConnectionManager.java | 2 ++
.../archive/httpclient/ThreadLocalHttpConnectionManager.java | 4 +++-
.../util/binsearch/impl/HTTPSeekableLineReaderFactory.java | 1 +
.../archive/util/binsearch/impl/http/ApacheHttp31SLR.java | 4 ++++
.../util/binsearch/impl/http/ApacheHttp31SLRFactory.java | 5 +++++
8 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java
index ef241b48..1a94af1f 100644
--- a/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java
+++ b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java
@@ -70,7 +70,9 @@
*
* @author stack
* @version $Revision$, $Date$
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
*/
+@Deprecated
public class HttpRecorderGetMethod extends GetMethod {
protected static Logger logger =
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java
index 932e7e98..b08bc0bd 100644
--- a/src/main/java/org/archive/httpclient/HttpRecorderMethod.java
+++ b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java
@@ -34,7 +34,9 @@
*
* @author stack
* @version $Revision$, $Date$
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
*/
+@Deprecated
public class HttpRecorderMethod {
protected static Logger logger =
Logger.getLogger(HttpRecorderMethod.class.getName());
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java
index 20f1bfd1..d55d816a 100644
--- a/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java
+++ b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java
@@ -36,7 +36,9 @@
*
* @author stack
* @version $Date$ $Revision$
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
*/
+@Deprecated
public class HttpRecorderPostMethod extends PostMethod {
/**
* Instance of http recorder method.
diff --git a/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java
index 4ba6a837..d6cf27ab 100644
--- a/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java
+++ b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java
@@ -32,7 +32,9 @@
* with external mechanisms.
*
* @author gojomo
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
*/
+@Deprecated
public class SingleHttpConnectionManager extends SimpleHttpConnectionManager {
public SingleHttpConnectionManager() {
diff --git a/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java
index 91e850ea..16821b36 100644
--- a/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java
+++ b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java
@@ -36,8 +36,10 @@
*
* Java >= 1.4 is recommended.
*
- * @author Christian Kohlschuetter
+ * @author Christian Kohlschuetter
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
*/
+@Deprecated
public final class ThreadLocalHttpConnectionManager implements
HttpConnectionManager {
diff --git a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java
index b4a23db0..68ee6551 100644
--- a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java
+++ b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java
@@ -20,6 +20,7 @@ protected HTTPSeekableLineReaderFactory()
public enum HttpLibs
{
+ @Deprecated
APACHE_31,
APACHE_43,
URLCONN,
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
index c4fdbba8..124d3d03 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
@@ -14,6 +14,10 @@
import org.apache.commons.io.input.CountingInputStream;
import org.archive.util.binsearch.impl.HTTPSeekableLineReader;
+/**
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
+ */
+@Deprecated
public class ApacheHttp31SLR extends HTTPSeekableLineReader {
private HttpClient http;
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
index bc5b83f4..2af03dab 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
@@ -15,6 +15,11 @@
import org.archive.util.binsearch.impl.HTTPSeekableLineReader;
import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory;
+/**
+ *
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
+ */
+@Deprecated
public class ApacheHttp31SLRFactory extends HTTPSeekableLineReaderFactory {
private final static Logger LOGGER = Logger.getLogger(ApacheHttp31SLRFactory.class.getName());
From b8a91bb3b7e8a36b2162251314ff52b42a379221 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Thu, 5 Dec 2024 07:49:10 +0900
Subject: [PATCH 120/240] Remove unused dependency on commons-collections
---
pom.xml | 7 -------
1 file changed, 7 deletions(-)
diff --git a/pom.xml b/pom.xml
index 5e5fa419..6dec154c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -149,13 +149,6 @@
1.15
-
-
- commons-collections
- commons-collections
- 3.2.2
-
-
org.apache.httpcomponentshttpcore
From a80b98dfe4b1c2a7556e7df2574c16426849f6d9 Mon Sep 17 00:00:00 2001
From: Tom Morris
Date: Sat, 26 Aug 2023 20:05:34 -0400
Subject: [PATCH 121/240] Add failing test from Sebastian's issue
---
src/test/java/org/archive/url/BasicURLCanonicalizerTest.java | 3 +++
src/test/java/org/archive/url/WaybackURLKeyMakerTest.java | 4 ++++
2 files changed, 7 insertions(+)
diff --git a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java
index c21bcbe8..cc100e4c 100644
--- a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java
+++ b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java
@@ -143,6 +143,9 @@ public void testUnescapeRepeatedly() {
assertEquals("%",guc.unescapeRepeatedly("%25%32%35"));
assertEquals("168.188.99.26",guc.unescapeRepeatedly("%31%36%38%2e%31%38%38%2e%39%39%2e%32%36"));
+
+ assertEquals("tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5",
+ guc.unescapeRepeatedly("tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5"));
}
public void testAttemptIPFormats() throws URIException {
diff --git a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
index 1a1403ee..86250972 100644
--- a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
+++ b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
@@ -26,6 +26,10 @@ public void testMakeKey() throws URISyntaxException {
assertEquals("192,211,203,34)/robots.txt", km.makeKey("https://34.203.211.192/robots.txt"));
assertEquals("2600:1f18:200d:fb00:2b74:867c:ab0c:150a)/robots.txt",
km.makeKey("https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/robots.txt"));
+ assertEquals("ua,1kr)/newslist.html?tag=%e4%ee%f8%ea%ee%eb%fc%ed%ee%e5",
+ km.makeKey("http://1kr.ua/newslist.html?tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5"));
+ assertEquals("com,aluroba)/tags/%c3%ce%ca%c7%d1%e5%c7.htm",
+ km.makeKey("http://www.aluroba.com/tags/%C3%CE%CA%C7%D1%E5%C7.htm"));
}
}
From 5161306d9ec993d1986f0d092c056f33ba3abdfe Mon Sep 17 00:00:00 2001
From: Tom Morris
Date: Sun, 27 Aug 2023 13:01:19 -0400
Subject: [PATCH 122/240] Add non-UTF-8 encoded test from mailing list
---
src/test/java/org/archive/url/WaybackURLKeyMakerTest.java | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
index 86250972..26371ba8 100644
--- a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
+++ b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
@@ -30,6 +30,8 @@ public void testMakeKey() throws URISyntaxException {
km.makeKey("http://1kr.ua/newslist.html?tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5"));
assertEquals("com,aluroba)/tags/%c3%ce%ca%c7%d1%e5%c7.htm",
km.makeKey("http://www.aluroba.com/tags/%C3%CE%CA%C7%D1%E5%C7.htm"));
+ assertEquals("ac,insbase)/xoops2/modules/xpwiki?%a4%d5%a4%af%a4%aa%a4%ab%b8%a9%a4%aa%a4%aa%a4%ce%a4%b8%a4%e7%a4%a6%bb%d4",
+ km.makeKey("https://www.insbase.ac/xoops2/modules/xpwiki/?%A4%D5%A4%AF%A4%AA%A4%AB%B8%A9%A4%AA%A4%AA%A4%CE%A4%B8%A4%E7%A4%A6%BB%D4"));
}
}
From f7be47bc523c4d06cc7960dc2d3b1b58f9580906 Mon Sep 17 00:00:00 2001
From: Tom Morris
Date: Sun, 27 Aug 2023 13:11:30 -0400
Subject: [PATCH 123/240] Handle non-UTF-8 encoded characters. Fixes #6
---
.../archive/url/BasicURLCanonicalizer.java | 27 +++++++++++++++----
1 file changed, 22 insertions(+), 5 deletions(-)
diff --git a/src/main/java/org/archive/url/BasicURLCanonicalizer.java b/src/main/java/org/archive/url/BasicURLCanonicalizer.java
index c09ad6e6..37b448c1 100644
--- a/src/main/java/org/archive/url/BasicURLCanonicalizer.java
+++ b/src/main/java/org/archive/url/BasicURLCanonicalizer.java
@@ -15,18 +15,18 @@
/**
* Canonicalizer that does more or less basic fixup. Based initially on rules
* specified at https://developers.google.com/safe-browsing/developers_guide_v2#
- * Canonicalization. These rules are designed for clients of google's
+ * Canonicalization. These rules are designed for clients of Google's
* "experimental" Safe Browsing API to "check URLs against Google's
* constantly-updated blacklists of suspected phishing and malware pages".
*
*
- * This class differs from google in treatment of non-ascii input. Google's
+ * This class differs from Google in treatment of non-ascii input. Google's
* rules don't really address this except with one example test case, which
* seems to suggest taking raw input bytes and pct-encoding them byte for byte.
* Since the input to this class consists of java strings, not raw bytes, that
- * wouldn't be possible, even if deemed preferable. Instead
+ * wouldn't be possible, even if deemed preferable. Instead,
* BasicURLCanonicalizer expresses non-ascii characters pct-encoded UTF-8.
*/
public class BasicURLCanonicalizer implements URLCanonicalizer {
@@ -212,6 +212,10 @@ protected static Charset UTF8() {
return _UTF8;
}
+ /**
+ * @param input String to be percent-encoded. Assumed to be fully unescaped.
+ * @return percent-encoded string
+ */
public String escapeOnce(String input) {
if (input == null) {
return null;
@@ -243,6 +247,19 @@ public String escapeOnce(String input) {
*/
sb = new StringBuilder(input.substring(0, i));
}
+ if (b == '%' && i < utf8bytes.length - 2) {
+ // Any hex escapes left at this point represent non-UTF-8 encoded characters
+ // Unescape them, so they don't get double escaped
+ int hex1 = getHex(utf8bytes[i + 1]);
+ if (hex1 >= 0) {
+ int hex2 = getHex(utf8bytes[i + 2]);
+ if (hex2 >= 0) {
+ i = i+2;
+ b = hex1 * 16 + hex2;
+ }
+ }
+
+ }
sb.append("%");
String hex = Integer.toHexString(b).toUpperCase();
if (hex.length() == 1) {
@@ -337,7 +354,7 @@ public String decode(String input) {
* Decodes bytes in bbuf as utf-8 and appends decoded characters to sb. If
* decoding of any portion fails, appends the un-decodable %xx%xx sequence
* extracted from inputStr instead of decoded characters. See "bad unicode"
- * tests in GoogleCanonicalizerTest#testDecode(). Variables only make sense
+ * tests in BasicURLCanonicalizerTest#testDecode(). Variables only make sense
* within context of {@link #decode(String)}.
*
* @param sb
From 6a3cf1b317c87305d05faee73d2c3ee3f5ec08b0 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 11 Dec 2024 21:14:06 +0100
Subject: [PATCH 124/240] WAT: Duplicated payload metadata values for
"Actual-Content-Length" and "Trailing-Slop-Length"
---
.../org/archive/resource/arc/ARCResource.java | 2 +
.../http/HTTPHeadersResourceFactory.java | 11 +++--
.../archive/resource/warc/WARCResource.java | 14 ++++--
.../record/WARCMetaDataResourceFactory.java | 10 +++-
.../archive/resource/arc/ARCResourceTest.java | 48 +++++++++++++++++++
.../resource/warc/WARCResourceTest.java | 46 ++++++++++++++++++
6 files changed, 123 insertions(+), 8 deletions(-)
create mode 100644 src/test/java/org/archive/resource/arc/ARCResourceTest.java
create mode 100644 src/test/java/org/archive/resource/warc/WARCResourceTest.java
diff --git a/src/main/java/org/archive/resource/arc/ARCResource.java b/src/main/java/org/archive/resource/arc/ARCResource.java
index b6e0a1c1..b0195f08 100644
--- a/src/main/java/org/archive/resource/arc/ARCResource.java
+++ b/src/main/java/org/archive/resource/arc/ARCResource.java
@@ -64,10 +64,12 @@ public ARCResource(MetaData metaData, ResourceContainer container,
}
}
+ @Override
public InputStream getInputStream() {
return new EOFNotifyingInputStream(digIS, this);
}
+ @Override
public void notifyEOF() throws IOException {
metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
String digString = Base32.encode(digIS.getMessageDigest().digest());
diff --git a/src/main/java/org/archive/resource/http/HTTPHeadersResourceFactory.java b/src/main/java/org/archive/resource/http/HTTPHeadersResourceFactory.java
index 79805090..eb25d821 100644
--- a/src/main/java/org/archive/resource/http/HTTPHeadersResourceFactory.java
+++ b/src/main/java/org/archive/resource/http/HTTPHeadersResourceFactory.java
@@ -31,6 +31,7 @@ public HTTPHeadersResourceFactory(String name, String type) {
parser = new HttpHeaderParser();
}
+ @Override
public Resource getResource(InputStream is, MetaData parentMetaData,
ResourceContainer container) throws ResourceParseException,
IOException {
@@ -40,9 +41,13 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
if(headers.isCorrupt()) {
parentMetaData.putBoolean(HTTP_HEADERS_CORRUPT, true);
}
- parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
-
- parentMetaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is));
+ if (!parentMetaData.has(PAYLOAD_LENGTH) || bytes != parentMetaData.getLong(PAYLOAD_LENGTH)) {
+ parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
+ }
+ long trailingSlopBytes = StreamCopy.readToEOF(is);
+ if (!parentMetaData.has(PAYLOAD_SLOP_BYTES) || trailingSlopBytes > 0) {
+ parentMetaData.putLong(PAYLOAD_SLOP_BYTES, trailingSlopBytes);
+ }
if(type != null) {
parentMetaData.putString(PAYLOAD_CONTENT_TYPE, type);
}
diff --git a/src/main/java/org/archive/resource/warc/WARCResource.java b/src/main/java/org/archive/resource/warc/WARCResource.java
index d538a25d..a9c3fcc3 100644
--- a/src/main/java/org/archive/resource/warc/WARCResource.java
+++ b/src/main/java/org/archive/resource/warc/WARCResource.java
@@ -53,7 +53,7 @@ public WARCResource(MetaData metaData, ResourceContainer container,
countingIS = new CountingInputStream(
ByteStreams.limit(response, length));
} else {
- throw new ResourceParseException(null);
+ throw new ResourceParseException(new Exception("Zero or negative length: " + length));
}
try {
digIS = new DigestInputStream(countingIS,
@@ -63,14 +63,18 @@ public WARCResource(MetaData metaData, ResourceContainer container,
}
}
+ @Override
public InputStream getInputStream() {
return new EOFNotifyingInputStream(digIS, this);
}
+ @Override
public void notifyEOF() throws IOException {
String digString = Base32.encode(digIS.getMessageDigest().digest());
if(container.isCompressed()) {
- metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
+ if (!metaData.has(PAYLOAD_LENGTH) || countingIS.getCount() != metaData.getLong(PAYLOAD_LENGTH)) {
+ metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
+ }
metaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(response));
metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString);
} else {
@@ -81,13 +85,17 @@ public void notifyEOF() throws IOException {
(PushBackOneByteInputStream) raw;
long numNewlines = StreamCopy.skipChars(pb1bis, CR_NL_CHARS);
if(numNewlines > 0) {
- metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
+ long payloadLength = countingIS.getCount();
+ if (!metaData.has(PAYLOAD_LENGTH) || payloadLength != metaData.getLong(PAYLOAD_LENGTH)) {
+ metaData.putLong(PAYLOAD_LENGTH, payloadLength);
+ }
metaData.putLong(PAYLOAD_SLOP_BYTES, numNewlines);
metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString);
}
}
}
}
+
public MetaData getEnvelopeMetaData() {
return envelope;
}
diff --git a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java
index 0dfb2834..ba8a35da 100644
--- a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java
+++ b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java
@@ -21,6 +21,7 @@ public WARCMetaDataResourceFactory() {
parser = new HttpHeaderParser();
}
+ @Override
public Resource getResource(InputStream is, MetaData parentMetaData,
ResourceContainer container) throws ResourceParseException,
IOException {
@@ -33,8 +34,13 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
if(headers.isCorrupt()) {
md.putBoolean(WARC_META_FIELDS_CORRUPT, true);
}
- parentMetaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is));
- parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
+ long trailingSlopBytes = StreamCopy.readToEOF(is);
+ if (!parentMetaData.has(PAYLOAD_SLOP_BYTES) || trailingSlopBytes > 0) {
+ parentMetaData.putLong(PAYLOAD_SLOP_BYTES, trailingSlopBytes);
+ }
+ if (!parentMetaData.has(PAYLOAD_LENGTH) || bytes != parentMetaData.getLong(PAYLOAD_LENGTH)) {
+ parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
+ }
return new WARCMetaDataResource(md,container, headers);
} catch (HttpParseException e) {
diff --git a/src/test/java/org/archive/resource/arc/ARCResourceTest.java b/src/test/java/org/archive/resource/arc/ARCResourceTest.java
new file mode 100644
index 00000000..43116af7
--- /dev/null
+++ b/src/test/java/org/archive/resource/arc/ARCResourceTest.java
@@ -0,0 +1,48 @@
+package org.archive.resource.arc;
+
+
+import static org.archive.resource.ResourceConstants.PAYLOAD_LENGTH;
+import static org.archive.resource.ResourceConstants.PAYLOAD_SLOP_BYTES;
+
+import java.io.IOException;
+
+import org.archive.extract.ExtractingResourceFactoryMapper;
+import org.archive.extract.ExtractingResourceProducer;
+import org.archive.extract.ProducerUtils;
+import org.archive.extract.ResourceFactoryMapper;
+import org.archive.resource.Resource;
+import org.archive.resource.ResourceParseException;
+import org.archive.resource.ResourceProducer;
+import org.archive.util.StreamCopy;
+
+import org.json.JSONObject;
+
+import junit.framework.TestCase;
+
+public class ARCResourceTest extends TestCase {
+
+ public void testARCResource() throws ResourceParseException, IOException {
+ String testFileName = "../../format/arc/IAH-20080430204825-00000-blackbook-truncated.arc";
+ ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
+ ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
+ ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
+
+ Resource resource = extractor.getNext();
+
+ while (resource != null) {
+ JSONObject payloadMD = resource.getMetaData().getTopMetaData().getJSONObject("Envelope")
+ .getJSONObject("Payload-Metadata");
+ System.err.println(payloadMD);
+
+ if (payloadMD.has(PAYLOAD_LENGTH)) {
+ assertTrue(payloadMD.getLong(PAYLOAD_LENGTH) != -1);
+ }
+ if (payloadMD.has(PAYLOAD_SLOP_BYTES)) {
+ // does not occur with the tested ARC file
+ }
+
+ StreamCopy.readToEOF(resource.getInputStream());
+ resource = extractor.getNext();
+ }
+ }
+}
diff --git a/src/test/java/org/archive/resource/warc/WARCResourceTest.java b/src/test/java/org/archive/resource/warc/WARCResourceTest.java
new file mode 100644
index 00000000..1b935405
--- /dev/null
+++ b/src/test/java/org/archive/resource/warc/WARCResourceTest.java
@@ -0,0 +1,46 @@
+package org.archive.resource.warc;
+
+import static org.archive.resource.ResourceConstants.PAYLOAD_LENGTH;
+import static org.archive.resource.ResourceConstants.PAYLOAD_SLOP_BYTES;
+
+import java.io.IOException;
+
+import org.archive.extract.ExtractingResourceFactoryMapper;
+import org.archive.extract.ExtractingResourceProducer;
+import org.archive.extract.ProducerUtils;
+import org.archive.extract.ResourceFactoryMapper;
+import org.archive.resource.Resource;
+import org.archive.resource.ResourceParseException;
+import org.archive.resource.ResourceProducer;
+import org.archive.util.StreamCopy;
+
+import org.json.JSONObject;
+
+import junit.framework.TestCase;
+
+public class WARCResourceTest extends TestCase {
+
+ public void testWARCResource() throws ResourceParseException, IOException {
+ String testFileName = "../../format/warc/IAH-urls-wget.warc";
+ ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
+ ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
+ ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
+
+ Resource resource = extractor.getNext();
+
+ while (resource != null) {
+ JSONObject payloadMD = resource.getMetaData().getTopMetaData().getJSONObject("Envelope")
+ .getJSONObject("Payload-Metadata");
+
+ if (payloadMD.has(PAYLOAD_LENGTH)) {
+ assertTrue(payloadMD.getLong(PAYLOAD_LENGTH) != -1);
+ }
+ if (payloadMD.has(PAYLOAD_SLOP_BYTES)) {
+ assertEquals(4, payloadMD.getLong(PAYLOAD_SLOP_BYTES));
+ }
+
+ StreamCopy.readToEOF(resource.getInputStream());
+ resource = extractor.getNext();
+ }
+ }
+}
From c5b779128edd1f0fad2709d4ab1b797326c2cb6c Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 20 Dec 2024 14:10:44 +0900
Subject: [PATCH 125/240] Update CHANGES.md for 1.3.0
---
CHANGES.md | 37 +++++++++++++++++++++++++++++++++++++
1 file changed, 37 insertions(+)
diff --git a/CHANGES.md b/CHANGES.md
index e3afd137..8a0a7d20 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,40 @@
+1.3.0
+-----
+
+#### URL Canonicalization Changed
+
+The output of WaybackURLKeyMaker and other canonicalizers based on BasicURLCanonicalizer has changed for URLs that
+contain non UTF-8 percent encoded sequences. For example when a URL contains "%C3%23" it will now be normalised to
+"%c3%23" whereas previous releases produced "%25c3%23". This change brings webarchive-commons more inline with pywb,
+surt (Python), warcio.js and RFC 3986. While CDX file compatibility with these newer tools should improve, note that CDX
+files generated by the new release which contain such URLs may not work correctly with existing versions of
+OpenWayback that use the older webarchive-commons. [#102](https://github.com/iipc/webarchive-commons/pull/102)
+
+#### Bug fixes
+
+* WAT: Duplicated payload metadata values for "Actual-Content-Length" and "Trailing-Slop-Length" [#103](https://github.com/iipc/webarchive-commons/pull/103)
+* ObjectPlusFilesOutputStream.hardlinkOrCopy now uses `Files.createLink()` instead of executing `ln`. This
+ prevents the potential for security vulnerabilities from command line option injection and improves portability.
+
+#### Dependency upgrades
+
+* fastutil removed
+* dsiutils removed
+
+#### Deprecations
+
+The following classes and enum members have been marked deprecated as a step towards removal of the dependency on
+Apache Commons HttpClient 3.1.
+
+* org.archive.httpclient.HttpRecorderGetMethod
+* org.archive.httpclient.HttpRecorderMethod
+* org.archive.httpclient.HttpRecorderPostMethod
+* org.archive.httpclient.SingleHttpConnectionManager
+* org.archive.httpclient.ThreadLocalHttpConnectionManager
+* org.archive.util.binsearch.impl.http.ApacheHttp31SLR
+* org.archive.util.binsearch.impl.http.ApacheHttp31SLRFactory
+* org.archive.util.binsearch.impl.http.HTTPSeekableLineReaderFactory.HttpLibs.APACHE_31
+
1.2.0
-----
From eee48cc18017dde59b1d12f11654a2c752c63d45 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 20 Dec 2024 14:12:09 +0900
Subject: [PATCH 126/240] [maven-release-plugin] prepare release
webarchive-commons-1.3.0
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 6dec154c..f489826c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commonswebarchive-commons
- 1.2.1-SNAPSHOT
+ 1.3.0jarwebarchive-commons
@@ -41,7 +41,7 @@
scm:git:git@github.com:iipc/webarchive-commons.gitscm:git:git@github.com:iipc/webarchive-commons.githttps://github.com/iipc/webarchive-commons
- HEAD
+ webarchive-commons-1.3.0
From a8fd8a74b83d3327bc074cf783f6315659fbc715 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 20 Dec 2024 14:12:13 +0900
Subject: [PATCH 127/240] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index f489826c..74a4bbe6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commonswebarchive-commons
- 1.3.0
+ 1.3.1-SNAPSHOTjarwebarchive-commons
@@ -41,7 +41,7 @@
scm:git:git@github.com:iipc/webarchive-commons.gitscm:git:git@github.com:iipc/webarchive-commons.githttps://github.com/iipc/webarchive-commons
- webarchive-commons-1.3.0
+ HEAD
From a3a39598fc7b6947e38161e9f27f6842eed95456 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 11 Mar 2025 10:20:00 +0100
Subject: [PATCH 128/240] Upgrade GitHub workflow actions cache
---
.github/workflows/maven.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
index 8bb55c4e..60fac096 100644
--- a/.github/workflows/maven.yml
+++ b/.github/workflows/maven.yml
@@ -24,7 +24,7 @@ jobs:
distribution: 'temurin'
cache: maven
- name: Cache local Maven repository
- uses: actions/cache@v2
+ uses: actions/cache@v4
with:
path: ~/.m2/repository
key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
From c427a12e82f3cebd6ba57152209d0bb5b9de2619 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Sun, 18 May 2025 09:39:48 +0900
Subject: [PATCH 129/240] Upgrade to JUnit 5
---
CHANGES.md | 7 +
pom.xml | 7 +-
.../java/org/archive/util/TmpDirTestCase.java | 119 ----
.../extract/RealCDXExtractorOutputTest.java | 31 +-
.../format/dns/DNSResponseParserTest.java | 10 +-
.../format/gzip/GZIPMemberSeriesTest.java | 38 +-
.../format/gzip/GZIPMemberWriterTest.java | 5 +-
.../format/gzip/zipnum/ZipNumWriterTest.java | 16 +-
.../http/HttpRequestMessageParserTest.java | 12 +-
.../format/http/HttpResponseParserTest.java | 14 +-
.../json/CompoundORJSONPathSpecTest.java | 5 +-
.../format/json/JSONPathSpecFactoryTest.java | 5 +-
.../org/archive/format/json/JSONViewTest.java | 9 +-
.../format/json/SimpleJSONPathSpecTest.java | 5 +-
.../format/text/html/CDATALexerTest.java | 14 +-
.../archive/io/ArchiveReaderFactoryTest.java | 27 +-
.../io/BufferedSeekInputStreamTest.java | 9 +-
.../archive/io/HeaderedArchiveRecordTest.java | 22 +-
.../archive/io/RecordingInputStreamTest.java | 39 +-
.../archive/io/RecordingOutputStreamTest.java | 74 ++-
.../archive/io/ReplayCharSequenceTest.java | 110 ++--
.../io/RepositionableInputStreamTest.java | 20 +-
.../archive/io/arc/ARCReaderFactoryTest.java | 13 +-
.../org/archive/io/arc/ARCWriterPoolTest.java | 41 +-
.../org/archive/io/arc/ARCWriterTest.java | 121 ++--
.../io/warc/WARCReaderFactoryTest.java | 7 +-
.../org/archive/io/warc/WARCWriterTest.java | 67 ++-
.../org/archive/net/PublicSuffixesTest.java | 55 +-
.../org/archive/resource/MetaDataTest.java | 21 +-
.../archive/resource/arc/ARCResourceTest.java | 6 +-
.../html/ExtractingParseObserverTest.java | 24 +-
.../resource/html/HTMLMetaDataTest.java | 12 +-
.../resource/warc/WARCResourceTest.java | 7 +-
.../org/archive/uid/UUIDGeneratorTest.java | 7 +-
.../url/AggressiveIAURLCanonicalizerTest.java | 9 +-
.../url/BasicURLCanonicalizerTest.java | 39 +-
.../java/org/archive/url/HandyURLTest.java | 13 +-
.../archive/url/IAURLCanonicalizerTest.java | 13 +-
.../url/OrdinaryIAURLCanonicalizerTest.java | 10 +-
.../java/org/archive/url/URLParserTest.java | 11 +-
.../archive/url/URLRegexTransformerTest.java | 45 +-
.../org/archive/url/UsableURIFactoryTest.java | 564 +++++++++---------
.../java/org/archive/url/UsableURITest.java | 16 +-
.../archive/url/WaybackURLKeyMakerTest.java | 7 +-
.../org/archive/util/ArchiveUtilsTest.java | 231 ++++---
.../java/org/archive/util/ByteOpTest.java | 14 +-
.../org/archive/util/CrossProductTest.java | 8 +-
.../java/org/archive/util/FileUtilsTest.java | 69 ++-
.../util/InterruptibleCharSequenceTest.java | 21 +-
.../org/archive/util/MimetypeUtilsTest.java | 63 +-
.../org/archive/util/PropertyUtilsTest.java | 11 +-
.../util/StringFieldExtractorTest.java | 10 +-
src/test/java/org/archive/util/TestUtils.java | 17 +-
.../org/archive/util/anvl/ANVLRecordTest.java | 56 +-
.../util/binsearch/SortedTextFileTest.java | 8 +-
.../iterator/CachingStringFilterTest.java | 5 +-
.../iterator/FilterStringIteratorTest.java | 25 +-
.../iterator/SortedCompositeIteratorTest.java | 8 +-
.../util/zip/GZIPMembersInputStreamTest.java | 157 ++---
59 files changed, 1236 insertions(+), 1173 deletions(-)
delete mode 100644 src/main/java/org/archive/util/TmpDirTestCase.java
diff --git a/CHANGES.md b/CHANGES.md
index 8a0a7d20..478238bf 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,10 @@
+Unreleased
+----------
+
+#### Dependency upgrades
+
+- **junit**: 4.13.2 → 5.12.2
+
1.3.0
-----
diff --git a/pom.xml b/pom.xml
index 74a4bbe6..c70a2cd7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -52,9 +52,10 @@
- junit
- junit
- 4.13.2
+ org.junit.jupiter
+ junit-jupiter
+ 5.12.2
+ test
diff --git a/src/main/java/org/archive/util/TmpDirTestCase.java b/src/main/java/org/archive/util/TmpDirTestCase.java
deleted file mode 100644
index 09ec345b..00000000
--- a/src/main/java/org/archive/util/TmpDirTestCase.java
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.archive.util;
-
-import java.io.File;
-import java.io.IOException;
-
-import junit.framework.TestCase;
-
-
-/**
- * Base class for TestCases that want access to a tmp dir for the writing
- * of files.
- *
- * @author stack
- */
-public abstract class TmpDirTestCase extends TestCase
-{
- /**
- * Name of the system property that holds pointer to tmp directory into
- * which we can safely write files.
- */
- public static final String TEST_TMP_SYSTEM_PROPERTY_NAME = "testtmpdir";
-
- /**
- * Default test tmp.
- */
- public static final String DEFAULT_TEST_TMP_DIR = File.separator + "tmp" +
- File.separator + "heritrix-junit-tests";
-
- /**
- * Directory to write temporary files to.
- */
- private File tmpDir = null;
-
-
- public TmpDirTestCase()
- {
- super();
- }
-
- public TmpDirTestCase(String testName)
- {
- super(testName);
- }
-
- /*
- * @see TestCase#setUp()
- */
- protected void setUp() throws Exception {
- super.setUp();
- this.tmpDir = tmpDir();
- }
-
- /**
- * @return Returns the tmpDir.
- */
- public File getTmpDir()
- {
- return this.tmpDir;
- }
-
- /**
- * Delete any files left over from previous run.
- *
- * @param basename Base name of files we're to clean up.
- */
- public void cleanUpOldFiles(String basename) {
- cleanUpOldFiles(getTmpDir(), basename);
- }
-
- /**
- * Delete any files left over from previous run.
- *
- * @param prefix Base name of files we're to clean up.
- * @param basedir Directory to start cleaning in.
- */
- public void cleanUpOldFiles(File basedir, String prefix) {
- File [] files = FileUtils.getFilesWithPrefix(basedir, prefix);
- if (files != null) {
- for (int i = 0; i < files.length; i++) {
- org.apache.commons.io.FileUtils.deleteQuietly(files[i]);
- }
- }
- }
-
-
- public static File tmpDir() throws IOException {
- String tmpDirStr = System.getProperty(TEST_TMP_SYSTEM_PROPERTY_NAME);
- tmpDirStr = (tmpDirStr == null)? DEFAULT_TEST_TMP_DIR: tmpDirStr;
- File tmpDir = new File(tmpDirStr);
- FileUtils.ensureWriteableDirectory(tmpDir);
-
- if (!tmpDir.canWrite())
- {
- throw new IOException(tmpDir.getAbsolutePath() +
- " is unwriteable.");
- }
-
- return tmpDir;
- }
-}
diff --git a/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java b/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java
index 14f8489d..a716df82 100644
--- a/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java
+++ b/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java
@@ -1,28 +1,29 @@
package org.archive.extract;
-import java.net.MalformedURLException;
import java.net.URI;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.net.URLEncoder;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
-public class RealCDXExtractorOutputTest extends TestCase {
+public class RealCDXExtractorOutputTest {
+
+ @Test
public void testEscapeResolvedUrl() throws Exception {
- String context ="http://www.uni-giessen.de/cms/studium/dateien/informationberatung/merkblattpdf";
- String spec = "http://fss.plone.uni-giessen.de/fß/studium/dateien/informationberatung/merkblattpdf/file/Mérkblatt zur Gestaltung von Nachteilsausgleichen.pdf?föo=bar#änchor";
- String escaped = RealCDXExtractorOutput.resolve(context, spec);
- assertTrue(escaped.indexOf(" ") < 0);
- URI parsed = new URI(escaped);
- assertEquals("änchor", parsed.getFragment());
+ String context = "http://www.uni-giessen.de/cms/studium/dateien/informationberatung/merkblattpdf";
+ String spec = "http://fss.plone.uni-giessen.de/fß/studium/dateien/informationberatung/merkblattpdf/file/Mérkblatt zur Gestaltung von Nachteilsausgleichen.pdf?föo=bar#änchor";
+ String escaped = RealCDXExtractorOutput.resolve(context, spec);
+ assertTrue(escaped.indexOf(" ") < 0);
+ URI parsed = new URI(escaped);
+ assertEquals("änchor", parsed.getFragment());
}
+ @Test
public void testNoDoubleEscaping() throws Exception {
- String spec = "https://www.google.com/search?q=java+escape+url+spaces&ie=utf-8&oe=utf-8";
- String resolved = RealCDXExtractorOutput.resolve(spec, spec);
- assertTrue(spec.equals(resolved));
+ String spec = "https://www.google.com/search?q=java+escape+url+spaces&ie=utf-8&oe=utf-8";
+ String resolved = RealCDXExtractorOutput.resolve(spec, spec);
+ assertTrue(spec.equals(resolved));
}
}
diff --git a/src/test/java/org/archive/format/dns/DNSResponseParserTest.java b/src/test/java/org/archive/format/dns/DNSResponseParserTest.java
index 27d0fdad..7ade0ad5 100644
--- a/src/test/java/org/archive/format/dns/DNSResponseParserTest.java
+++ b/src/test/java/org/archive/format/dns/DNSResponseParserTest.java
@@ -3,15 +3,13 @@
import java.io.ByteArrayInputStream;
import java.io.IOException;
-import org.archive.format.dns.DNSParseException;
-import org.archive.format.dns.DNSRecord;
-import org.archive.format.dns.DNSResponse;
-import org.archive.format.dns.DNSResponseParser;
+import org.junit.jupiter.api.Test;
-import junit.framework.TestCase;
+import static org.junit.jupiter.api.Assertions.assertEquals;
-public class DNSResponseParserTest extends TestCase {
+public class DNSResponseParserTest {
DNSResponseParser parser = new DNSResponseParser();
+ @Test
public void testParse() throws DNSParseException, IOException {
verifyResults("20110328212258\nfarm6.static.flickr.a06.yahoodns.net.\t300\tIN\tA\t98.136.170.121\n",
"20110328212258",new String[][] {{"farm6.static.flickr.a06.yahoodns.net.","300","IN","A","98.136.170.121"}});
diff --git a/src/test/java/org/archive/format/gzip/GZIPMemberSeriesTest.java b/src/test/java/org/archive/format/gzip/GZIPMemberSeriesTest.java
index 2eec46ec..6f218ebb 100644
--- a/src/test/java/org/archive/format/gzip/GZIPMemberSeriesTest.java
+++ b/src/test/java/org/archive/format/gzip/GZIPMemberSeriesTest.java
@@ -9,9 +9,6 @@
import org.archive.util.ByteOp;
import org.archive.util.IAUtils;
import org.archive.util.TestUtils;
-import org.archive.format.gzip.GZIPFormatException;
-import org.archive.format.gzip.GZIPMemberSeries;
-import org.archive.format.gzip.GZIPSeriesMember;
import org.archive.streamcontext.ByteArrayWrappedStream;
import org.archive.streamcontext.SimpleStream;
import org.archive.streamcontext.Stream;
@@ -19,10 +16,13 @@
import com.google.common.io.ByteStreams;
import com.google.common.primitives.Bytes;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class GZIPMemberSeriesTest extends TestCase {
+import static org.junit.jupiter.api.Assertions.*;
+public class GZIPMemberSeriesTest {
+
+ @Test
public void testSingle() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
@@ -38,6 +38,7 @@ public void testSingle() throws IndexOutOfBoundsException, FileNotFoundException
assertNull(s.getNextMember());
}
+ @Test
public void testSingleEmpty() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("empty.gz");
@@ -59,6 +60,7 @@ public void testSingleEmpty() throws IndexOutOfBoundsException, FileNotFoundExce
assertTrue(s.gotEOF());
}
+ @Test
public void testDouble() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
@@ -81,14 +83,14 @@ public void testDouble() throws IndexOutOfBoundsException, FileNotFoundException
assertNull(s.getNextMember());
}
-
+ @Test
public void testSingleCRCStrict() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
byte abcd[] = ByteStreams.toByteArray(is);
byte oldb = abcd[abcd.length-1];
abcd[abcd.length-1] = (byte) (abcd[abcd.length-1] + 1);
- assertFalse(oldb == abcd[abcd.length-1]);
+ assertNotEquals(oldb, abcd[abcd.length - 1]);
ByteArrayInputStream bais = new ByteArrayInputStream(abcd);
Stream stream = new SimpleStream(bais);
@@ -117,14 +119,15 @@ public void testSingleCRCStrict() throws IndexOutOfBoundsException, FileNotFound
}
assertNotNull(e);
}
-
+
+ @Test
public void testSingleCRCLAX() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
byte abcd[] = ByteStreams.toByteArray(is);
byte oldb = abcd[abcd.length-1];
abcd[abcd.length-1] = (byte) (abcd[abcd.length-1] + 1);
- assertFalse(oldb == abcd[abcd.length-1]);
+ assertNotEquals(oldb, abcd[abcd.length - 1]);
ByteArrayInputStream bais = new ByteArrayInputStream(abcd);
Stream stream = new SimpleStream(bais);
@@ -154,7 +157,8 @@ public void testSingleCRCLAX() throws IndexOutOfBoundsException, FileNotFoundExc
assertNull(e);
assertNull(s.getNextMember());
}
-
+
+ @Test
public void testDoubleCRC1LAX() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
@@ -162,7 +166,7 @@ public void testDoubleCRC1LAX() throws IndexOutOfBoundsException, FileNotFoundEx
byte abcdorig[] = ByteOp.copy(abcd);
byte oldb = abcd[abcd.length-1];
abcd[abcd.length-1] = (byte) (abcd[abcd.length-1] + 1);
- assertFalse(oldb == abcd[abcd.length-1]);
+ assertNotEquals(oldb, abcd[abcd.length - 1]);
byte both[] = Bytes.concat(abcd,abcdorig);
@@ -195,7 +199,8 @@ public void testDoubleCRC1LAX() throws IndexOutOfBoundsException, FileNotFoundEx
assertNotNull(m);
TestUtils.assertStreamEquals(m,"abcd".getBytes(IAUtils.UTF8));
}
-
+
+ @Test
public void testSingleDeflateError() throws IndexOutOfBoundsException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
@@ -240,7 +245,7 @@ public void testSingleDeflateError() throws IndexOutOfBoundsException, IOExcepti
assertNull(m);
}
-
+ @Test
public void testDoubleDeflateError() throws IndexOutOfBoundsException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
@@ -290,7 +295,8 @@ public void testDoubleDeflateError() throws IndexOutOfBoundsException, IOExcepti
assertFalse(s.gotIOError());
}
-
+
+ @Test
public void testDoubleBiggerDeflateErrOnFirst() throws IOException {
String resource = "double-single-inflate-error.gz";
InputStream is = getClass().getResourceAsStream(resource);
@@ -333,7 +339,8 @@ public void testDoubleBiggerDeflateErrOnFirst() throws IOException {
}
-
+
+ @Test
public void testAutoSkip() throws IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
byte abcd[] = ByteStreams.toByteArray(is);
@@ -375,6 +382,7 @@ public void testAutoSkip() throws IOException {
assertTrue(s.gotEOF());
}
+ @Test
public void testWgetProblem() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("IAH-urls-wget.warc.gz");
new GZIPDecoder().parseHeader(is);
diff --git a/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java b/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java
index 483d2baf..45bc18e4 100644
--- a/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java
+++ b/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java
@@ -7,10 +7,11 @@
import org.archive.util.IAUtils;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class GZIPMemberWriterTest extends TestCase {
+public class GZIPMemberWriterTest {
+ @Test
public void testWrite() throws IOException {
File outFile = File.createTempFile("tmp", ".gz");
GZIPMemberWriter gzw = new GZIPMemberWriter(new FileOutputStream(outFile));
diff --git a/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java b/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java
index cfadbd79..25a5eaa7 100644
--- a/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java
+++ b/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java
@@ -10,19 +10,21 @@
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
-import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import org.archive.format.gzip.GZIPMemberSeries;
import org.archive.format.gzip.GZIPSeriesMember;
import org.archive.streamcontext.SimpleStream;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class ZipNumWriterTest extends TestCase {
+import static org.junit.jupiter.api.Assertions.assertEquals;
+public class ZipNumWriterTest {
+
+ @Test
public void testAddRecord() throws IOException {
- Charset UTF8 = Charset.forName("UTF-8");
- File main = File.createTempFile("test-znw",".main");
+ File main = File.createTempFile("test-znw",".main");
File summ = File.createTempFile("test-znw",".summ");
main.deleteOnExit();
summ.deleteOnExit();
@@ -31,11 +33,11 @@ public void testAddRecord() throws IOException {
ZipNumWriter znw = new ZipNumWriter(new FileOutputStream(main,false),
new FileOutputStream(summ,false), limit);
for(int i = 0; i < 1000; i++) {
- znw.addRecord(String.format("%06d\n",i).getBytes(UTF8));
+ znw.addRecord(String.format("%06d\n",i).getBytes(StandardCharsets.UTF_8));
}
znw.close();
InputStreamReader isr =
- new InputStreamReader(new FileInputStream(summ),UTF8);
+ new InputStreamReader(new FileInputStream(summ), StandardCharsets.UTF_8);
BufferedReader br = new BufferedReader(isr);
String line = null;
int count = 0;
diff --git a/src/test/java/org/archive/format/http/HttpRequestMessageParserTest.java b/src/test/java/org/archive/format/http/HttpRequestMessageParserTest.java
index 50df9dde..9a5d69af 100644
--- a/src/test/java/org/archive/format/http/HttpRequestMessageParserTest.java
+++ b/src/test/java/org/archive/format/http/HttpRequestMessageParserTest.java
@@ -3,16 +3,16 @@
import java.io.ByteArrayInputStream;
import java.io.IOException;
-import org.archive.format.http.HttpConstants;
-import org.archive.format.http.HttpParseException;
-import org.archive.format.http.HttpRequestMessage;
-import org.archive.format.http.HttpRequestMessageParser;
import org.archive.util.IAUtils;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class HttpRequestMessageParserTest extends TestCase implements HttpConstants {
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class HttpRequestMessageParserTest implements HttpConstants {
HttpRequestMessageParser parser = new HttpRequestMessageParser();
+
+ @Test
public void testParse() throws IOException {
assertParse("GET / HTTP/1.0\r\n", METHOD_GET, "/", VERSION_0);
assertParse("GET / HTTP/1.1\r\n", METHOD_GET, "/", VERSION_1);
diff --git a/src/test/java/org/archive/format/http/HttpResponseParserTest.java b/src/test/java/org/archive/format/http/HttpResponseParserTest.java
index ea076a69..631d67c7 100644
--- a/src/test/java/org/archive/format/http/HttpResponseParserTest.java
+++ b/src/test/java/org/archive/format/http/HttpResponseParserTest.java
@@ -5,16 +5,14 @@
import org.archive.util.IAUtils;
import org.archive.util.TestUtils;
-import org.archive.format.http.HttpHeader;
-import org.archive.format.http.HttpHeaders;
-import org.archive.format.http.HttpParseException;
-import org.archive.format.http.HttpResponse;
-import org.archive.format.http.HttpResponseParser;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class HttpResponseParserTest extends TestCase {
+import static org.junit.jupiter.api.Assertions.*;
+public class HttpResponseParserTest {
+
+ @Test
public void testParse() throws IOException {
HttpResponseParser parser = new HttpResponseParser();
@@ -38,6 +36,7 @@ public void testParse() throws IOException {
}
+ @Test
public void testParseWithLf() throws IOException {
HttpResponseParser parser = new HttpResponseParser();
@@ -57,6 +56,7 @@ public void testParseWithLf() throws IOException {
}
+ @Test
public void testParseEmptyHeaderField() throws IOException {
HttpResponseParser parser = new HttpResponseParser();
diff --git a/src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java b/src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java
index 57c21965..ef8c2fa0 100644
--- a/src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java
+++ b/src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java
@@ -6,11 +6,12 @@
import org.json.JSONException;
import org.json.JSONObject;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class CompoundORJSONPathSpecTest extends TestCase {
+public class CompoundORJSONPathSpecTest {
String json1S = "{\"a\":\"A\"}";
String json2S = "{\"b\":\"B\"}";
+ @Test
public void testExtract() throws JSONException {
JSONObject json1 = new JSONObject(json1S);
JSONObject json2 = new JSONObject(json2S);
diff --git a/src/test/java/org/archive/format/json/JSONPathSpecFactoryTest.java b/src/test/java/org/archive/format/json/JSONPathSpecFactoryTest.java
index ab999dca..257cb112 100644
--- a/src/test/java/org/archive/format/json/JSONPathSpecFactoryTest.java
+++ b/src/test/java/org/archive/format/json/JSONPathSpecFactoryTest.java
@@ -4,9 +4,9 @@
import org.json.JSONException;
import org.json.JSONObject;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class JSONPathSpecFactoryTest extends TestCase {
+public class JSONPathSpecFactoryTest {
String json1S = "{\"a\":\"A\"}";
String json2S = "{\"b\":\"B\"}";
@@ -14,6 +14,7 @@ public class JSONPathSpecFactoryTest extends TestCase {
String json4S = "{\"b\":[{\"x\":\"x1\", \"y\":\"y1\"},{\"x\":\"x2\", \"y\":\"y2\"}]}";
+ @Test
public void testGet() throws JSONException {
JSONObject json1 = new JSONObject(json1S);
JSONObject json2 = new JSONObject(json2S);
diff --git a/src/test/java/org/archive/format/json/JSONViewTest.java b/src/test/java/org/archive/format/json/JSONViewTest.java
index 20bd4fe6..aabbe7df 100644
--- a/src/test/java/org/archive/format/json/JSONViewTest.java
+++ b/src/test/java/org/archive/format/json/JSONViewTest.java
@@ -4,14 +4,15 @@
import org.json.JSONException;
import org.json.JSONObject;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class JSONViewTest extends TestCase {
+public class JSONViewTest {
public int getInt(byte b[]) {
return b[0] & 0xff;
}
-
+
+ @Test
public void testBytes() throws JSONException {
JSONObject o = new JSONObject();
o.append("name1", "val\\rue1");
@@ -28,6 +29,8 @@ public void testBytes() throws JSONException {
System.out.format("I(%d) gi(%d)\n",i,gi);
}
}
+
+ @Test
public void testApply() throws JSONException {
String json1S = "{\"url\":\"a\",\"link\":[{\"zz\":\"1\",\"qq\":\"qa\"},{\"zz2\":\"2\",\"qq\":\"qb\"},{\"zz\":\"3\",\"qq\":\"qc\"},{\"zz\":\"4\"}]}";
JSONObject json1 = new JSONObject(json1S);
diff --git a/src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java b/src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java
index a703b49a..640a5a80 100644
--- a/src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java
+++ b/src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java
@@ -4,15 +4,16 @@
import org.json.JSONException;
import org.json.JSONObject;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class SimpleJSONPathSpecTest extends TestCase {
+public class SimpleJSONPathSpecTest {
String json1 = "{\"a\": { \"b\": \"Foo\" }}";
String json2 = "{\"a\": { \"b\": [{\"a\":\"1\"},{\"a\":\"2\"}] }}";
String json3 = "{\"a\": { \"b\": {\"A\":\"11\",\"B\":\"22\"} }}";
String json4 = "{\"a\": { \"b\": [{\"A\":\"11\",\"B\":\"22\"},{\"A\":\"33\",\"B\":\"44\"}] }}";
+ @Test
public void testExtract() throws JSONException {
JSONObject json = new JSONObject(json1);
JSONPathSpec spec = new SimpleJSONPathSpec("a.b");
diff --git a/src/test/java/org/archive/format/text/html/CDATALexerTest.java b/src/test/java/org/archive/format/text/html/CDATALexerTest.java
index 481a3eda..856576ba 100644
--- a/src/test/java/org/archive/format/text/html/CDATALexerTest.java
+++ b/src/test/java/org/archive/format/text/html/CDATALexerTest.java
@@ -1,17 +1,16 @@
package org.archive.format.text.html;
-import org.archive.format.text.html.CDATALexer;
-import org.archive.format.text.html.NodeUtils;
import org.htmlparser.Node;
import org.htmlparser.lexer.Page;
-//import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.ParserException;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class CDATALexerTest extends TestCase {
+import static org.junit.jupiter.api.Assertions.*;
+
+public class CDATALexerTest {
CDATALexer l;
Node n;
private CDATALexer makeLexer(String html) {
@@ -19,7 +18,8 @@ private CDATALexer makeLexer(String html) {
t.setPage(new Page(html));
return t;
}
-
+
+ @Test
public void testNextNode() throws ParserException {
l = makeLexer("blem");
n = l.nextNode();
@@ -35,6 +35,7 @@ public void testNextNode() throws ParserException {
assertNull(l.nextNode());
}
+ @Test
public void testInJS() throws ParserException {
l = makeLexer("");
assertFalse(l.inCSS());
@@ -54,6 +55,7 @@ public void testInJS() throws ParserException {
assertTrue(NodeUtils.isCloseTagNodeNamed(n, "SCRIPT"));
}
+ @Test
public void testInCSS() throws ParserException {
l = makeLexer("");
assertFalse(l.inCSS());
diff --git a/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java b/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java
index 2313868c..f7ad75d2 100644
--- a/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java
+++ b/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java
@@ -21,29 +21,34 @@
import java.io.File;
import java.io.IOException;
-import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import org.apache.commons.lang.StringUtils;
-import org.archive.io.ArchiveRecord;
import org.archive.io.arc.ARCWriterTest;
-import org.archive.util.TmpDirTestCase;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class ArchiveReaderFactoryTest {
+ @TempDir
+ File tempDir;
-public class ArchiveReaderFactoryTest extends TmpDirTestCase {
/**
* Test local file as URL
* @throws IOException
*/
+ @Test
public void testGetFileURL() throws IOException {
- File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
+ File arc = ARCWriterTest.createARCFile(tempDir, true);
ArchiveReader reader = null;
try {
reader = ArchiveReaderFactory.
get(new URL("file:////" + arc.getAbsolutePath()));
for (Iterator i = reader.iterator(); i.hasNext();) {
ArchiveRecord r = (ArchiveRecord)i.next();
- assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype()));
+ assertTrue(StringUtils.isNotBlank(r.getHeader().getMimetype()),"mime unread");
}
} finally {
if (reader != null) {
@@ -56,14 +61,15 @@ public void testGetFileURL() throws IOException {
* Test local file as File
* @throws IOException
*/
+ @Test
public void testGetFile() throws IOException {
- File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
+ File arc = ARCWriterTest.createARCFile(tempDir, true);
ArchiveReader reader = null;
try {
reader = ArchiveReaderFactory.get(arc.getAbsoluteFile());
for (Iterator i = reader.iterator(); i.hasNext();) {
ArchiveRecord r = (ArchiveRecord)i.next();
- assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype()));
+ assertTrue(StringUtils.isNotBlank(r.getHeader().getMimetype()),"mime unread");
}
} finally {
if (reader != null) {
@@ -76,14 +82,15 @@ public void testGetFile() throws IOException {
* Test local file as String path
* @throws IOException
*/
+ @Test
public void testGetPath() throws IOException {
- File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
+ File arc = ARCWriterTest.createARCFile(tempDir, true);
ArchiveReader reader = null;
try {
reader = ArchiveReaderFactory.get(arc.getAbsoluteFile().getAbsolutePath());
for (Iterator i = reader.iterator(); i.hasNext();) {
ArchiveRecord r = (ArchiveRecord)i.next();
- assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype()));
+ assertTrue(StringUtils.isNotBlank(r.getHeader().getMimetype()),"mime unread");
}
} finally {
if (reader != null) {
diff --git a/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java b/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java
index 270e45e0..f7e8e0b2 100644
--- a/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java
+++ b/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java
@@ -18,9 +18,11 @@
*/
package org.archive.io;
+import org.junit.jupiter.api.Test;
+
import java.util.Random;
-import junit.framework.TestCase;
+import static org.junit.jupiter.api.Assertions.assertEquals;
/**
@@ -29,11 +31,12 @@
*
* @author pjack
*/
-public class BufferedSeekInputStreamTest extends TestCase {
+public class BufferedSeekInputStreamTest {
private static byte[] TEST_DATA = makeTestData();
-
+
+ @Test
public void testPosition() throws Exception {
Random random = new Random();
ArraySeekInputStream asis = new ArraySeekInputStream(TEST_DATA);
diff --git a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
index 9f7e2a15..7988cb2b 100644
--- a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
+++ b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
@@ -26,13 +26,15 @@
import java.util.Map;
import java.util.Set;
-import junit.framework.TestCase;
-
import org.apache.commons.httpclient.Header;
import org.archive.io.arc.ARCRecord;
import org.archive.io.warc.WARCRecord;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
-public class HeaderedArchiveRecordTest extends TestCase {
+public class HeaderedArchiveRecordTest {
private static final String HTTPHEADER = "HTTP/1.1 200 OK\r\n"
+ "Last-Modified: Sun, 28 Aug 2005 14:10:55 GMT\r\n"
+ "Content-Length: 108\r\n" + "Connection: close\r\n"
@@ -41,6 +43,7 @@ public class HeaderedArchiveRecordTest extends TestCase {
+ " Neue Seite 1\r\n" + " \r\n"
+ " \r\n" + " \r\n" + "