From f7cd67d9f4ac252385517770fce63dea767f9203 Mon Sep 17 00:00:00 2001 From: RogerMathisen Date: Tue, 23 Sep 2014 13:06:44 +0200 Subject: [PATCH 001/240] - Replaced direct references to "/tmp" with generic temporary directory reference using File.createTempFile(). Fixes bug reported in iipc/webarchive-commons Issue #2. --- .../archive/format/gzip/GZIPMemberWriterTest.java | 4 ++-- .../util/binsearch/SortedTextFileTest.java | 2 +- .../iterator/SortedCompositeIteratorTest.java | 15 ++++++--------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java b/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java index 5cd75ccf..483d2baf 100644 --- a/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java +++ b/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java @@ -12,8 +12,8 @@ public class GZIPMemberWriterTest extends TestCase { public void testWrite() throws IOException { - String outPath = "/tmp/tmp.gz"; - GZIPMemberWriter gzw = new GZIPMemberWriter(new FileOutputStream(new File(outPath))); + File outFile = File.createTempFile("tmp", ".gz"); + GZIPMemberWriter gzw = new GZIPMemberWriter(new FileOutputStream(outFile)); gzw.write(new ByteArrayInputStream("Here is record 1".getBytes(IAUtils.UTF8))); gzw.write(new ByteArrayInputStream("Here is record 2".getBytes(IAUtils.UTF8))); } diff --git a/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java b/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java index 2c9d19e8..8f812b75 100644 --- a/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java +++ b/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java @@ -25,7 +25,7 @@ private void createFile(File target, int max) throws FileNotFoundException { public void testGetRecordIteratorStringBoolean() throws IOException { - File test = new File("/tmp/test.tmp"); + File test = File.createTempFile("test", null); int max = 1000000; 
createFile(test,max); RandomAccessFileSeekableLineReaderFactory factory = diff --git a/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java b/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java index f1c2a0ec..0f4dc68a 100644 --- a/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java +++ b/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java @@ -4,6 +4,7 @@ import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; +import java.io.IOException; import java.io.PrintWriter; import java.util.Comparator; @@ -11,21 +12,16 @@ public class SortedCompositeIteratorTest extends TestCase { - public void testHasNext() throws FileNotFoundException { + public void testHasNext() throws FileNotFoundException, IOException { long t = 210000; long c = 134; float f = (float)c / (float)t; System.err.format("F(%f)\n",f); - File a = new File("/tmp/a"); - File b = new File("/tmp/b"); - if(a.isFile()) { - a.delete(); - } - if(b.isFile()) { - b.delete(); - } + File a = File.createTempFile("filea", null); + File b = File.createTempFile("fileb", null); + PrintWriter apw = new PrintWriter(a); PrintWriter bpw = new PrintWriter(b); apw.println("1"); @@ -38,6 +34,7 @@ public void testHasNext() throws FileNotFoundException { BufferedReader bbr = new BufferedReader(new FileReader(b)); SortedCompositeIterator sci = new SortedCompositeIterator(new Comparator() { + @Override public int compare(String o1, String o2) { return o1.compareTo(o2); } From 077abb783d77b8a556112a6617911d0ee7006595 Mon Sep 17 00:00:00 2001 From: thomase Date: Tue, 23 Sep 2014 14:48:48 +0200 Subject: [PATCH 002/240] * changed newline to System.lineSeparator --- .../org/archive/net/PublicSuffixesTest.java | 386 +++++++++--------- 1 file changed, 193 insertions(+), 193 deletions(-) diff --git a/src/test/java/org/archive/net/PublicSuffixesTest.java b/src/test/java/org/archive/net/PublicSuffixesTest.java index 
b88acb6d..a82bab22 100644 --- a/src/test/java/org/archive/net/PublicSuffixesTest.java +++ b/src/test/java/org/archive/net/PublicSuffixesTest.java @@ -1,193 +1,193 @@ -/* - * This file is part of the Heritrix web crawler (crawler.archive.org). - * - * Licensed to the Internet Archive (IA) by one or more individual - * contributors. - * - * The IA licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.archive.net; - -import java.io.PrintWriter; -import java.io.StringWriter; -import java.util.ArrayList; -import java.util.regex.Matcher; - -import junit.framework.TestCase; - -import org.archive.net.PublicSuffixes.Node; - -/** - * Test cases for PublicSuffixes utility. Confirm expected matches/nonmatches - * from constructed regex. 
- * - * @author gojomo - */ -public class PublicSuffixesTest extends TestCase { - // test of low level implementation - - public void testCompare() { - Node n = new Node("hoge"); - assertTrue(n.compareTo('a') > 0); - assertEquals(-1, n.compareTo('*')); - assertEquals(-1, n.compareTo('!')); - assertEquals(-1, n.compareTo(new Node("*,"))); - assertEquals(-1, n.compareTo(new Node("!muga,"))); - assertEquals(-1, n.compareTo(new Node(""))); - - n = new Node("*,"); - assertEquals(1, n.compareTo('a')); - assertEquals(0, n.compareTo('*')); - assertEquals(1, n.compareTo('!')); - assertEquals(0, n.compareTo(new Node("*,"))); - assertEquals(1, n.compareTo(new Node("!muga,"))); - assertEquals(-1, n.compareTo(new Node(""))); - - n = new Node("!hoge"); - assertEquals(1, n.compareTo('a')); - assertEquals(-1, n.compareTo('*')); - assertEquals(0, n.compareTo('!')); - assertEquals(-1, n.compareTo(new Node("*,"))); - assertEquals(0, n.compareTo(new Node("!muga,"))); - assertEquals(-1, n.compareTo(new Node(""))); - - n = new Node(""); - assertEquals(1, n.compareTo('a')); - assertEquals(1, n.compareTo('*')); - assertEquals(1, n.compareTo('!')); - assertEquals(0, n.compareTo(new Node(""))); - } - - protected String dump(Node alt) { - StringWriter w = new StringWriter(); - PublicSuffixes.dump(alt, 0, new PrintWriter(w)); - return w.toString(); - } - public void testTrie1() { - Node alt = new Node(null, new ArrayList()); - alt.addBranch("ac,"); - // specifically, should not have empty string as match. 
- assertEquals("(null)\n" + - " \"ac,\"\n", dump(alt)); - alt.addBranch("ac,com,"); - assertEquals("(null)\n" + - " \"ac,\"\n" + - " \"com,\"\n" + - " \"\"\n", dump(alt)); - alt.addBranch("ac,edu,"); - assertEquals("(null)\n" + - " \"ac,\"\n" + - " \"com,\"\n" + - " \"edu,\"\n" + - " \"\"\n", dump(alt)); - } - public void testTrie2() { - Node alt = new Node(null, new ArrayList()); - alt.addBranch("ac,"); - alt.addBranch("*,"); - assertEquals("(null)\n" + - " \"ac,\"\n" + - " \"*,\"\n", dump(alt)); - } - - public void testTrie3() { - Node alt = new Node(null, new ArrayList()); - alt.addBranch("ac,"); - alt.addBranch("ac,!hoge,"); - alt.addBranch("ac,*,"); - // exception goes first. - assertEquals("(null)\n" + - " \"ac,\"\n" + - " \"!hoge,\"\n" + - " \"*,\"\n" + - " \"\"\n", dump(alt)); - } - - // test of higher-level functionality - - Matcher m = PublicSuffixes.getTopmostAssignedSurtPrefixPattern() - .matcher(""); - - public void testBasics() { - matchPrefix("com,example,www,", "com,example,"); - matchPrefix("com,example,", "com,example,"); - matchPrefix("org,archive,www,", "org,archive,"); - matchPrefix("org,archive,", "org,archive,"); - matchPrefix("fr,yahoo,www,", "fr,yahoo,"); - matchPrefix("fr,yahoo,", "fr,yahoo,"); - matchPrefix("au,com,foobar,www,", "au,com,foobar,"); - matchPrefix("au,com,foobar,", "au,com,foobar,"); - matchPrefix("uk,co,virgin,www,", "uk,co,virgin,"); - matchPrefix("uk,co,virgin,", "uk,co,virgin,"); - matchPrefix("au,com,example,www,", "au,com,example,"); - matchPrefix("au,com,example,", "au,com,example,"); - matchPrefix("jp,yokohama,public,assigned,www,", - "jp,yokohama,public,assigned,"); - matchPrefix("jp,yokohama,public,assigned,", "jp,yokohama,public,assigned,"); - } - - public void testDomainWithDash() { - matchPrefix("de,bad-site,www", "de,bad-site,"); - } - - public void testDomainWithNumbers() { - matchPrefix("de,archive4u,www", "de,archive4u,"); - } - - public void testIPV4() { - assertEquals("unexpected reduction", - "1.2.3.4", - 
PublicSuffixes.reduceSurtToAssignmentLevel("1.2.3.4")); - } - - public void testIPV6() { - assertEquals("unexpected reduction", - "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]", - PublicSuffixes.reduceSurtToAssignmentLevel( - "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]")); - } - - public void testExceptions() { - matchPrefix("uk,bl,www,", "uk,bl,"); - matchPrefix("uk,bl,", "uk,bl,"); - matchPrefix("jp,tokyo,city,subdomain,", "jp,tokyo,city,"); - matchPrefix("jp,tokyo,city,", "jp,tokyo,city,"); - } - - public void testFakeTLD() { - // we assume any new/unknonwn TLD should be assumed as 2-level; - // this is preferable for our grouping purpose but might not be - // for a cookie-assigning browser (original purpose of publicsuffixlist) - matchPrefix("zzz,example,www,", "zzz,example,"); - } - - public void testUnsegmentedHostname() { - m.reset("example"); - assertFalse("unexpected match found in 'example'", m.find()); - } - - public void testTopmostAssignedCaching() { - assertSame("topmostAssignedSurtPrefixPattern not cached",PublicSuffixes.getTopmostAssignedSurtPrefixPattern(),PublicSuffixes.getTopmostAssignedSurtPrefixPattern()); - assertSame("topmostAssignedSurtPrefixRegex not cached",PublicSuffixes.getTopmostAssignedSurtPrefixRegex(),PublicSuffixes.getTopmostAssignedSurtPrefixRegex()); - } - - // TODO: test UTF domains? - - protected void matchPrefix(String surtDomain, String expectedAssignedPrefix) { - m.reset(surtDomain); - assertTrue("expected match not found in '" + surtDomain, m.find()); - assertEquals("expected match not found", expectedAssignedPrefix, m - .group()); - } -} +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.net; + +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.regex.Matcher; + +import junit.framework.TestCase; + +import org.archive.net.PublicSuffixes.Node; + +/** + * Test cases for PublicSuffixes utility. Confirm expected matches/nonmatches + * from constructed regex. + * + * @author gojomo + */ +public class PublicSuffixesTest extends TestCase { + // test of low level implementation + private final String NL = System.lineSeparator(); + + public void testCompare() { + Node n = new Node("hoge"); + assertTrue(n.compareTo('a') > 0); + assertEquals(-1, n.compareTo('*')); + assertEquals(-1, n.compareTo('!')); + assertEquals(-1, n.compareTo(new Node("*,"))); + assertEquals(-1, n.compareTo(new Node("!muga,"))); + assertEquals(-1, n.compareTo(new Node(""))); + + n = new Node("*,"); + assertEquals(1, n.compareTo('a')); + assertEquals(0, n.compareTo('*')); + assertEquals(1, n.compareTo('!')); + assertEquals(0, n.compareTo(new Node("*,"))); + assertEquals(1, n.compareTo(new Node("!muga,"))); + assertEquals(-1, n.compareTo(new Node(""))); + + n = new Node("!hoge"); + assertEquals(1, n.compareTo('a')); + assertEquals(-1, n.compareTo('*')); + assertEquals(0, n.compareTo('!')); + assertEquals(-1, n.compareTo(new Node("*,"))); + assertEquals(0, n.compareTo(new Node("!muga,"))); + assertEquals(-1, n.compareTo(new Node(""))); + + n = new Node(""); + assertEquals(1, n.compareTo('a')); + assertEquals(1, n.compareTo('*')); + assertEquals(1, n.compareTo('!')); + assertEquals(0, 
n.compareTo(new Node(""))); + } + + protected String dump(Node alt) { + StringWriter w = new StringWriter(); + PublicSuffixes.dump(alt, 0, new PrintWriter(w)); + return w.toString(); + } + public void testTrie1() { + Node alt = new Node(null, new ArrayList()); + alt.addBranch("ac,"); + // specifically, should not have empty string as match. + assertEquals("(null)" + NL + " \"ac,\"" + NL, dump(alt)); + alt.addBranch("ac,com,"); + assertEquals("(null)" + NL + + " \"ac,\"" + NL + + " \"com,\"" + NL + + " \"\"" + NL, dump(alt)); + alt.addBranch("ac,edu,"); + assertEquals("(null)" + NL + + " \"ac,\"" + NL + + " \"com,\"" + NL + + " \"edu,\"" + NL + + " \"\"" + NL, dump(alt)); + } + public void testTrie2() { + Node alt = new Node(null, new ArrayList()); + alt.addBranch("ac,"); + alt.addBranch("*,"); + assertEquals("(null)" + NL + + " \"ac,\"" + NL + + " \"*,\"" + NL, dump(alt)); + } + + public void testTrie3() { + Node alt = new Node(null, new ArrayList()); + alt.addBranch("ac,"); + alt.addBranch("ac,!hoge,"); + alt.addBranch("ac,*,"); + // exception goes first. 
+ assertEquals("(null)" + NL + + " \"ac,\"" + NL + + " \"!hoge,\"" + NL + + " \"*,\"" + NL + + " \"\"" + NL, dump(alt)); + } + + // test of higher-level functionality + + Matcher m = PublicSuffixes.getTopmostAssignedSurtPrefixPattern() + .matcher(""); + + public void testBasics() { + matchPrefix("com,example,www,", "com,example,"); + matchPrefix("com,example,", "com,example,"); + matchPrefix("org,archive,www,", "org,archive,"); + matchPrefix("org,archive,", "org,archive,"); + matchPrefix("fr,yahoo,www,", "fr,yahoo,"); + matchPrefix("fr,yahoo,", "fr,yahoo,"); + matchPrefix("au,com,foobar,www,", "au,com,foobar,"); + matchPrefix("au,com,foobar,", "au,com,foobar,"); + matchPrefix("uk,co,virgin,www,", "uk,co,virgin,"); + matchPrefix("uk,co,virgin,", "uk,co,virgin,"); + matchPrefix("au,com,example,www,", "au,com,example,"); + matchPrefix("au,com,example,", "au,com,example,"); + matchPrefix("jp,yokohama,public,assigned,www,", + "jp,yokohama,public,assigned,"); + matchPrefix("jp,yokohama,public,assigned,", "jp,yokohama,public,assigned,"); + } + + public void testDomainWithDash() { + matchPrefix("de,bad-site,www", "de,bad-site,"); + } + + public void testDomainWithNumbers() { + matchPrefix("de,archive4u,www", "de,archive4u,"); + } + + public void testIPV4() { + assertEquals("unexpected reduction", + "1.2.3.4", + PublicSuffixes.reduceSurtToAssignmentLevel("1.2.3.4")); + } + + public void testIPV6() { + assertEquals("unexpected reduction", + "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]", + PublicSuffixes.reduceSurtToAssignmentLevel( + "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]")); + } + + public void testExceptions() { + matchPrefix("uk,bl,www,", "uk,bl,"); + matchPrefix("uk,bl,", "uk,bl,"); + matchPrefix("jp,tokyo,city,subdomain,", "jp,tokyo,city,"); + matchPrefix("jp,tokyo,city,", "jp,tokyo,city,"); + } + + public void testFakeTLD() { + // we assume any new/unknonwn TLD should be assumed as 2-level; + // this is preferable for our grouping purpose but might not be + // for a 
cookie-assigning browser (original purpose of publicsuffixlist) + matchPrefix("zzz,example,www,", "zzz,example,"); + } + + public void testUnsegmentedHostname() { + m.reset("example"); + assertFalse("unexpected match found in 'example'", m.find()); + } + + public void testTopmostAssignedCaching() { + assertSame("topmostAssignedSurtPrefixPattern not cached",PublicSuffixes.getTopmostAssignedSurtPrefixPattern(),PublicSuffixes.getTopmostAssignedSurtPrefixPattern()); + assertSame("topmostAssignedSurtPrefixRegex not cached",PublicSuffixes.getTopmostAssignedSurtPrefixRegex(),PublicSuffixes.getTopmostAssignedSurtPrefixRegex()); + } + + // TODO: test UTF domains? + + protected void matchPrefix(String surtDomain, String expectedAssignedPrefix) { + m.reset(surtDomain); + assertTrue("expected match not found in '" + surtDomain, m.find()); + assertEquals("expected match not found", expectedAssignedPrefix, m + .group()); + } +} From 5054060e27da6fef0816efc8b90af06e4e998d9a Mon Sep 17 00:00:00 2001 From: RogerMathisen Date: Wed, 24 Sep 2014 10:04:32 +0200 Subject: [PATCH 003/240] Updated release notes. 
--- CHANGES.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 65d24814..db09a463 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,7 @@ +1.1.5 +----- +* [Removed direct reference to Unix TMP-path](https://github.com/iipc/webarchive-commons/issues/2) + 1.1.4 ----- * [All dates should be independent of locale settings](https://github.com/iipc/webarchive-commons/pull/22) From f3e12da0bb53cb4ffb0d21b2d13cda1b6918b1d1 Mon Sep 17 00:00:00 2001 From: Thomas Edvardsen Date: Wed, 24 Sep 2014 10:26:38 +0200 Subject: [PATCH 004/240] * changed newline from 0d0a to 0a in sourcfile --- .../org/archive/net/PublicSuffixesTest.java | 386 +++++++++--------- 1 file changed, 193 insertions(+), 193 deletions(-) diff --git a/src/test/java/org/archive/net/PublicSuffixesTest.java b/src/test/java/org/archive/net/PublicSuffixesTest.java index a82bab22..7528bbe1 100644 --- a/src/test/java/org/archive/net/PublicSuffixesTest.java +++ b/src/test/java/org/archive/net/PublicSuffixesTest.java @@ -1,193 +1,193 @@ -/* - * This file is part of the Heritrix web crawler (crawler.archive.org). - * - * Licensed to the Internet Archive (IA) by one or more individual - * contributors. - * - * The IA licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.archive.net; - -import java.io.PrintWriter; -import java.io.StringWriter; -import java.util.ArrayList; -import java.util.regex.Matcher; - -import junit.framework.TestCase; - -import org.archive.net.PublicSuffixes.Node; - -/** - * Test cases for PublicSuffixes utility. Confirm expected matches/nonmatches - * from constructed regex. - * - * @author gojomo - */ -public class PublicSuffixesTest extends TestCase { - // test of low level implementation - private final String NL = System.lineSeparator(); - - public void testCompare() { - Node n = new Node("hoge"); - assertTrue(n.compareTo('a') > 0); - assertEquals(-1, n.compareTo('*')); - assertEquals(-1, n.compareTo('!')); - assertEquals(-1, n.compareTo(new Node("*,"))); - assertEquals(-1, n.compareTo(new Node("!muga,"))); - assertEquals(-1, n.compareTo(new Node(""))); - - n = new Node("*,"); - assertEquals(1, n.compareTo('a')); - assertEquals(0, n.compareTo('*')); - assertEquals(1, n.compareTo('!')); - assertEquals(0, n.compareTo(new Node("*,"))); - assertEquals(1, n.compareTo(new Node("!muga,"))); - assertEquals(-1, n.compareTo(new Node(""))); - - n = new Node("!hoge"); - assertEquals(1, n.compareTo('a')); - assertEquals(-1, n.compareTo('*')); - assertEquals(0, n.compareTo('!')); - assertEquals(-1, n.compareTo(new Node("*,"))); - assertEquals(0, n.compareTo(new Node("!muga,"))); - assertEquals(-1, n.compareTo(new Node(""))); - - n = new Node(""); - assertEquals(1, n.compareTo('a')); - assertEquals(1, n.compareTo('*')); - assertEquals(1, n.compareTo('!')); - assertEquals(0, n.compareTo(new Node(""))); - } - - protected String dump(Node alt) { - StringWriter w = new StringWriter(); - PublicSuffixes.dump(alt, 0, new PrintWriter(w)); - return w.toString(); - } - public void testTrie1() { - Node alt = new Node(null, new ArrayList()); - alt.addBranch("ac,"); - // specifically, should not have empty string as match. 
- assertEquals("(null)" + NL + " \"ac,\"" + NL, dump(alt)); - alt.addBranch("ac,com,"); - assertEquals("(null)" + NL + - " \"ac,\"" + NL + - " \"com,\"" + NL + - " \"\"" + NL, dump(alt)); - alt.addBranch("ac,edu,"); - assertEquals("(null)" + NL + - " \"ac,\"" + NL + - " \"com,\"" + NL + - " \"edu,\"" + NL + - " \"\"" + NL, dump(alt)); - } - public void testTrie2() { - Node alt = new Node(null, new ArrayList()); - alt.addBranch("ac,"); - alt.addBranch("*,"); - assertEquals("(null)" + NL + - " \"ac,\"" + NL + - " \"*,\"" + NL, dump(alt)); - } - - public void testTrie3() { - Node alt = new Node(null, new ArrayList()); - alt.addBranch("ac,"); - alt.addBranch("ac,!hoge,"); - alt.addBranch("ac,*,"); - // exception goes first. - assertEquals("(null)" + NL + - " \"ac,\"" + NL + - " \"!hoge,\"" + NL + - " \"*,\"" + NL + - " \"\"" + NL, dump(alt)); - } - - // test of higher-level functionality - - Matcher m = PublicSuffixes.getTopmostAssignedSurtPrefixPattern() - .matcher(""); - - public void testBasics() { - matchPrefix("com,example,www,", "com,example,"); - matchPrefix("com,example,", "com,example,"); - matchPrefix("org,archive,www,", "org,archive,"); - matchPrefix("org,archive,", "org,archive,"); - matchPrefix("fr,yahoo,www,", "fr,yahoo,"); - matchPrefix("fr,yahoo,", "fr,yahoo,"); - matchPrefix("au,com,foobar,www,", "au,com,foobar,"); - matchPrefix("au,com,foobar,", "au,com,foobar,"); - matchPrefix("uk,co,virgin,www,", "uk,co,virgin,"); - matchPrefix("uk,co,virgin,", "uk,co,virgin,"); - matchPrefix("au,com,example,www,", "au,com,example,"); - matchPrefix("au,com,example,", "au,com,example,"); - matchPrefix("jp,yokohama,public,assigned,www,", - "jp,yokohama,public,assigned,"); - matchPrefix("jp,yokohama,public,assigned,", "jp,yokohama,public,assigned,"); - } - - public void testDomainWithDash() { - matchPrefix("de,bad-site,www", "de,bad-site,"); - } - - public void testDomainWithNumbers() { - matchPrefix("de,archive4u,www", "de,archive4u,"); - } - - public void testIPV4() 
{ - assertEquals("unexpected reduction", - "1.2.3.4", - PublicSuffixes.reduceSurtToAssignmentLevel("1.2.3.4")); - } - - public void testIPV6() { - assertEquals("unexpected reduction", - "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]", - PublicSuffixes.reduceSurtToAssignmentLevel( - "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]")); - } - - public void testExceptions() { - matchPrefix("uk,bl,www,", "uk,bl,"); - matchPrefix("uk,bl,", "uk,bl,"); - matchPrefix("jp,tokyo,city,subdomain,", "jp,tokyo,city,"); - matchPrefix("jp,tokyo,city,", "jp,tokyo,city,"); - } - - public void testFakeTLD() { - // we assume any new/unknonwn TLD should be assumed as 2-level; - // this is preferable for our grouping purpose but might not be - // for a cookie-assigning browser (original purpose of publicsuffixlist) - matchPrefix("zzz,example,www,", "zzz,example,"); - } - - public void testUnsegmentedHostname() { - m.reset("example"); - assertFalse("unexpected match found in 'example'", m.find()); - } - - public void testTopmostAssignedCaching() { - assertSame("topmostAssignedSurtPrefixPattern not cached",PublicSuffixes.getTopmostAssignedSurtPrefixPattern(),PublicSuffixes.getTopmostAssignedSurtPrefixPattern()); - assertSame("topmostAssignedSurtPrefixRegex not cached",PublicSuffixes.getTopmostAssignedSurtPrefixRegex(),PublicSuffixes.getTopmostAssignedSurtPrefixRegex()); - } - - // TODO: test UTF domains? - - protected void matchPrefix(String surtDomain, String expectedAssignedPrefix) { - m.reset(surtDomain); - assertTrue("expected match not found in '" + surtDomain, m.find()); - assertEquals("expected match not found", expectedAssignedPrefix, m - .group()); - } -} +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.net; + +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.regex.Matcher; + +import junit.framework.TestCase; + +import org.archive.net.PublicSuffixes.Node; + +/** + * Test cases for PublicSuffixes utility. Confirm expected matches/nonmatches + * from constructed regex. + * + * @author gojomo + */ +public class PublicSuffixesTest extends TestCase { + // test of low level implementation + private final String NL = System.lineSeparator(); + + public void testCompare() { + Node n = new Node("hoge"); + assertTrue(n.compareTo('a') > 0); + assertEquals(-1, n.compareTo('*')); + assertEquals(-1, n.compareTo('!')); + assertEquals(-1, n.compareTo(new Node("*,"))); + assertEquals(-1, n.compareTo(new Node("!muga,"))); + assertEquals(-1, n.compareTo(new Node(""))); + + n = new Node("*,"); + assertEquals(1, n.compareTo('a')); + assertEquals(0, n.compareTo('*')); + assertEquals(1, n.compareTo('!')); + assertEquals(0, n.compareTo(new Node("*,"))); + assertEquals(1, n.compareTo(new Node("!muga,"))); + assertEquals(-1, n.compareTo(new Node(""))); + + n = new Node("!hoge"); + assertEquals(1, n.compareTo('a')); + assertEquals(-1, n.compareTo('*')); + assertEquals(0, n.compareTo('!')); + assertEquals(-1, n.compareTo(new Node("*,"))); + assertEquals(0, n.compareTo(new Node("!muga,"))); + assertEquals(-1, n.compareTo(new Node(""))); + + n = new Node(""); + assertEquals(1, n.compareTo('a')); + assertEquals(1, n.compareTo('*')); + assertEquals(1, n.compareTo('!')); + assertEquals(0, 
n.compareTo(new Node(""))); + } + + protected String dump(Node alt) { + StringWriter w = new StringWriter(); + PublicSuffixes.dump(alt, 0, new PrintWriter(w)); + return w.toString(); + } + public void testTrie1() { + Node alt = new Node(null, new ArrayList()); + alt.addBranch("ac,"); + // specifically, should not have empty string as match. + assertEquals("(null)" + NL + " \"ac,\"" + NL, dump(alt)); + alt.addBranch("ac,com,"); + assertEquals("(null)" + NL + + " \"ac,\"" + NL + + " \"com,\"" + NL + + " \"\"" + NL, dump(alt)); + alt.addBranch("ac,edu,"); + assertEquals("(null)" + NL + + " \"ac,\"" + NL + + " \"com,\"" + NL + + " \"edu,\"" + NL + + " \"\"" + NL, dump(alt)); + } + public void testTrie2() { + Node alt = new Node(null, new ArrayList()); + alt.addBranch("ac,"); + alt.addBranch("*,"); + assertEquals("(null)" + NL + + " \"ac,\"" + NL + + " \"*,\"" + NL, dump(alt)); + } + + public void testTrie3() { + Node alt = new Node(null, new ArrayList()); + alt.addBranch("ac,"); + alt.addBranch("ac,!hoge,"); + alt.addBranch("ac,*,"); + // exception goes first. 
+ assertEquals("(null)" + NL + + " \"ac,\"" + NL + + " \"!hoge,\"" + NL + + " \"*,\"" + NL + + " \"\"" + NL, dump(alt)); + } + + // test of higher-level functionality + + Matcher m = PublicSuffixes.getTopmostAssignedSurtPrefixPattern() + .matcher(""); + + public void testBasics() { + matchPrefix("com,example,www,", "com,example,"); + matchPrefix("com,example,", "com,example,"); + matchPrefix("org,archive,www,", "org,archive,"); + matchPrefix("org,archive,", "org,archive,"); + matchPrefix("fr,yahoo,www,", "fr,yahoo,"); + matchPrefix("fr,yahoo,", "fr,yahoo,"); + matchPrefix("au,com,foobar,www,", "au,com,foobar,"); + matchPrefix("au,com,foobar,", "au,com,foobar,"); + matchPrefix("uk,co,virgin,www,", "uk,co,virgin,"); + matchPrefix("uk,co,virgin,", "uk,co,virgin,"); + matchPrefix("au,com,example,www,", "au,com,example,"); + matchPrefix("au,com,example,", "au,com,example,"); + matchPrefix("jp,yokohama,public,assigned,www,", + "jp,yokohama,public,assigned,"); + matchPrefix("jp,yokohama,public,assigned,", "jp,yokohama,public,assigned,"); + } + + public void testDomainWithDash() { + matchPrefix("de,bad-site,www", "de,bad-site,"); + } + + public void testDomainWithNumbers() { + matchPrefix("de,archive4u,www", "de,archive4u,"); + } + + public void testIPV4() { + assertEquals("unexpected reduction", + "1.2.3.4", + PublicSuffixes.reduceSurtToAssignmentLevel("1.2.3.4")); + } + + public void testIPV6() { + assertEquals("unexpected reduction", + "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]", + PublicSuffixes.reduceSurtToAssignmentLevel( + "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]")); + } + + public void testExceptions() { + matchPrefix("uk,bl,www,", "uk,bl,"); + matchPrefix("uk,bl,", "uk,bl,"); + matchPrefix("jp,tokyo,city,subdomain,", "jp,tokyo,city,"); + matchPrefix("jp,tokyo,city,", "jp,tokyo,city,"); + } + + public void testFakeTLD() { + // we assume any new/unknonwn TLD should be assumed as 2-level; + // this is preferable for our grouping purpose but might not be + // for a 
cookie-assigning browser (original purpose of publicsuffixlist) + matchPrefix("zzz,example,www,", "zzz,example,"); + } + + public void testUnsegmentedHostname() { + m.reset("example"); + assertFalse("unexpected match found in 'example'", m.find()); + } + + public void testTopmostAssignedCaching() { + assertSame("topmostAssignedSurtPrefixPattern not cached",PublicSuffixes.getTopmostAssignedSurtPrefixPattern(),PublicSuffixes.getTopmostAssignedSurtPrefixPattern()); + assertSame("topmostAssignedSurtPrefixRegex not cached",PublicSuffixes.getTopmostAssignedSurtPrefixRegex(),PublicSuffixes.getTopmostAssignedSurtPrefixRegex()); + } + + // TODO: test UTF domains? + + protected void matchPrefix(String surtDomain, String expectedAssignedPrefix) { + m.reset(surtDomain); + assertTrue("expected match not found in '" + surtDomain, m.find()); + assertEquals("expected match not found", expectedAssignedPrefix, m + .group()); + } +} From faec599fc4a1cc8f09523e78cab073ed570b8adc Mon Sep 17 00:00:00 2001 From: RogerMathisen Date: Wed, 24 Sep 2014 11:03:55 +0200 Subject: [PATCH 005/240] - Removed pointless code. 
--- .../archive/util/iterator/SortedCompositeIteratorTest.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java b/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java index 0f4dc68a..11ea1229 100644 --- a/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java +++ b/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java @@ -14,11 +14,6 @@ public class SortedCompositeIteratorTest extends TestCase { public void testHasNext() throws FileNotFoundException, IOException { - long t = 210000; - long c = 134; - float f = (float)c / (float)t; - System.err.format("F(%f)\n",f); - File a = File.createTempFile("filea", null); File b = File.createTempFile("fileb", null); From 595851a1138529b4d7e633b2cfd5e4e28b6b6204 Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Tue, 30 Sep 2014 14:45:33 +0200 Subject: [PATCH 006/240] Require the oldest recommended version of Maven 3 --- pom.xml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pom.xml b/pom.xml index 6664efd8..0eee2ed2 100644 --- a/pom.xml +++ b/pom.xml @@ -214,7 +214,29 @@ + + org.apache.maven.plugins + maven-enforcer-plugin + 1.3.1 + + + enforce-maven + + enforce + + + + + This project requires Maven 3 + 3.0.5 + + + + + + + src/main/resources From 46d0f6ffbad1b02fd7917c0e218eeed6557f3d9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristinn=20Sigur=C3=B0sson?= Date: Tue, 30 Sep 2014 15:10:48 +0000 Subject: [PATCH 007/240] Java 6 compatibility System.lineSeparator() was introduced in Java 7 --- src/test/java/org/archive/net/PublicSuffixesTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/org/archive/net/PublicSuffixesTest.java b/src/test/java/org/archive/net/PublicSuffixesTest.java index 7528bbe1..ca6e6408 100644 --- a/src/test/java/org/archive/net/PublicSuffixesTest.java +++ b/src/test/java/org/archive/net/PublicSuffixesTest.java 
@@ -36,7 +36,7 @@ */ public class PublicSuffixesTest extends TestCase { // test of low level implementation - private final String NL = System.lineSeparator(); + private final String NL = System.getProperty("line.separator"); public void testCompare() { Node n = new Node("hoge"); From 6556c7f14e54d07f13fe49c4c1bc6ee88c18f134 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristinn=20Sigur=C3=B0sson?= Date: Tue, 30 Sep 2014 16:09:54 +0000 Subject: [PATCH 008/240] Change test value to get around Java 8 bug Fixes issue #31 which relates to changes in how Java rounds doubles in some edge cases. --- .../java/org/archive/util/ArchiveUtilsTest.java | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/test/java/org/archive/util/ArchiveUtilsTest.java b/src/test/java/org/archive/util/ArchiveUtilsTest.java index 8251615a..586a1821 100644 --- a/src/test/java/org/archive/util/ArchiveUtilsTest.java +++ b/src/test/java/org/archive/util/ArchiveUtilsTest.java @@ -229,16 +229,19 @@ public void testByteArrayEquals() { /** test doubleToString() */ public void testDoubleToString(){ - double test = 12.345; - assertTrue( + double test = 12.121d; + assertEquals( "cecking zero precision", - ArchiveUtils.doubleToString(test, 0).equals("12")); - assertTrue( + "12", + ArchiveUtils.doubleToString(test, 0)); + assertEquals( "cecking 2 character precision", - ArchiveUtils.doubleToString(test, 2).equals("12.34")); - assertTrue( + "12.12", + ArchiveUtils.doubleToString(test, 2)); + assertEquals( "cecking precision higher then the double has", - ArchiveUtils.doubleToString(test, 65).equals("12.345")); + "12.121", + ArchiveUtils.doubleToString(test, 65)); } From fbf4df7117e3fe5b812a047736836e6531936897 Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Wed, 1 Oct 2014 07:48:22 +0200 Subject: [PATCH 009/240] Changed message. 
--- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 0eee2ed2..f6730625 100644 --- a/pom.xml +++ b/pom.xml @@ -227,7 +227,7 @@ - This project requires Maven 3 + This project requires Maven 3.0.5 or higher 3.0.5 From fbbaab079b06260aa84b8b2d896a34db3a6872e3 Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Wed, 1 Oct 2014 12:54:59 +0200 Subject: [PATCH 010/240] Update CHANGES.md --- CHANGES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index db09a463..a84f579e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,6 @@ 1.1.5 ----- -* [Removed direct reference to Unix TMP-path](https://github.com/iipc/webarchive-commons/issues/2) +* [Tests fail on Windows](https://github.com/iipc/webarchive-commons/issues/2) 1.1.4 ----- From 7914bdf04dbf5d0b431065b650a91773684ae757 Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Wed, 1 Oct 2014 12:58:31 +0200 Subject: [PATCH 011/240] Update CHANGES.md --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index a84f579e..8e787634 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,7 @@ 1.1.5 ----- * [Tests fail on Windows](https://github.com/iipc/webarchive-commons/issues/2) +* [Test fails on Java 8](https://github.com/iipc/webarchive-commons/issues/31) 1.1.4 ----- From 166656eb4b0dbfb16611a8b74e79c35b8954e72a Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Mon, 6 Oct 2014 14:03:17 +0200 Subject: [PATCH 012/240] Added method to UsableUri to get the IDN in non-puny form --- src/main/java/org/archive/url/UsableURI.java | 48 +++++++++++++++++++ .../java/org/archive/url/UsableURITest.java | 12 +++++ 2 files changed, 60 insertions(+) diff --git a/src/main/java/org/archive/url/UsableURI.java b/src/main/java/org/archive/url/UsableURI.java index b9c4ff9d..fa1de57a 100644 --- a/src/main/java/org/archive/url/UsableURI.java +++ b/src/main/java/org/archive/url/UsableURI.java @@ -18,6 +18,7 @@ */ package 
org.archive.url; +import gnu.inet.encoding.IDNA; import java.io.File; import java.io.IOException; import java.io.ObjectOutputStream; @@ -271,6 +272,53 @@ public String toString() { return toCustomString(); } + /** + * In the case of a puny encoded IDN, this method returns the decoded Unicode version. + * @return decoded IDN version of URI + */ + public String toUnicodeHostString() { + if (!_is_hostname) { + return toString(); + } + + try { + StringBuilder buf = new StringBuilder(); + + if (_scheme != null) { + buf.append(_scheme); + buf.append(':'); + } + if (_is_net_path) { + buf.append("//"); + if (_authority != null) { // has_authority + if (_userinfo != null) { + buf.append(_userinfo).append('@'); + } + buf.append(IDNA.toUnicode(getHost())); + if (_port >= 0) { + buf.append(':').append(_port); + } + this._authority = buf.toString().toCharArray(); + } + } + if (_opaque != null && _is_opaque_part) { + buf.append(_opaque); + } else if (_path != null) { + // _is_hier_part or _is_relativeURI + if (_path.length != 0) { + buf.append(_path); + } + } + if (_query != null) { // has_query + buf.append('?'); + buf.append(_query); + } + return buf.toString(); + } catch (URIException ex) { + throw new RuntimeException(ex); + } + } + public synchronized String getEscapedURI() { if (this.cachedEscapedURI == null) { this.cachedEscapedURI = super.getEscapedURI(); diff --git a/src/test/java/org/archive/url/UsableURITest.java b/src/test/java/org/archive/url/UsableURITest.java index 2aec0e96..7588f03c 100644 --- a/src/test/java/org/archive/url/UsableURITest.java +++ b/src/test/java/org/archive/url/UsableURITest.java @@ -53,4 +53,16 @@ public void testSchemalessRelative() throws URIException { UsableURI test = new UsableURI(base, relative); assertEquals("http://www.facebook.com/?href=http://www.archive.org/a", test.toString()); } + + /** + * Test of toUnicodeHostString method, of class UsableURI. 
+ */ + public void testToUnicodeHostString() throws URIException { + assertEquals("http://øx.dk", new UsableURI("http://xn--x-4ga.dk", true, "UTF-8").toUnicodeHostString()); + assertEquals("xn--x-4ga.dk", new UsableURI("xn--x-4ga.dk", true, "UTF-8").toUnicodeHostString()); + assertEquals("http://user:pass@øx.dk:8080", new UsableURI("http://user:pass@xn--x-4ga.dk:8080", true, "UTF-8").toUnicodeHostString()); + assertEquals("http://user@øx.dk:8080", new UsableURI("http://user@xn--x-4ga.dk:8080", true, "UTF-8").toUnicodeHostString()); + assertEquals("http://øx.dk/foo/bar?query=q", new UsableURI("http://xn--x-4ga.dk/foo/bar?query=q", true, "UTF-8").toUnicodeHostString()); + assertEquals("http://127.0.0.1/foo/bar?query=q", new UsableURI("http://127.0.0.1/foo/bar?query=q", true, "UTF-8").toUnicodeHostString()); + } } From 619412c284baf78e8fbb3e2391687e226c4ea0f1 Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Mon, 10 Nov 2014 12:12:07 +0100 Subject: [PATCH 013/240] Fixed bug which changed the URI after calling toUnicodeHostString. --- src/main/java/org/archive/url/UsableURI.java | 4 +++- src/test/java/org/archive/url/UsableURITest.java | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/archive/url/UsableURI.java b/src/main/java/org/archive/url/UsableURI.java index fa1de57a..ed40f41a 100644 --- a/src/main/java/org/archive/url/UsableURI.java +++ b/src/main/java/org/archive/url/UsableURI.java @@ -274,6 +274,9 @@ public String toString() { /** * In the case of a puny encoded IDN, this method returns the decoded Unicode version. + *

+ * Most of this implementation is copied from {@link org.apache.commons.httpclient.URI#setURI()}. + * * @return decoded IDN version of URI */ public String toUnicodeHostString() { @@ -298,7 +301,6 @@ public String toUnicodeHostString() { if (_port >= 0) { buf.append(':').append(_port); } - this._authority = buf.toString().toCharArray(); } } if (_opaque != null && _is_opaque_part) { diff --git a/src/test/java/org/archive/url/UsableURITest.java b/src/test/java/org/archive/url/UsableURITest.java index 7588f03c..73694f79 100644 --- a/src/test/java/org/archive/url/UsableURITest.java +++ b/src/test/java/org/archive/url/UsableURITest.java @@ -64,5 +64,20 @@ public void testToUnicodeHostString() throws URIException { assertEquals("http://user@øx.dk:8080", new UsableURI("http://user@xn--x-4ga.dk:8080", true, "UTF-8").toUnicodeHostString()); assertEquals("http://øx.dk/foo/bar?query=q", new UsableURI("http://xn--x-4ga.dk/foo/bar?query=q", true, "UTF-8").toUnicodeHostString()); assertEquals("http://127.0.0.1/foo/bar?query=q", new UsableURI("http://127.0.0.1/foo/bar?query=q", true, "UTF-8").toUnicodeHostString()); + + // test idn round trip + // XXX fails because idn is not handled here (it is converted to punycode in UsableURIFactory.fixupDomainlabel()) + // assertEquals("http://øx.dk", new UsableURI("http://øx.dk", false, "UTF-8").toUnicodeHostString()); + // To check the round trip it is then necessary to use the factory method in UsableURIFactory. 
+ assertEquals("http://øx.dk/", UsableURIFactory.getInstance("http://øx.dk/", "UTF-8").toUnicodeHostString()); + + // non-idn domain name + assertEquals("http://example.org", new UsableURI("http://example.org", true, "UTF-8").toUnicodeHostString()); + + // ensure a call to toUnicodeHostString() has no effect on toString() + UsableURI uri = new UsableURI("http://xn--x-4ga.dk", true, "UTF-8"); + assertEquals("http://øx.dk", uri.toUnicodeHostString()); + uri.setPath(uri.getPath()); // force toString() cached value to be recomputed + assertEquals("http://xn--x-4ga.dk", uri.toString()); } } From 61f5a8cb7233f48196ea8fa305492d6b9f637b7f Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Mon, 10 Nov 2014 12:14:08 +0100 Subject: [PATCH 014/240] Fixed bug that prevented the https scheme from using static string. --- src/main/java/org/archive/url/LaxURI.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/archive/url/LaxURI.java b/src/main/java/org/archive/url/LaxURI.java index 807333d3..e1cea9b7 100644 --- a/src/main/java/org/archive/url/LaxURI.java +++ b/src/main/java/org/archive/url/LaxURI.java @@ -211,7 +211,7 @@ protected void setURI() { if (_scheme.length == 4 && Arrays.equals(_scheme, HTTP_SCHEME)) { _scheme = HTTP_SCHEME; } else if (_scheme.length == 5 - && Arrays.equals(_scheme, HTTP_SCHEME)) { + && Arrays.equals(_scheme, HTTPS_SCHEME)) { _scheme = HTTPS_SCHEME; } } From 6b7971f86eda7255c1d5ab05f7883da30db7fced Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Mon, 10 Nov 2014 15:39:15 +0100 Subject: [PATCH 015/240] Removed unnecessary import --- src/test/java/org/archive/url/UsableURITest.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/test/java/org/archive/url/UsableURITest.java b/src/test/java/org/archive/url/UsableURITest.java index 73694f79..2a2f41f5 100644 --- a/src/test/java/org/archive/url/UsableURITest.java +++ b/src/test/java/org/archive/url/UsableURITest.java @@ -21,7 +21,6 @@ import 
java.net.URISyntaxException; import org.apache.commons.httpclient.URIException; -import org.archive.url.UsableURI; import junit.framework.TestCase; From 363a3c51b40a5d559bfa6eb7d2f038b9258f577a Mon Sep 17 00:00:00 2001 From: Gerhard Gossen Date: Wed, 17 Dec 2014 16:39:24 +0100 Subject: [PATCH 016/240] Improve URL escaping in CDX writer --- .../extract/RealCDXExtractorOutput.java | 9 ++++-- .../extract/RealCDXExtractorOutputTest.java | 28 +++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java diff --git a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java index 62a423c5..8ca3ff82 100644 --- a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java +++ b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java @@ -4,6 +4,7 @@ import java.io.OutputStream; import java.io.PrintWriter; import java.net.MalformedURLException; +import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.util.List; @@ -307,12 +308,14 @@ private String extractHTMLMetaRefresh(String origUrl, MetaData m) { return "-"; } - private String resolve(String context, String spec) { + static String resolve(String context, String spec) { // TODO: test! 
try { URL cUrl = new URL(context); - URL resolved = new URL(cUrl,spec); - return resolved.toURI().toASCIIString(); + URL url = new URL(cUrl, spec); + // this constructor escapes its arguments, if necessary + URI uri = new URI(url.getProtocol(), url.getHost(), url.getPath(), url.getQuery(), url.getRef()); + return uri.toASCIIString(); } catch (URISyntaxException e) { } catch (MalformedURLException e) { diff --git a/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java b/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java new file mode 100644 index 00000000..14f8489d --- /dev/null +++ b/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java @@ -0,0 +1,28 @@ +package org.archive.extract; + +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.net.URLEncoder; + +import junit.framework.TestCase; + + +public class RealCDXExtractorOutputTest extends TestCase { + + public void testEscapeResolvedUrl() throws Exception { + String context ="http://www.uni-giessen.de/cms/studium/dateien/informationberatung/merkblattpdf"; + String spec = "http://fss.plone.uni-giessen.de/fß/studium/dateien/informationberatung/merkblattpdf/file/Mérkblatt zur Gestaltung von Nachteilsausgleichen.pdf?föo=bar#änchor"; + String escaped = RealCDXExtractorOutput.resolve(context, spec); + assertTrue(escaped.indexOf(" ") < 0); + URI parsed = new URI(escaped); + assertEquals("änchor", parsed.getFragment()); + } + + public void testNoDoubleEscaping() throws Exception { + String spec = "https://www.google.com/search?q=java+escape+url+spaces&ie=utf-8&oe=utf-8"; + String resolved = RealCDXExtractorOutput.resolve(spec, spec); + assertTrue(spec.equals(resolved)); + } +} From 1ee18d8a426a0b18aa502f71896d9962416262a0 Mon Sep 17 00:00:00 2001 From: Gerhard Gossen Date: Wed, 17 Dec 2014 17:12:42 +0100 Subject: [PATCH 017/240] Update CHANGES.md --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff 
--git a/CHANGES.md b/CHANGES.md index 8e787634..7fb2f7c4 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,6 @@ 1.1.5 ----- +* [Escape redirect URLs in RealCDXExtractorOutput](https://github.com/iipc/webarchive-commons/pull/36) * [Tests fail on Windows](https://github.com/iipc/webarchive-commons/issues/2) * [Test fails on Java 8](https://github.com/iipc/webarchive-commons/issues/31) From f130aad04b255e7d8cd4eee4bac86c25b0cbbf36 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 6 Jan 2015 15:54:59 -0800 Subject: [PATCH 018/240] move RecordingOutputStreamTest.java from heritrix to webarchive-commons --- .../archive/io/RecordingOutputStreamTest.java | 260 ++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 src/test/java/org/archive/io/RecordingOutputStreamTest.java diff --git a/src/test/java/org/archive/io/RecordingOutputStreamTest.java b/src/test/java/org/archive/io/RecordingOutputStreamTest.java new file mode 100644 index 00000000..1c53549b --- /dev/null +++ b/src/test/java/org/archive/io/RecordingOutputStreamTest.java @@ -0,0 +1,260 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.io; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; + +import org.archive.util.TmpDirTestCase; + + +/** + * Test casesfor RecordingOutputStream. + * + * @author stack + */ +public class RecordingOutputStreamTest extends TmpDirTestCase +{ + /** + * Size of buffer used in tests. + */ + private static final int BUFFER_SIZE = 5; + + /** + * How much to write total to testing RecordingOutputStream. + */ + private static final int WRITE_TOTAL = 10; + + + /* + * @see TmpDirTestCase#setUp() + */ + protected void setUp() throws Exception + { + super.setUp(); + } + + /** + * Test reusing instance of RecordingOutputStream. + * + * @throws IOException Failed open of backing file or opening of + * input streams verifying recording. + */ + public void testReuse() + throws IOException + { + final String BASENAME = "testReuse"; + cleanUpOldFiles(BASENAME); + RecordingOutputStream ros = new RecordingOutputStream(BUFFER_SIZE, + (new File(getTmpDir(), BASENAME + "Bkg.txt")).getAbsolutePath()); + for (int i = 0; i < 3; i++) + { + reuse(BASENAME, ros, i); + } + } + + private void reuse(String baseName, RecordingOutputStream ros, int index) + throws IOException + { + final String BASENAME = baseName + Integer.toString(index); + File f = writeIntRecordedFile(ros, BASENAME, WRITE_TOTAL); + verifyRecording(ros, f, WRITE_TOTAL); + // Do again to test that I can get a new ReplayInputStream on same + // RecordingOutputStream. + verifyRecording(ros, f, WRITE_TOTAL); + } + + /** + * Method to test for void write(int). + * + * Uses small buffer size and small write size. Test mark and reset too. + * + * @throws IOException Failed open of backing file or opening of + * input streams verifying recording. 
+ */ + public void testWriteint() + throws IOException + { + final String BASENAME = "testWriteint"; + cleanUpOldFiles(BASENAME); + RecordingOutputStream ros = new RecordingOutputStream(BUFFER_SIZE, + (new File(getTmpDir(), BASENAME + "Backing.txt")).getAbsolutePath()); + File f = writeIntRecordedFile(ros, BASENAME, WRITE_TOTAL); + verifyRecording(ros, f, WRITE_TOTAL); + // Do again to test that I can get a new ReplayInputStream on same + // RecordingOutputStream. + verifyRecording(ros, f, WRITE_TOTAL); + } + + /** + * Method to test for void write(byte []). + * + * Uses small buffer size and small write size. + * + * @throws IOException Failed open of backing file or opening of + * input streams verifying recording. + */ + public void testWritebytearray() + throws IOException + { + final String BASENAME = "testWritebytearray"; + cleanUpOldFiles(BASENAME); + RecordingOutputStream ros = new RecordingOutputStream(BUFFER_SIZE, + (new File(getTmpDir(), BASENAME + "Backing.txt")).getAbsolutePath()); + File f = writeByteRecordedFile(ros, BASENAME, WRITE_TOTAL); + verifyRecording(ros, f, WRITE_TOTAL); + // Do again to test that I can get a new ReplayInputStream on same + // RecordingOutputStream. + verifyRecording(ros, f, WRITE_TOTAL); + } + + /** + * Test mark and reset. + * @throws IOException + */ + public void testMarkReset() throws IOException + { + final String BASENAME = "testMarkReset"; + cleanUpOldFiles(BASENAME); + RecordingOutputStream ros = new RecordingOutputStream(BUFFER_SIZE, + (new File(getTmpDir(), BASENAME + "Backing.txt")).getAbsolutePath()); + File f = writeByteRecordedFile(ros, BASENAME, WRITE_TOTAL); + verifyRecording(ros, f, WRITE_TOTAL); + ReplayInputStream ris = ros.getReplayInputStream(); + ris.mark(10 /*Arbitrary value*/); + // Read from the stream. + ris.read(); + ris.read(); + ris.read(); + // Reset it. It should be back at zero. 
+ ris.reset(); + assertEquals("Reset to zero", ris.read(), 0); + assertEquals("Reset to zero char 1", ris.read(), 1); + assertEquals("Reset to zero char 2", ris.read(), 2); + // Mark stream. Here. Next character should be '3'. + ris.mark(10 /* Arbitrary value*/); + ris.read(); + ris.read(); + ris.reset(); + assertEquals("Reset to zero char 3", ris.read(), 3); + } + + /** + * Record a file write. + * + * Write a file w/ characters that start at null and ascend to + * filesize. Record the writing w/ passed ros + * recordingoutputstream. Return the file recorded as result of method. + * The file output stream that is recorded is named + * basename + ".txt". + * + *

This method writes a character at a time. + * + * @param ros RecordingOutputStream to record with. + * @param basename Basename of file. + * @param size How many characters to write. + * @return Recorded output stream. + */ + private File writeIntRecordedFile(RecordingOutputStream ros, + String basename, int size) + throws IOException + { + File f = new File(getTmpDir(), basename + ".txt"); + FileOutputStream fos = new FileOutputStream(f); + ros.open(fos); + for (int i = 0; i < WRITE_TOTAL; i++) + { + ros.write(i); + } + ros.close(); + fos.close(); + assertEquals("Content-Length test", size, + ros.getResponseContentLength()); + return f; + } + + /** + * Record a file byte array write. + * + * Write a file w/ characters that start at null and ascend to + * filesize. Record the writing w/ passed ros + * recordingoutputstream. Return the file recorded as result of method. + * The file output stream that is recorded is named + * basename + ".txt". + * + *

This method writes using a byte array. + * + * @param ros RecordingOutputStream to record with. + * @param basename Basename of file. + * @param size How many characters to write. + * @return Recorded output stream. + */ + private File writeByteRecordedFile(RecordingOutputStream ros, + String basename, int size) + throws IOException + { + File f = new File(getTmpDir(), basename + ".txt"); + FileOutputStream fos = new FileOutputStream(f); + ros.open(fos); + byte [] b = new byte[size]; + for (int i = 0; i < size; i++) + { + b[i] = (byte)i; + } + ros.write(b); + ros.close(); + fos.close(); + assertEquals("Content-Length test", size, + ros.getResponseContentLength()); + return f; + } + + /** + * Verify what was written is both in the file written to and in the + * recording stream. + * + * @param ros Stream to check. + * @param f File that was recorded. Stream should have its content + * exactly. + * @param size Amount of bytes written. + * + * @exception IOException Failure reading streams. 
+ */ + private void verifyRecording(RecordingOutputStream ros, File f, + int size) throws IOException + { + assertEquals("Recorded file size.", size, f.length()); + FileInputStream fis = new FileInputStream(f); + assertNotNull("FileInputStream not null", fis); + ReplayInputStream ris = ros.getReplayInputStream(); + assertNotNull("ReplayInputStream not null", ris); + for (int i = 0; i < size; i++) + { + assertEquals("ReplayInputStream content verification", i, + ris.read()); + assertEquals("Recorded file content verification", i, + fis.read()); + } + assertEquals("ReplayInputStream at EOF", -1, ris.read()); + fis.close(); + ris.close(); + } +} From da5d63d41d83fe4d5ea6d14165830e75c568c9a2 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 6 Jan 2015 15:58:31 -0800 Subject: [PATCH 019/240] fix for https://github.com/iipc/webarchive-commons/issues/38 - detect end of http protocol headers in a smarter way, to avoid calling write(byte) repeatedly; add unit tests --- .../org/archive/io/RecordingOutputStream.java | 49 +++++++-- .../archive/io/RecordingOutputStreamTest.java | 100 ++++++++++++++++++ 2 files changed, 142 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/archive/io/RecordingOutputStream.java b/src/main/java/org/archive/io/RecordingOutputStream.java index fe05701c..7d2ff212 100644 --- a/src/main/java/org/archive/io/RecordingOutputStream.java +++ b/src/main/java/org/archive/io/RecordingOutputStream.java @@ -242,6 +242,26 @@ public void write(int b) throws IOException { checkLimits(); } + private int findMessageBodyBeginMark(byte[] b, int off, int len) { + if ((lastTwoBytes[1] == '\n' || lastTwoBytes[0] == '\n' && lastTwoBytes[1] == '\r') + && len >= 1 && b[off] == '\n') { + return 1; + } else if (lastTwoBytes[1] == '\n' && len >= 2 && b[off] == '\r' && b[off+1] == '\n') { + return 2; + } + + for (int i = off; i < off + len - 1; i++) { + if (b[i] == '\n' && b[i+1] == '\n') { + return i + 2; + } else if (b[i] == '\n' && b[i+1] == '\r' + && i + 2 < 
off + len && b[i+2] == '\n') { + return i + 3; + } + } + + return -1; + } + public void write(byte[] b, int off, int len) throws IOException { if(position < maxPosition) { if(position+len<=maxPosition) { @@ -255,20 +275,35 @@ public void write(byte[] b, int off, int len) throws IOException { off += consumeRange; len -= consumeRange; } - - // see comment on int[] lastTwoBytes - while (messageBodyBeginMark < 0 && len > 0) { - write(b[off]); - off++; - len--; + + if (messageBodyBeginMark < 0) { + // see comment on int[] lastTwoBytes + int mark = findMessageBodyBeginMark(b, off, len); + if (mark > 0) { + if(recording) { + record(b, off, mark - off); + } + if (this.out != null) { + this.out.write(b, off, mark - off); + } + markMessageBodyBegin(); + len = len - (mark - off); + off = mark; + } } - + if(recording) { record(b, off, len); } if (this.out != null) { this.out.write(b, off, len); } + if (len >= 1) { + lastTwoBytes[1] = b[off + len - 1]; + if (len >= 2) { + lastTwoBytes[0] = b[off + len - 2]; + } + } checkLimits(); } diff --git a/src/test/java/org/archive/io/RecordingOutputStreamTest.java b/src/test/java/org/archive/io/RecordingOutputStreamTest.java index 1c53549b..f697ff31 100644 --- a/src/test/java/org/archive/io/RecordingOutputStreamTest.java +++ b/src/test/java/org/archive/io/RecordingOutputStreamTest.java @@ -18,11 +18,13 @@ */ package org.archive.io; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; +import org.archive.util.Base32; import org.archive.util.TmpDirTestCase; @@ -257,4 +259,102 @@ private void verifyRecording(RecordingOutputStream ros, File f, fis.close(); ris.close(); } + + public void testMessageBodyBegin() throws IOException { + final String BASENAME = "testMessageBodyBegin"; + cleanUpOldFiles(BASENAME); + RecordingOutputStream ros = new RecordingOutputStream(BUFFER_SIZE, + (new File(getTmpDir(), BASENAME + 
"Backing.txt")).getAbsolutePath()); + ros.setSha1Digest(); + + ros.open(new ByteArrayOutputStream()); + ros.write("0123456789\n\nabcdefghij".getBytes()); + assertEquals(12, ros.getMessageBodyBegin()); + assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); + ros.close(); + + ros.open(new ByteArrayOutputStream()); + ros.write("0123456789\r\n\r\nabcdefghij".getBytes()); + assertEquals(14, ros.getMessageBodyBegin()); + assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); + ros.close(); + + ros.open(new ByteArrayOutputStream()); + ros.write("0123456789\n\r\nabcdefghij".getBytes()); + assertEquals(13, ros.getMessageBodyBegin()); + assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); + ros.close(); + + ros.open(new ByteArrayOutputStream()); + ros.write("0123456789\n".getBytes()); + assertEquals(-1, ros.getMessageBodyBegin()); + ros.write("\nabcdefghij".getBytes()); + assertEquals(12, ros.getMessageBodyBegin()); + assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); + ros.close(); + + ros.open(new ByteArrayOutputStream()); + ros.write("0123456789\n".getBytes()); + assertEquals(-1, ros.getMessageBodyBegin()); + ros.write("\r\nabcdefghij".getBytes()); + assertEquals(13, ros.getMessageBodyBegin()); + assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); + ros.close(); + + ros.open(new ByteArrayOutputStream()); + ros.write("0123456789\n\r".getBytes()); + assertEquals(-1, ros.getMessageBodyBegin()); + ros.write("\nabcdefghij".getBytes()); + assertEquals(13, ros.getMessageBodyBegin()); + assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); + ros.close(); + + ros.open(new ByteArrayOutputStream()); + ros.write("0123456789".getBytes()); + ros.write('\n'); + assertEquals(-1, ros.getMessageBodyBegin()); + ros.write("\nabcdefghij".getBytes()); + assertEquals(12, 
ros.getMessageBodyBegin()); + assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); + ros.close(); + + ros.open(new ByteArrayOutputStream()); + ros.write("0123456789".getBytes()); + ros.write('\n'); + ros.write('\n'); + for (int b: "abcdefghij".getBytes()) { + ros.write(b); + } + assertEquals(12, ros.getMessageBodyBegin()); + assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); + ros.close(); + + ros.open(new ByteArrayOutputStream()); + ros.write("0123456789".getBytes()); + ros.write('\n'); + ros.write('\r'); + ros.write('\n'); + for (int b: "abcdefghij".getBytes()) { + ros.write(b); + } + assertEquals(13, ros.getMessageBodyBegin()); + assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); + ros.close(); + + ros.open(new ByteArrayOutputStream()); + ros.write("0123456789\n".getBytes()); + ros.write('\n'); + ros.write("abcdefghij".getBytes()); + assertEquals(12, ros.getMessageBodyBegin()); + assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); + ros.close(); + + ros.open(new ByteArrayOutputStream()); + ros.write("0123456789\n\r".getBytes()); + ros.write('\n'); + ros.write("abcdefghij".getBytes()); + assertEquals(13, ros.getMessageBodyBegin()); + assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); + ros.close(); + } } From 808dcfe76002ebc126c168abb5b6f00b5d3b7e07 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 6 Jan 2015 16:08:48 -0800 Subject: [PATCH 020/240] move TmpDirTestCase.java from heritrix to webarchive-commons --- .../java/org/archive/util/TmpDirTestCase.java | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 src/main/java/org/archive/util/TmpDirTestCase.java diff --git a/src/main/java/org/archive/util/TmpDirTestCase.java b/src/main/java/org/archive/util/TmpDirTestCase.java new file mode 100644 index 00000000..09ec345b --- /dev/null +++ 
b/src/main/java/org/archive/util/TmpDirTestCase.java @@ -0,0 +1,119 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.util; + +import java.io.File; +import java.io.IOException; + +import junit.framework.TestCase; + + +/** + * Base class for TestCases that want access to a tmp dir for the writing + * of files. + * + * @author stack + */ +public abstract class TmpDirTestCase extends TestCase +{ + /** + * Name of the system property that holds pointer to tmp directory into + * which we can safely write files. + */ + public static final String TEST_TMP_SYSTEM_PROPERTY_NAME = "testtmpdir"; + + /** + * Default test tmp. + */ + public static final String DEFAULT_TEST_TMP_DIR = File.separator + "tmp" + + File.separator + "heritrix-junit-tests"; + + /** + * Directory to write temporary files to. + */ + private File tmpDir = null; + + + public TmpDirTestCase() + { + super(); + } + + public TmpDirTestCase(String testName) + { + super(testName); + } + + /* + * @see TestCase#setUp() + */ + protected void setUp() throws Exception { + super.setUp(); + this.tmpDir = tmpDir(); + } + + /** + * @return Returns the tmpDir. + */ + public File getTmpDir() + { + return this.tmpDir; + } + + /** + * Delete any files left over from previous run. 
+ * + * @param basename Base name of files we're to clean up. + */ + public void cleanUpOldFiles(String basename) { + cleanUpOldFiles(getTmpDir(), basename); + } + + /** + * Delete any files left over from previous run. + * + * @param prefix Base name of files we're to clean up. + * @param basedir Directory to start cleaning in. + */ + public void cleanUpOldFiles(File basedir, String prefix) { + File [] files = FileUtils.getFilesWithPrefix(basedir, prefix); + if (files != null) { + for (int i = 0; i < files.length; i++) { + org.apache.commons.io.FileUtils.deleteQuietly(files[i]); + } + } + } + + + public static File tmpDir() throws IOException { + String tmpDirStr = System.getProperty(TEST_TMP_SYSTEM_PROPERTY_NAME); + tmpDirStr = (tmpDirStr == null)? DEFAULT_TEST_TMP_DIR: tmpDirStr; + File tmpDir = new File(tmpDirStr); + FileUtils.ensureWriteableDirectory(tmpDir); + + if (!tmpDir.canWrite()) + { + throw new IOException(tmpDir.getAbsolutePath() + + " is unwriteable."); + } + + return tmpDir; + } +} From eda46e2554f52d0514de04b6624f81964e67289d Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 6 Jan 2015 16:24:39 -0800 Subject: [PATCH 021/240] update junit dependency since TmpDirTestCase.java is not in the "test" area --- pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/pom.xml b/pom.xml index 6664efd8..df8d0928 100644 --- a/pom.xml +++ b/pom.xml @@ -65,7 +65,6 @@ junit junit 3.8.1 - test From c77d6f5b0dcd899f5adff3db8eab87319cc162ed Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 27 Jan 2015 14:55:45 -0800 Subject: [PATCH 022/240] update CHANGES.md --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 7fb2f7c4..b872846d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -3,6 +3,7 @@ * [Escape redirect URLs in RealCDXExtractorOutput](https://github.com/iipc/webarchive-commons/pull/36) * [Tests fail on Windows](https://github.com/iipc/webarchive-commons/issues/2) * [Test fails on Java 
8](https://github.com/iipc/webarchive-commons/issues/31) +* [RecordingOutputStream can affect tcp packets sent in an undesirable way](https://github.com/iipc/webarchive-commons/issues/38) 1.1.4 ----- From 5df4d91d8cb7c4c2943c318eb44cb9579ac55597 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Wed, 4 Feb 2015 10:10:11 +0000 Subject: [PATCH 023/240] [maven-release-plugin] prepare release webarchive-commons-1.1.5 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index df8d0928..0ed119b8 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.5-SNAPSHOT + 1.1.5 jar webarchive-commons From 62ff2fefb02e9bd24d7c41945628006682c00ce1 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Wed, 4 Feb 2015 10:10:14 +0000 Subject: [PATCH 024/240] [maven-release-plugin] prepare for next development iteration --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 0ed119b8..7a32de08 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.5 + 1.1.6-SNAPSHOT jar webarchive-commons From c44e320ae6df411403a50a4bddfcdfa0c27898f7 Mon Sep 17 00:00:00 2001 From: Gerhard Gossen Date: Mon, 15 Jun 2015 13:25:59 +0200 Subject: [PATCH 025/240] Handle empty String argument in CharsetDetector.trimAttrValue --- .../java/org/archive/format/text/charset/CharsetDetector.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java index d391aac3..ae71b5fa 100644 --- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java +++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java @@ -178,6 +178,9 @@ protected String getCharsetFromMeta(byte buffer[],int len) throws IOException { } private static String trimAttrValue(String value) { + if (value.isEmpty()) { + return 
value; + } String result = value; if (result.charAt(0) == '"') { result = result.substring(1, result.length() - 1); From c7daf46e75b7d9cebee9de9f2c54560f333e6976 Mon Sep 17 00:00:00 2001 From: Andy Jackson Date: Mon, 15 Jun 2015 22:26:08 +0100 Subject: [PATCH 026/240] Update CHANGES.md --- CHANGES.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index b872846d..c43ff93e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,7 @@ +1.1.6 +----- +* [Handle empty String argument in CharsetDetector.trimAttrValue](https://github.com/iipc/webarchive-commons/pull/49) + 1.1.5 ----- * [Escape redirect URLs in RealCDXExtractorOutput](https://github.com/iipc/webarchive-commons/pull/36) From 6fcd096e6563d27244b8e37423af7b5aa58a9e6f Mon Sep 17 00:00:00 2001 From: RogerMathisen Date: Tue, 14 Jul 2015 10:36:12 +0200 Subject: [PATCH 027/240] Adding commit for: Fix issues #42 #43 #44 #45 and #47 #46 --- CHANGES.md | 6 ++ pom.xml | 2 +- .../extract/RealCDXExtractorOutput.java | 2 +- .../archive/extract/ResourceExtractor.java | 14 +++- .../WARCMetadataRecordExtractorOutput.java | 2 +- .../archive/extract/WATExtractorOutput.java | 71 ++++++++++++++++--- .../archive/resource/ResourceConstants.java | 3 +- .../archive/resource/warc/WARCResource.java | 8 ++- .../record/WARCMetaDataResourceFactory.java | 4 +- src/main/java/org/archive/util/IAUtils.java | 33 +++++++++ .../resources/org/archive/commons.properties | 5 ++ 11 files changed, 128 insertions(+), 22 deletions(-) create mode 100644 src/main/resources/org/archive/commons.properties diff --git a/CHANGES.md b/CHANGES.md index c43ff93e..70f9b052 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,12 @@ 1.1.6 ----- * [Handle empty String argument in CharsetDetector.trimAttrValue](https://github.com/iipc/webarchive-commons/pull/49) +* [WAT extractor: adding information in WAT's warcinfo](https://github.com/iipc/webarchive-commons/issues/47) +* [WAT extractor: missing WARC format 
version](https://github.com/iipc/webarchive-commons/issues/45) +* [WAT extractor: envelope structure does not conform to the WAT specification](https://github.com/iipc/webarchive-commons/issues/44) +* [WAT extractor: WARC-Date in all records should be the WAT record generation date](https://github.com/iipc/webarchive-commons/issues/43) +* [WAT extractor: WARC-Filename in the WAT warcinfo record should be the WAT filename itself](https://github.com/iipc/webarchive-commons/issues/42) +* [WAT extractor: Entity-Trailing-Slop-Bytes should be called Entity-Trailing-Slop-Length](https://github.com/iipc/webarchive-commons/issues/48) 1.1.5 ----- diff --git a/pom.xml b/pom.xml index d3314679..7984edde 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.6-SNAPSHOT + 1.1.7-SNAPSHOT jar webarchive-commons diff --git a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java index 8ca3ff82..e6f6e82f 100644 --- a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java +++ b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java @@ -104,7 +104,7 @@ public void output(Resource resource) throws IOException { String meta = "TBD"; String redir = "TBD"; - if(format.equals("WARC")) { + if(format.startsWith("WARC")) { origUrl = getWARCURL(m); date = getWARCDate(m); String type = getWARCType(m); diff --git a/src/main/java/org/archive/extract/ResourceExtractor.java b/src/main/java/org/archive/extract/ResourceExtractor.java index 7f4d6e7a..2812aa5b 100644 --- a/src/main/java/org/archive/extract/ResourceExtractor.java +++ b/src/main/java/org/archive/extract/ResourceExtractor.java @@ -1,6 +1,7 @@ package org.archive.extract; import java.io.FileNotFoundException; +import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; @@ -74,7 +75,7 @@ public int run(String[] args) if(args.length < 1) { return 
USAGE(1); } - if(args.length > 3) { + if(args.length > 4) { return USAGE(1); } int max = Integer.MAX_VALUE; @@ -89,7 +90,14 @@ public int run(String[] args) } } String path = args[arg]; - if(args.length == arg + 2) { + String outputFile = null; + if(args.length >= arg + 2) { + //if a output file is specified in the command line + if(args.length == arg + 3) { + outputFile = args[arg+2]; + os.close(); + os = new FileOutputStream(outputFile); + } if(args[arg].equals("-cdx")) { path = args[arg+1]; out = new RealCDXExtractorOutput(makePrintWriter(os)); @@ -100,7 +108,7 @@ public int run(String[] args) } else if(args[arg].equals("-wat")) { path = args[arg+1]; - out = new WATExtractorOutput(os); + out = new WATExtractorOutput(os, outputFile); } else { String filter = args[arg+1]; out = new JSONViewExtractorOutput(os, filter); diff --git a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java index ff46a914..68f9d1c8 100644 --- a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java +++ b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java @@ -68,7 +68,7 @@ public void output(Resource resource) throws IOException { String date = "TBD"; String canUrl = "TBD"; - if(format.equals("WARC")) { + if(format.startsWith("WARC")) { origUrl = getWARCURL(m); date = getWARCDate(m); String type = getWARCType(m); diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index f4d27147..3bcfa924 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -2,11 +2,13 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.nio.charset.Charset; import 
java.text.ParseException; +import java.net.UnknownHostException; import java.util.Date; import org.archive.format.gzip.GZIPMemberWriter; @@ -22,6 +24,12 @@ import org.archive.util.io.CommitedOutputStream; import org.json.JSONException; +import java.net.InetAddress; +import java.text.DateFormat; +import java.text.SimpleDateFormat; + +import java.util.logging.Logger; + public class WATExtractorOutput implements ExtractorOutput { WARCRecordWriter recW; private boolean wroteFirst; @@ -29,11 +37,15 @@ public class WATExtractorOutput implements ExtractorOutput { private static int DEFAULT_BUFFER_RAM = 1024 * 1024; private int bufferRAM = DEFAULT_BUFFER_RAM; private final static Charset UTF8 = Charset.forName("UTF-8"); + private String outputFile; + + private static final Logger LOG = Logger.getLogger(WATExtractorOutput.class.getName()); - public WATExtractorOutput(OutputStream out) { + public WATExtractorOutput(OutputStream out, String outputFile) { gzW = new GZIPMemberWriter(out); recW = new WARCRecordWriter(); wroteFirst = false; + this.outputFile = outputFile; } private CommitedOutputStream getOutput() { @@ -56,9 +68,9 @@ public void output(Resource resource) throws IOException { throw new IOException("Missing Envelope.Format"); } cos = getOutput(); - if(envelopeFormat.equals("ARC")) { + if(envelopeFormat.startsWith("ARC")) { writeARC(cos,top); - } else if(envelopeFormat.equals("WARC")) { + } else if(envelopeFormat.startsWith("WARC")) { writeWARC(cos,top); } else { // hrm... 
@@ -68,13 +80,51 @@ public void output(Resource resource) throws IOException { } private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException { - String filename = JSONUtils.extractSingle(md, "Container.Filename"); - if(filename == null) { - throw new IOException("No Container.Filename..."); + // filename is given in the command line + String filename = outputFile; + if (filename == null || filename.length() == 0) { + // if no filename by command line, we construct a default filename base on container filename + filename = JSONUtils.extractSingle(md, "Container.Filename"); + if (filename == null) { + throw new IOException("No Container.Filename..."); + } + if (filename.endsWith(".warc") || filename.endsWith(".warc.gz")) { + filename = filename.replaceFirst("\\.warc$", ".warc.wat.gz"); + filename = filename.replaceFirst("\\.warc\\.gz$", ".warc.wat.gz"); + } else if (filename.endsWith(".arc") || filename.endsWith(".arc.gz")) { + filename = filename.replaceFirst("\\.arc$", ".arc.wat.gz"); + filename = filename.replaceFirst("\\.arc\\.gz$", ".arc.wat.gz"); + } } + // removing path from filename + File tmpFile = new File(filename); + filename = tmpFile.getName(); HttpHeaders headers = new HttpHeaders(); - headers.add("Software-Info", IAUtils.COMMONS_VERSION); - headers.addDateHeader("Extracted-Date", new Date()); + headers.add("software", IAUtils.COMMONS_VERSION); + headers.addDateHeader("extractedDate", new Date()); + + // add ip, hostname + try { + InetAddress host = InetAddress.getLocalHost(); + headers.add("ip", host.getHostAddress()); + headers.add("hostname", host.getCanonicalHostName()); + } catch (UnknownHostException e) { + LOG.warning("unable to obtain local crawl engine host :\n"+e.getMessage()); + } + + headers.add("format", IAUtils.WARC_FORMAT); + headers.add("conformsTo", IAUtils.WARC_FORMAT_CONFORMS_TO); + // optional arguments + if(IAUtils.OPERATOR != null && IAUtils.OPERATOR.length() > 0) { + headers.add("operator", IAUtils.OPERATOR); 
+ } + if(IAUtils.PUBLISHER != null && IAUtils.PUBLISHER.length() > 0) { + headers.add("publisher", IAUtils.PUBLISHER); + } + if(IAUtils.WAT_WARCINFO_DESCRIPTION != null && IAUtils.WAT_WARCINFO_DESCRIPTION.length() > 0) { + headers.add("description", IAUtils.WAT_WARCINFO_DESCRIPTION); + } + ByteArrayOutputStream baos = new ByteArrayOutputStream(); headers.write(baos); recW.writeWARCInfoRecord(recOut,filename,baos.toByteArray()); @@ -105,8 +155,9 @@ private void writeWARC(OutputStream recOut, MetaData md) throws IOException { } else { targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI"); } - String capDateString = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Date"); - capDateString = transformWARCDate(capDateString); + // handle date of generation in WARC format + DateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss"); + String capDateString = dateFormat.format(new Date()); String recId = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Record-ID"); writeWARCMDRecord(recOut,md,targetURI,capDateString,recId); } diff --git a/src/main/java/org/archive/resource/ResourceConstants.java b/src/main/java/org/archive/resource/ResourceConstants.java index dd04fcfe..3b8bea1c 100644 --- a/src/main/java/org/archive/resource/ResourceConstants.java +++ b/src/main/java/org/archive/resource/ResourceConstants.java @@ -31,6 +31,7 @@ public interface ResourceConstants { public static final String ENVELOPE_FORMAT = "Format"; public static final String ENVELOPE_FORMAT_ARC = "ARC"; public static final String ENVELOPE_FORMAT_WARC = "WARC"; + public static final String ENVELOPE_FORMAT_WARC_1_0 = "WARC/1.0"; public static final String WARC_HEADER_LENGTH = "WARC-Header-Length"; public static final String WARC_HEADER_METADATA = "WARC-Header-Metadata"; @@ -104,7 +105,7 @@ public interface ResourceConstants { public static final String HTTP_ENTITY_LENGTH = "Entity-Length"; public static final String HTTP_ENTITY_DIGEST = "Entity-Digest"; - public static 
final String HTTP_ENTITY_TRAILING_SLOP = "Entity-Trailing-Slop-Bytes"; + public static final String HTTP_ENTITY_TRAILING_SLOP = "Entity-Trailing-Slop-Length"; public static final String HTML_METADATA = "HTML-Metadata"; public static final String HTML_HEAD = "Head"; diff --git a/src/main/java/org/archive/resource/warc/WARCResource.java b/src/main/java/org/archive/resource/warc/WARCResource.java index 80929206..d538a25d 100644 --- a/src/main/java/org/archive/resource/warc/WARCResource.java +++ b/src/main/java/org/archive/resource/warc/WARCResource.java @@ -36,7 +36,7 @@ public WARCResource(MetaData metaData, ResourceContainer container, this.response = response; long length = -1; - metaData.putString(ENVELOPE_FORMAT, ENVELOPE_FORMAT_WARC); + metaData.putString(ENVELOPE_FORMAT, ENVELOPE_FORMAT_WARC_1_0); metaData.putLong(WARC_HEADER_LENGTH, response.getHeaderBytes()); MetaData fields = metaData.createChild(WARC_HEADER_METADATA); for(HttpHeader h : response.getHeaders()) { @@ -68,11 +68,11 @@ public InputStream getInputStream() { } public void notifyEOF() throws IOException { - envelope.putLong(PAYLOAD_LENGTH, countingIS.getCount()); String digString = Base32.encode(digIS.getMessageDigest().digest()); - envelope.putString(PAYLOAD_DIGEST, "sha1:"+digString); if(container.isCompressed()) { + metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount()); metaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(response)); + metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString); } else { // consume trailing bytes if we can... 
InputStream raw = response.getInner(); @@ -81,7 +81,9 @@ public void notifyEOF() throws IOException { (PushBackOneByteInputStream) raw; long numNewlines = StreamCopy.skipChars(pb1bis, CR_NL_CHARS); if(numNewlines > 0) { + metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount()); metaData.putLong(PAYLOAD_SLOP_BYTES, numNewlines); + metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString); } } } diff --git a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java index 3f502665..0dfb2834 100644 --- a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java +++ b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java @@ -33,8 +33,8 @@ public Resource getResource(InputStream is, MetaData parentMetaData, if(headers.isCorrupt()) { md.putBoolean(WARC_META_FIELDS_CORRUPT, true); } - md.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is)); - md.putLong(PAYLOAD_LENGTH, bytes); + parentMetaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is)); + parentMetaData.putLong(PAYLOAD_LENGTH, bytes); return new WARCMetaDataResource(md,container, headers); } catch (HttpParseException e) { diff --git a/src/main/java/org/archive/util/IAUtils.java b/src/main/java/org/archive/util/IAUtils.java index ed563d02..d3cf5cf9 100644 --- a/src/main/java/org/archive/util/IAUtils.java +++ b/src/main/java/org/archive/util/IAUtils.java @@ -24,7 +24,10 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.io.Reader; +import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; +import java.util.Properties; /** * Miscellaneous useful methods. 
@@ -35,6 +38,11 @@ public class IAUtils { public final static Charset UTF8 = Charset.forName("utf-8"); final public static String COMMONS_VERSION = loadCommonsVersion(); + final public static String PUBLISHER = loadCommons("publisher"); + final public static String OPERATOR = loadCommons("operator"); + final public static String WAT_WARCINFO_DESCRIPTION = loadCommons("wat.warcinfo.description"); + final public static String WARC_FORMAT = loadCommons("warc.format"); + final public static String WARC_FORMAT_CONFORMS_TO = loadCommons("warc.format.conforms.to"); public static String loadCommonsVersion() { InputStream input = IAUtils.class.getResourceAsStream( @@ -57,6 +65,31 @@ public static String loadCommonsVersion() { return version.trim(); } + public static String loadCommons(String id) { + InputStream input = IAUtils.class.getResourceAsStream("/org/archive/commons.properties"); + Reader reader = null; + if (input == null) { + return "UNKNOWN"; + } + try { + reader = new InputStreamReader(input, "UTF-8"); + } catch (UnsupportedEncodingException e) { + return "UNKNOWN"; + } + Properties prop = new Properties(); + try { + prop.load(reader); + } catch (IOException e1) { + return "UNKNOWN"; + } + if (prop.getProperty(id) != null) { + return prop.getProperty(id); + } else { + return "UNKNOWN"; + } + + } + public static void closeQuietly(Object input) { if(input == null || ! 
(input instanceof Closeable)) { return; diff --git a/src/main/resources/org/archive/commons.properties b/src/main/resources/org/archive/commons.properties new file mode 100644 index 00000000..f115ff43 --- /dev/null +++ b/src/main/resources/org/archive/commons.properties @@ -0,0 +1,5 @@ +operator= +publisher= +wat.warcinfo.description= +warc.format=WARC File Format 1.0 +warc.format.conforms.to=http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf From d4e7730db7c3fc49c55995bfbdf5b5d89f9e2145 Mon Sep 17 00:00:00 2001 From: RogerMathisen Date: Tue, 14 Jul 2015 11:19:58 +0200 Subject: [PATCH 028/240] [maven-release-plugin] prepare release webarchive-commons-1.1.6 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 7984edde..27118d70 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.7-SNAPSHOT + 1.1.6 jar webarchive-commons From 7a7cf08941d966c6e3a6fcd42f1a886552d23038 Mon Sep 17 00:00:00 2001 From: RogerMathisen Date: Tue, 14 Jul 2015 11:20:38 +0200 Subject: [PATCH 029/240] [maven-release-plugin] prepare for next development iteration --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 27118d70..7984edde 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.6 + 1.1.7-SNAPSHOT jar webarchive-commons From c1545bc7bee9c9bbd8626cf1b4b8d323bd415f2c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 28 Oct 2015 11:45:48 -0700 Subject: [PATCH 030/240] fix for HER-2089 - get rid of broken, seemingly unnecessary escapeWhitespace() step of uri fixup --- .../org/archive/url/UsableURIFactory.java | 52 +------------------ .../org/archive/url/UsableURIFactoryTest.java | 8 ++- 2 files changed, 9 insertions(+), 51 deletions(-) diff --git a/src/main/java/org/archive/url/UsableURIFactory.java b/src/main/java/org/archive/url/UsableURIFactory.java index 9118b850..1059bfbd 100644 
--- a/src/main/java/org/archive/url/UsableURIFactory.java +++ b/src/main/java/org/archive/url/UsableURIFactory.java @@ -49,8 +49,8 @@ * @author stack */ public class UsableURIFactory extends URI { - - private static final long serialVersionUID = -6146295130382209042L; + + private static final long serialVersionUID = 2L; /** * Logging instance. @@ -395,9 +395,6 @@ private String fixup(String uri, final URI base, final String charset) } TextUtils.recycleMatcher(matcher); - // now, minimally escape any whitespace - uri = escapeWhitespace(uri); - // For further processing, get uri elements. See the RFC2396REGEX // comment above for explanation of group indices used in the below. // matcher = RFC2396REGEX.matcher(uri); @@ -663,51 +660,6 @@ private String ensureMinimalEscaping(String u, final String charset, return u; } - /** - * Escape any whitespace found. - * - * The parent class takes care of the bulk of escaping. But if any - * instance of escaping is found in the URI, then we ask for parent - * to do NO escaping. Here we escape any whitespace found irrespective - * of whether the uri has already been escaped. We do this for - * case where uri has been judged already-escaped only, its been - * incompletly done and whitespace remains. Spaces, etc., in the URI are - * a real pain. Their presence will break log file and ARC parsing. - * @param uri URI string to check. - * @return uri with spaces escaped if any found. - */ - protected String escapeWhitespace(String uri) { - // Just write a new string anyways. The perl '\s' is not - // as inclusive as the Character.isWhitespace so there are - // whitespace characters we could miss. So, rather than - // write some awkward regex, just go through the string - // a character at a time. Only create buffer first time - // we find a space. 
- MutableString buffer = null; - for (int i = 0; i < uri.length(); i++) { - char c = uri.charAt(i); - if (Character.isWhitespace(c)) { - if (buffer == null) { - buffer = new MutableString(uri.length() + - 2 /*If space, two extra characters (at least)*/); - buffer.append(uri.substring(0, i)); - } - buffer.append("%"); - String hexStr = Integer.toHexString(c); - if ((hexStr.length() % 2) > 0) { - buffer.append("0"); - } - buffer.append(hexStr); - - } else { - if (buffer != null) { - buffer.append(c); - } - } - } - return (buffer != null)? buffer.toString(): uri; - } - /** * Check port on passed http authority. Make sure the size is not larger * than allowed: See the 'port' definition on this diff --git a/src/test/java/org/archive/url/UsableURIFactoryTest.java b/src/test/java/org/archive/url/UsableURIFactoryTest.java index af190957..73f2b6db 100644 --- a/src/test/java/org/archive/url/UsableURIFactoryTest.java +++ b/src/test/java/org/archive/url/UsableURIFactoryTest.java @@ -174,7 +174,7 @@ public final void testWhitespaceEscaped() throws URIException { assertTrue("Not equal " + uuri.toString(), uuri.toString().equals(tgtUri)); uri = "http://archive.org/index%25\u001D.html"; - tgtUri = "http://archive.org/index%25%1D.html".toLowerCase(); + tgtUri = "http://archive.org/index%25%1D.html"; uuri = UsableURIFactory.getInstance(uri); assertEquals("whitespace escaping", tgtUri, uuri.toString()); uri = "http://gemini.info.usaid.gov/directory/" + @@ -185,6 +185,12 @@ public final void testWhitespaceEscaped() throws URIException { "faxResults.cfm?name=Ebenezer +Rumplestiltskin,&location=" + "RRB%20%20%20%205%2E08%2D006"); assertEquals("whitespace escaping", tgtUri, uuri.toString()); + + // https://webarchive.jira.com/browse/HER-2089 + uri = "http://archive.org/index%25\u3000.html"; + tgtUri = "http://archive.org/index%25%E3%80%80.html"; + uuri = UsableURIFactory.getInstance(uri); + assertEquals("U+3000 ideographic space escaping", tgtUri, uuri.toString()); } // public final void 
testFailedGetPath() throws URIException { From 86589b0fafaa0918ce2192080e68941c47b39c40 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 14 Dec 2015 10:38:49 -0800 Subject: [PATCH 031/240] flush output etc before tallying stats to fix sizeOnDisk calculation --- src/main/java/org/archive/io/warc/WARCWriter.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/archive/io/warc/WARCWriter.java b/src/main/java/org/archive/io/warc/WARCWriter.java index e2d28ee9..7e22e08b 100644 --- a/src/main/java/org/archive/io/warc/WARCWriter.java +++ b/src/main/java/org/archive/io/warc/WARCWriter.java @@ -236,8 +236,8 @@ public void writeRecord(WARCRecordInfo recordInfo) long totalBytes = 0; long startPosition; - try { - startPosition = getPosition(); + startPosition = getPosition(); + try { preWriteRecordTasks(); // TODO: Revisit encoding of header. @@ -261,13 +261,12 @@ public void writeRecord(WARCRecordInfo recordInfo) write(CRLF_BYTES); totalBytes += 2 * CRLF_BYTES.length; - tally(recordInfo.getType(), contentBytes, totalBytes, getPosition() - startPosition); - recordInfo.setWARCFilename(getFilenameWithoutOccupiedSuffix()); recordInfo.setWARCFileOffset(startPosition); tmpRecordLog.add(recordInfo); } finally { postWriteRecordTasks(); + tally(recordInfo.getType(), contentBytes, totalBytes, getPosition() - startPosition); } } From 1fede2354f65437825b6471261a8f0361ffba241 Mon Sep 17 00:00:00 2001 From: Jeremy Wiebe Date: Mon, 7 Mar 2016 22:35:28 -0500 Subject: [PATCH 032/240] Store origin-code in ARCRecord header; accessible through getOrigin() method. 
--- .../org/archive/format/ArchiveFileConstants.java | 7 ++++++- .../java/org/archive/format/arc/ARCConstants.java | 2 +- src/main/java/org/archive/io/arc/ARCRecord.java | 13 +++++++++---- .../java/org/archive/io/arc/ARCRecordMetaData.java | 9 ++++++++- 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/archive/format/ArchiveFileConstants.java b/src/main/java/org/archive/format/ArchiveFileConstants.java index b0b8aa66..df3b4465 100644 --- a/src/main/java/org/archive/format/ArchiveFileConstants.java +++ b/src/main/java/org/archive/format/ArchiveFileConstants.java @@ -44,6 +44,11 @@ public interface ArchiveFileConstants { * Key for the Archive File version field. */ public static final String VERSION_FIELD_KEY = "version"; + + /** + * Key for the Archive File origin-code field. + */ + public static final String ORIGIN_FIELD_KEY = "origin"; /** * Key for the Archive File length field. @@ -80,7 +85,7 @@ public interface ArchiveFileConstants { * Key for the Archive Record absolute offset into Archive file. */ public static final String ABSOLUTE_OFFSET_KEY = "absolute-offset"; - + public static final String READER_IDENTIFIER_FIELD_KEY = "reader-identifier"; diff --git a/src/main/java/org/archive/format/arc/ARCConstants.java b/src/main/java/org/archive/format/arc/ARCConstants.java index a336ddeb..5987b49f 100755 --- a/src/main/java/org/archive/format/arc/ARCConstants.java +++ b/src/main/java/org/archive/format/arc/ARCConstants.java @@ -196,7 +196,7 @@ public interface ARCConstants extends ArchiveFileConstants { .asList(new String[] { URL_FIELD_KEY, IP_HEADER_FIELD_KEY, DATE_FIELD_KEY, MIMETYPE_FIELD_KEY, LENGTH_FIELD_KEY, VERSION_FIELD_KEY, - ABSOLUTE_OFFSET_KEY }); + ORIGIN_FIELD_KEY, ABSOLUTE_OFFSET_KEY }); /** * Minimum possible record length. 
diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java index 21bea07c..2d9c9bf4 100644 --- a/src/main/java/org/archive/io/arc/ARCRecord.java +++ b/src/main/java/org/archive/io/arc/ARCRecord.java @@ -200,7 +200,7 @@ public ARCRecord(InputStream in, ArchiveRecordHeader metaData, public ARCRecord(InputStream in, final String identifier, final long offset, boolean digest, boolean strict, final boolean parseHttpHeaders, - final boolean isAlignedOnFirstRecord, String version) + final boolean isAlignedOnFirstRecord, String version) throws IOException { super(in, null, 0, digest, strict); setHeader(parseHeaders(in, identifier, offset, strict, isAlignedOnFirstRecord, version)); @@ -243,6 +243,7 @@ private ArchiveRecordHeader parseHeaders(final InputStream in, getTokenizedHeaderLine(in, firstLineValues); int bodyOffset = 0; + String origin = ""; if (offset == 0 && isAlignedOnFirstRecord) { // If offset is zero and we were aligned at first record on // creation (See #alignedOnFirstRecord for more on this), then no @@ -263,6 +264,7 @@ private ArchiveRecordHeader parseHeaders(final InputStream in, bodyOffset += getTokenizedHeaderLine(in, secondLineValues); version = ((String)secondLineValues.get(0) + "." + (String)secondLineValues.get(1)); + origin = (String)secondLineValues.get(2); // Just read over the 3rd line. We used to parse it and use // values found here but now we just hardcode them to avoid // having to read this 3rd line even for random arc file accesses. @@ -271,7 +273,8 @@ private ArchiveRecordHeader parseHeaders(final InputStream in, } setBodyOffset(bodyOffset); - return computeMetaData(this.headerFieldNameKeys, firstLineValues, version, offset, identifier); + return computeMetaData(this.headerFieldNameKeys, firstLineValues, + version, origin, offset, identifier); } /** @@ -362,7 +365,8 @@ private int getTokenizedHeaderLine(final InputStream stream, * @exception IOException If no. of keys doesn't match no. 
of values. */ private ARCRecordMetaData computeMetaData(List keys, - List values, String v, long offset, final String identifier) + List values, String v, String origin, + long offset, final String identifier) throws IOException { if (keys.size() != values.size()) { List originalValues = values; @@ -423,6 +427,7 @@ private ARCRecordMetaData computeMetaData(List keys, } headerFields.put(VERSION_FIELD_KEY, v); + headerFields.put(ORIGIN_FIELD_KEY, origin); headerFields.put(ABSOLUTE_OFFSET_KEY, new Long(offset)); return new ARCRecordMetaData(identifier, headerFields); @@ -832,4 +837,4 @@ protected String getDigest4Cdx(ArchiveRecordHeader h) { } return (result != null) ? result: super.getDigest4Cdx(h); } -} \ No newline at end of file +} diff --git a/src/main/java/org/archive/io/arc/ARCRecordMetaData.java b/src/main/java/org/archive/io/arc/ARCRecordMetaData.java index 3f617041..02b368e4 100644 --- a/src/main/java/org/archive/io/arc/ARCRecordMetaData.java +++ b/src/main/java/org/archive/io/arc/ARCRecordMetaData.java @@ -168,6 +168,13 @@ public String getVersion() { return (String)this.headerFields.get(VERSION_FIELD_KEY); } + /** + * @return Arcfile origin code. + */ + public String getOrigin() { + return (String)this.headerFields.get(ORIGIN_FIELD_KEY); + } + /** * @return Offset into arcfile at which this record begins. 
*/ @@ -264,4 +271,4 @@ public int getContentBegin() { protected void setContentBegin(final int offset) { this.contentBegin = offset; } -} \ No newline at end of file +} From 28c9a1b2b04c9f392247690c7112ae20882d8cbc Mon Sep 17 00:00:00 2001 From: Jeremy Wiebe Date: Fri, 11 Mar 2016 11:45:42 -0500 Subject: [PATCH 033/240] Update ArchiveFileConstants.java --- src/main/java/org/archive/format/ArchiveFileConstants.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/archive/format/ArchiveFileConstants.java b/src/main/java/org/archive/format/ArchiveFileConstants.java index df3b4465..89e1308c 100644 --- a/src/main/java/org/archive/format/ArchiveFileConstants.java +++ b/src/main/java/org/archive/format/ArchiveFileConstants.java @@ -46,7 +46,7 @@ public interface ArchiveFileConstants { public static final String VERSION_FIELD_KEY = "version"; /** - * Key for the Archive File origin-code field. + * Key for the Archive File origin-code field. This value is often hard-coded, so use with care. */ public static final String ORIGIN_FIELD_KEY = "origin"; From 7ef8aa95bc758d96b60a30d036dd0c32de20937c Mon Sep 17 00:00:00 2001 From: Jeremy Wiebe Date: Mon, 14 Mar 2016 17:10:23 -0400 Subject: [PATCH 034/240] Update CHANGES.md --- CHANGES.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 70f9b052..3c9f4c8b 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,7 @@ +1.1.7 +----- +* [Store origin-code of ARC file header](https://github.com/iipc/webarchive-commons/pull/52/) + 1.1.6 ----- * [Handle empty String argument in CharsetDetector.trimAttrValue](https://github.com/iipc/webarchive-commons/pull/49) From 5cfff50a03263208520ca2d260229eefb2aec2f7 Mon Sep 17 00:00:00 2001 From: Hunter Stern Date: Mon, 21 Mar 2016 17:30:30 -0700 Subject: [PATCH 035/240] Make canonicalizer be able to strip session id params even if they are the first params in the query string. And add session id strip test. 
And change IAURLCanonicalizer.java to ensure that if after transformations on the query string have completed and the query is empty, there is not a ? added to the end of the url. --- .../org/archive/url/IAURLCanonicalizer.java | 35 +++++++++---------- .../org/archive/url/URLRegexTransformer.java | 10 +++--- .../archive/url/IAURLCanonicalizerTest.java | 10 ++++++ 3 files changed, 32 insertions(+), 23 deletions(-) diff --git a/src/main/java/org/archive/url/IAURLCanonicalizer.java b/src/main/java/org/archive/url/IAURLCanonicalizer.java index 029598f6..0cf7c8a4 100644 --- a/src/main/java/org/archive/url/IAURLCanonicalizer.java +++ b/src/main/java/org/archive/url/IAURLCanonicalizer.java @@ -63,25 +63,24 @@ public void canonicalize(HandyURL url) { String query = url.getQuery(); if(query != null) { - if(query.equals("")) { - if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_EMPTY)) { - query = null; - } - } else { - // we have a query... what to do with it? + // we have a query... what to do with it? - // first remove uneeded: - if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_SESSION_ID)) { - query = URLRegexTransformer.stripQuerySessionID(query); - } - // lower-case: - if(rules.isSet(QUERY_SETTINGS, QUERY_LOWERCASE)) { - query = query.toLowerCase(); - } - // re-order? - if(rules.isSet(QUERY_SETTINGS, QUERY_ALPHA_REORDER)) { - query = alphaReorderQuery(query); - } + // first remove uneeded: + if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_SESSION_ID)) { + query = URLRegexTransformer.stripQuerySessionID(query); + } + // lower-case: + if(rules.isSet(QUERY_SETTINGS, QUERY_LOWERCASE)) { + query = query.toLowerCase(); + } + // re-order? 
+ if(rules.isSet(QUERY_SETTINGS, QUERY_ALPHA_REORDER)) { + query = alphaReorderQuery(query); + } + if(query.equals("")) { + if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_EMPTY)) { + query = null; + } } url.setQuery(query); } diff --git a/src/main/java/org/archive/url/URLRegexTransformer.java b/src/main/java/org/archive/url/URLRegexTransformer.java index c5505a74..617e0225 100644 --- a/src/main/java/org/archive/url/URLRegexTransformer.java +++ b/src/main/java/org/archive/url/URLRegexTransformer.java @@ -16,11 +16,11 @@ public class URLRegexTransformer { private static final OptimizedPattern QUERY_OPTS[] = { - new OptimizedPattern("(?i)^(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), - new OptimizedPattern("(?i)^(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), - new OptimizedPattern("(?i)^(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), - new OptimizedPattern("(?i)^(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), - new OptimizedPattern("(?i)^(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", "cftoken=", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", "cftoken=", 1, 2), }; diff --git a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java index 3263edc7..91751b4a 100644 --- a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java +++ b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java @@ -53,5 +53,15 @@ public void testGetDefaultPort() { assertEquals(80,IAURLCanonicalizer.getDefaultPort("http")); 
assertEquals(443,IAURLCanonicalizer.getDefaultPort("https")); } + + public void testStripSessionId() throws URISyntaxException { + IAURLCanonicalizer iaC = new IAURLCanonicalizer(new DefaultIACanonicalizerRules()); + compCan(iaC, + "http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008&jsessionid=f030eacc7e49c4ca0b077922347418418766", + "http://nsf.gov/statistics/sed/2009/sed_2009.zip?jsessionid=f030eacc7e49c4ca0b077922347418418766"); + compCan(iaC, + "http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008", + "http://nsf.gov/statistics/sed/2009/sed_2009.zip"); + } } From 02e6e29fb735b1fdd0957196d264b40d29e6fa6d Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Fri, 17 Jun 2016 09:54:36 +0200 Subject: [PATCH 036/240] Updated release notes --- CHANGES.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 3c9f4c8b..52c40f42 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,9 @@ 1.1.7 ----- +* [Make canonicalizer be able to strip session id params even if they are the first params in the query string](https://github.com/iipc/webarchive-commons/pull/54) * [Store origin-code of ARC file header](https://github.com/iipc/webarchive-commons/pull/52/) +* [Flush output etc before tallying stats to fix sizeOnDisk calculation](https://github.com/iipc/webarchive-commons/pull/51) +* [Get rid of broken, seemingly unnecessary escapeWhitespace() step of uri fixup](https://github.com/iipc/webarchive-commons/pull/50) 1.1.6 ----- From a55391dfe1855259939d118c49b84cf386c0960f Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Fri, 17 Jun 2016 10:25:23 +0200 Subject: [PATCH 037/240] [maven-release-plugin] prepare release webarchive-commons-1.1.7 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 7984edde..f842a09c 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.7-SNAPSHOT + 1.1.7 jar 
webarchive-commons From bb36b6a7375453e1cb8073211041ca3f955ab217 Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Fri, 17 Jun 2016 10:25:28 +0200 Subject: [PATCH 038/240] [maven-release-plugin] prepare for next development iteration --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index f842a09c..24780063 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.7 + 1.1.8-SNAPSHOT jar webarchive-commons From 0cbca57bc87f9bd55844977a480ead400a40920d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristinn=20Sigur=C3=B0sson?= Date: Wed, 13 Jul 2016 12:21:42 +0000 Subject: [PATCH 039/240] Remove invalid constant The PROFILE_REVISIT_URI_AGNOSTIC_IDENTICAL_DIGEST does not exist in the WARC specification. This file shouldn't include non-standard items. And, in any case, use of PROFILE_REVISIT_IDENTICAL_DIGEST is appropriate, even when using 'uri agnostic' deduplication. --- src/main/java/org/archive/format/warc/WARCConstants.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/java/org/archive/format/warc/WARCConstants.java b/src/main/java/org/archive/format/warc/WARCConstants.java index c9f6cbf3..93a81f96 100644 --- a/src/main/java/org/archive/format/warc/WARCConstants.java +++ b/src/main/java/org/archive/format/warc/WARCConstants.java @@ -183,8 +183,6 @@ enum WARCRecordType { public static final String HEADER_KEY_REFERS_TO_FILENAME = "WARC-Refers-To-Filename"; public static final String HEADER_KEY_REFERS_TO_FILE_OFFSET = "WARC-Refers-To-File-Offset"; - public static final String PROFILE_REVISIT_URI_AGNOSTIC_IDENTICAL_DIGEST = - "http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest"; public static final String PROFILE_REVISIT_IDENTICAL_DIGEST = "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest"; public static final String PROFILE_REVISIT_NOT_MODIFIED = From a23cfebe24a959c929b1fcf9fbb6fc37eae31c76 Mon Sep 17 00:00:00 2001 From: 
Sebastian Nagel Date: Sun, 7 Aug 2016 16:49:47 +0200 Subject: [PATCH 040/240] Make regular expression to extract URLs from CSS more restrictive (allow only `"`, `'`, `\"` or `\'` in front of or after the URL). Avoid long-runners when matching the regex due to heavy back-tracking. --- .../java/org/archive/resource/html/ExtractingParseObserver.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index e1f57b55..df3742fa 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -21,7 +21,7 @@ public class ExtractingParseObserver implements ParseObserver { boolean inTitle = false; protected static String cssUrlPatString = - "url\\s*\$\\s*([\\\\\"']*.+?[\\\\\"']*)\\s*\$"; + "url\\s*\$\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\$"; protected static String cssImportNoUrlPatString = "@import\\s+(('[^']+')|(\"[^\"]+\")|(\$'[^']+'\$)|(\$\"[^\"]+\"\$)|(\$[^)]+\$)|([a-z0-9_.:/\\\\-]+))\\s*;"; From 9d7abed43aef409e19842f875914c50a0b58ccf8 Mon Sep 17 00:00:00 2001 From: David Portabella Date: Wed, 21 Sep 2016 11:54:18 +0200 Subject: [PATCH 041/240] fix: last header was lost if LF LF (intead of CRLF CRLF) --- .../archive/format/http/HttpHeaderParser.java | 1 + .../format/http/HttpResponseParserTest.java | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/src/main/java/org/archive/format/http/HttpHeaderParser.java b/src/main/java/org/archive/format/http/HttpHeaderParser.java index fdec62f2..d63ec405 100755 --- a/src/main/java/org/archive/format/http/HttpHeaderParser.java +++ b/src/main/java/org/archive/format/http/HttpHeaderParser.java @@ -231,6 +231,7 @@ public ParseState handleByte(byte b, HttpHeaderParser parser) if(b == LF) { // TODO: this is lax, is LFLF an OK terminator? // that's all folks! 
+ parser.headerFinished(); parser.parseFinished(); return parser.endState; } diff --git a/src/test/java/org/archive/format/http/HttpResponseParserTest.java b/src/test/java/org/archive/format/http/HttpResponseParserTest.java index 2850fe44..c0d13230 100644 --- a/src/test/java/org/archive/format/http/HttpResponseParserTest.java +++ b/src/test/java/org/archive/format/http/HttpResponseParserTest.java @@ -38,4 +38,23 @@ public void testParse() throws IOException { } + public void testParseWithLf() throws IOException { + + HttpResponseParser parser = new HttpResponseParser(); + String message = "200 OK\nContent-Type: text/plain\n\nHi there"; + try { + HttpResponse response = + parser.parse(new ByteArrayInputStream(message.getBytes(IAUtils.UTF8))); + assertNotNull(response); + HttpHeaders headers = response.getHeaders(); + assertNotNull(headers); + assertEquals(1,headers.size()); + + } catch (HttpParseException e) { + e.printStackTrace(); + fail(); + } + + } + } From 5f223d60c365a53533b2ad7217deaa65b3a91667 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 24 Nov 2016 11:51:10 +0100 Subject: [PATCH 042/240] Use CharsetDetector to guess encoding of HTML document --- .../resource/html/HTMLResourceFactory.java | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java index 935843f1..34062ed9 100644 --- a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java +++ b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java @@ -1,9 +1,14 @@ package org.archive.resource.html; +import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; +import org.archive.format.http.HttpHeaders; +import org.archive.format.json.JSONUtils; +import org.archive.format.text.charset.CharsetDetector; +import 
org.archive.format.text.charset.StandardCharsetDetector; import org.archive.format.text.html.CDATALexer; import org.archive.format.text.html.LexParser; import org.archive.resource.MetaData; @@ -13,17 +18,40 @@ import org.archive.resource.ResourceParseException; import org.htmlparser.lexer.Page; import org.htmlparser.util.ParserException; +import org.json.JSONException; +import org.json.JSONObject; public class HTMLResourceFactory implements ResourceFactory { + protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192; + protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers"; + + protected CharsetDetector charSetDetector = new StandardCharsetDetector(); + + public Resource getResource(InputStream is, MetaData parentMetaData, ResourceContainer container) throws ResourceParseException, IOException { HTMLMetaData hmd = new HTMLMetaData(parentMetaData); ExtractingParseObserver epo = new ExtractingParseObserver(hmd); LexParser parser = new LexParser(epo); CDATALexer lex = new CDATALexer(); - // TODO: figure out charset: - String charset = "UTF-8"; + + // guess charset based on HTTP header and sniffed content chunk + is = new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE); + byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE]; + is.mark(0); + int chunkSize = is.read(chunk, 0, CHARSET_GUESS_CHUNK_SIZE); + is.reset(); + JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH); + HttpHeaders httpHeaders = new HttpHeaders(); + if (headers.has("Content-Type")) { + try { + httpHeaders.add("Content-Type", headers.getString("Content-Type")); + } catch (JSONException e) { } + } + + String charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders); + Page page; try { page = new Page(is, charset); From 607acaa734183b72c816359c588bbf157485d5ba Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 24 Nov 2016 12:44:53 +0100 Subject: [PATCH 043/240] HTML encoding detection: fix errors with 
empty content or empty charset values --- .../format/text/charset/CharsetDetector.java | 2 ++ .../resource/html/HTMLResourceFactory.java | 24 +++++++++++++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java index ae71b5fa..0534ff85 100644 --- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java +++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java @@ -182,6 +182,8 @@ private static String trimAttrValue(String value) { return value; } String result = value; + if (result.isEmpty()) + return result; if (result.charAt(0) == '"') { result = result.substring(1, result.length() - 1); } else if (result.charAt(0) == '\'') { diff --git a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java index 34062ed9..afb1c850 100644 --- a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java +++ b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java @@ -5,6 +5,8 @@ import java.io.InputStream; import java.io.UnsupportedEncodingException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.archive.format.http.HttpHeaders; import org.archive.format.json.JSONUtils; import org.archive.format.text.charset.CharsetDetector; @@ -23,6 +25,8 @@ public class HTMLResourceFactory implements ResourceFactory { + public static final Log LOG = LogFactory.getLog(HTMLResourceFactory.class); + protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192; protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers"; @@ -37,21 +41,27 @@ public Resource getResource(InputStream is, MetaData parentMetaData, CDATALexer lex = new CDATALexer(); // guess charset based on HTTP header and sniffed content chunk + String charset = "UTF-8"; is = 
new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE); byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE]; is.mark(0); int chunkSize = is.read(chunk, 0, CHARSET_GUESS_CHUNK_SIZE); is.reset(); - JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH); - HttpHeaders httpHeaders = new HttpHeaders(); - if (headers.has("Content-Type")) { + if (chunkSize > 0) { + JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH); + HttpHeaders httpHeaders = new HttpHeaders(); + if (headers.has("Content-Type")) { + try { + httpHeaders.add("Content-Type", headers.getString("Content-Type")); + } catch (JSONException e) { } + } try { - httpHeaders.add("Content-Type", headers.getString("Content-Type")); - } catch (JSONException e) { } + charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders); + } catch (Exception e) { + LOG.error("Failed to guess charset: " + e.getMessage()); + } } - String charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders); - Page page; try { page = new Page(is, charset); From 824dd82f5f9c9e60392ece498f8e5d44a7e431b9 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 24 Nov 2016 14:05:55 +0100 Subject: [PATCH 044/240] Match http-equiv meta elements with unquoted attribute values, e.g. 
--- .../org/archive/format/text/charset/CharsetDetector.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java index 0534ff85..9b4c8523 100644 --- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java +++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java @@ -60,7 +60,8 @@ public abstract class CharsetDetector { private final static String META_CONTENT_ATTR_PATTERN_STRING = "\\b" + META_CONTENT_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?"; private final static String META_HTTP_EQUIV_ATTR_PATTERN_STRING = "\\b" + - META_HTTP_EQUIV_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?"; + META_HTTP_EQUIV_ATTRIBUTE + "\\s*=\\s*(" + META_CONTENT_TYPE + "|" + + ANY_ATTR_VALUE + ")(?:\\s|>)?"; @@ -183,7 +184,7 @@ private static String trimAttrValue(String value) { } String result = value; if (result.isEmpty()) - return result; + return result; if (result.charAt(0) == '"') { result = result.substring(1, result.length() - 1); } else if (result.charAt(0) == '\'') { @@ -232,7 +233,6 @@ public static String findMetaContentType(String pageSample) { protected String getCharsetFromBytes(byte buffer[], int len) throws IOException { String charsetName = null; - UniversalDetector detector = new UniversalDetector(null); detector.handleData(buffer, 0, len); detector.dataEnd(); From 9e41abcb36c585dd1cd9622f0eeeaddb0faae111 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 9 Dec 2016 15:35:10 +0100 Subject: [PATCH 045/240] Strip empty port, do not fail --- src/main/java/org/archive/url/URLParser.java | 24 +++++++++++-------- .../archive/url/IAURLCanonicalizerTest.java | 1 + .../archive/url/WaybackURLKeyMakerTest.java | 1 + 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/archive/url/URLParser.java 
b/src/main/java/org/archive/url/URLParser.java index 98e4c1aa..83d3c386 100644 --- a/src/main/java/org/archive/url/URLParser.java +++ b/src/main/java/org/archive/url/URLParser.java @@ -246,16 +246,20 @@ public static HandyURL parse(String urlString) throws URISyntaxException { colonPort = uriAuthority.substring(portColonIndex); } if(colonPort != null) { - if(colonPort.startsWith(":")) { - try { - port = Integer.parseInt(colonPort.substring(1)); - } catch(NumberFormatException e) { - throw new URISyntaxException(urlString, "bad port " - + colonPort.substring(1)); - } - } else { - // XXX: what's happened?! - } + if(colonPort.startsWith(":")) { + if (colonPort.length() == 1) { + // a bare colon (http://example.com:/), use default port + } else { + try { + port = Integer.parseInt(colonPort.substring(1)); + } catch(NumberFormatException e) { + throw new URISyntaxException(urlString, "bad port " + + colonPort.substring(1)); + } + } + } else { + // XXX: what's happened?! + } } if(userInfo != null) { int passColonIndex = userInfo.indexOf(COLON); diff --git a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java index 91751b4a..e2c46258 100644 --- a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java +++ b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java @@ -12,6 +12,7 @@ public void testFull() throws URISyntaxException { compCan(iaC,"https://www.archive.org:80/","https://archive.org:80/"); compCan(iaC,"http://www.archive.org:443/","http://archive.org:443/"); compCan(iaC,"https://www.archive.org:443/","https://archive.org/"); + compCan(iaC,"http://www.archive.org:/","http://archive.org/"); compCan(iaC,"http://www.archive.org/big/","http://archive.org/big"); compCan(iaC,"dns:www.archive.org","dns:www.archive.org"); diff --git a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java index 34bfe625..26161456 100644 --- 
a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java +++ b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java @@ -22,6 +22,7 @@ public void testMakeKey() throws URISyntaxException { assertEquals("org,archive)/goo", km.makeKey("http://archive.org/goo/?")); assertEquals("org,archive)/goo?a&b", km.makeKey("http://archive.org/goo/?b&a")); assertEquals("org,archive)/goo?a=1&a=2&b", km.makeKey("http://archive.org/goo/?a=2&b&a=1")); + assertEquals("org,archive)/", km.makeKey("http://archive.org:/")); } } From b918f7f18e94c58a4a74d97e98f3c19465466595 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 4 Jan 2017 18:21:22 +0100 Subject: [PATCH 046/240] Improve clipping of quotation marks in CSS link extraction - clip multiple quotation marks Fix StringIndexOutOfBoundsException in patternCSSExtract - correct check for min. required URL lenght when stripping 4 characters (2 at each end) - simplified code, use non-capturing groups in regular expression --- .../html/ExtractingParseObserver.java | 79 ++++++++++--------- .../html/ExtractingParseObserverTest.java | 48 ++++++----- 2 files changed, 70 insertions(+), 57 deletions(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index df3742fa..45a48808 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -23,7 +23,7 @@ public class ExtractingParseObserver implements ParseObserver { protected static String cssUrlPatString = "url\\s*\$\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\$"; protected static String cssImportNoUrlPatString = - "@import\\s+(('[^']+')|(\"[^\"]+\")|(\$'[^']+'\$)|(\$\"[^\"]+\"\$)|(\$[^)]+\$)|([a-z0-9_.:/\\\\-]+))\\s*;"; + "@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\$'[^']+'\$)|(?:\$\"[^\"]+\"\$)|(?:\$[^)]+\$)|(?:[a-z0-9_.:/\\\\-]+))\\s*;"; protected static Pattern cssImportNoUrlPattern = 
Pattern .compile(cssImportNoUrlPatString); @@ -368,40 +368,45 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs } } } - private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) { - Matcher m = pattern.matcher(content); - int idx = 0; - int contentLen = content.length(); - while((idx < contentLen) && m.find(idx)) { - String url = m.group(1); - int origUrlLength = url.length(); - int urlStart = m.start(1); - int urlEnd = m.end(1); - idx = urlEnd; - if(url.length() < 2) { - continue; - } - if ((url.charAt(0) == '(') - && (url.charAt(origUrlLength-1) == ')')) { - url = url.substring(1, origUrlLength - 1); - urlStart += 1; - origUrlLength -= 2; - } - if (url.charAt(0) == '"') { - url = url.substring(1, origUrlLength - 1); - urlStart += 1; - } else if (url.charAt(0) == '\'') { - url = url.substring(1, origUrlLength - 1); - urlStart += 1; - } else if (url.charAt(0) == '\\') { - if(url.length() == 2) - continue; - url = url.substring(2, origUrlLength - 2); - urlStart += 2; - } - int urlLength = url.length(); - data.addHref("path","STYLE/#text","href",url); - idx += urlLength; - } - } + private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) { + Matcher m = pattern.matcher(content); + int idx = 0; + int contentLen = content.length(); + if (contentLen > 100000) + // extract URLs only from the first 100 kB + contentLen = 100000; + FIND: + while((idx < contentLen) && m.find()) { + idx = m.end(); + String url = m.group(1); + if(url.length() < 2) { + continue; + } + if ((url.charAt(0) == '(') + && (url.charAt(url.length()-1) == ')')) { + url = url.substring(1, url.length() - 1); + } + CLIP: + while (url.length() > 1) { + if ((url.charAt(0) == '"' || url.charAt(0) == '\'') + && (url.charAt(url.length() - 1) == '"' + || url.charAt(url.length() - 1) == '\'')) { + if(url.length() <= 2) { + // empty URL + continue FIND; + } + url = url.substring(1, url.length() - 1); + } else if (url.charAt(0) == 
'\\') { + if(url.length() <= 4) { + // empty URL + continue FIND; + } + url = url.substring(2, url.length() - 2); + } else { + break CLIP; + } + } + data.addHref("path","STYLE/#text","href",url); + } + } } diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index 24b6c18a..236b964b 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -19,7 +19,9 @@ public void testHandleStyleNodeExceptions() throws Exception { "url (' ')", "url('\")", "url(')", - "url('\"')" + "url('\"')", + "url('\\\"\"')", + "url(''''')" }; boolean except = false; HTMLMetaData md = new HTMLMetaData(new MetaData()); @@ -37,6 +39,7 @@ public void testHandleStyleNodeExceptions() throws Exception { assertFalse(except); } } + public void testHandleStyleNode() throws Exception { String[][] tests = { {""}, @@ -45,31 +48,35 @@ public void testHandleStyleNode() throws Exception { {"url(\"foo.gif\")","foo.gif"}, {"url(\\\"foo.gif\\\")","foo.gif"}, {"url(\\'foo.gif\\')","foo.gif"}, - - }; + {"url(''foo.gif'')","foo.gif"}, + {"url( foo.gif )","foo.gif"}, + {"url('''')"} + }; for(String[] testa : tests) { checkExtract(testa); } - // boolean except = false; -// HTMLMetaData md = new HTMLMetaData(new MetaData()); -// ExtractingParseObserver epo = new ExtractingParseObserver(md); -// for(String css : tests) { -// try { -// TextNode tn = new TextNode(css); -// epo.handleStyleNode(tn); -// } catch(Exception e) { -// System.err.format("And the winner is....(%s)\n", css); -// e.printStackTrace(); -// except = true; -// throw e; -// } -// assertFalse(except); -// } } + + /** + * Test whether the pattern matcher does extract nothing and also does not + * not hang-up if an overlong CSS link is truncated. 
+ */ + public void testHandleStyleNodeNoHangupTruncated() throws Exception { + StringBuilder sb = new StringBuilder(); + sb.append("url("); + for (int i = 0; i < 500000; i++) + sb.append('\''); + sb.append("foo.gif"); + for (int i = 0; i < 499000; i++) + sb.append('\''); + String[] test = new String[1]; + test[0] = sb.toString(); + checkExtract(test); + } + private void checkExtract(String[] data) throws JSONException { // System.err.format("CSS(%s) want[0](%s)\n",css,want[0]); String css = data[0]; - boolean except = false; HTMLMetaData md = new HTMLMetaData(new MetaData()); ExtractingParseObserver epo = new ExtractingParseObserver(md); try { @@ -87,7 +94,8 @@ private void checkExtract(String[] data) throws JSONException { assertTrue(o instanceof JSONObject); JSONObject jo = (JSONObject) o; - assertEquals(data[i],jo.getString("href")); + assertEquals("CSS link extraction failed for <" + css + ">", + data[i], jo.getString("href")); } } else { assertNull(a); From 194a1faecf30905c840d71d0bc22b6ea5d6a61fe Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 18 Jan 2017 12:29:43 +0100 Subject: [PATCH 047/240] CSS link extraction: clip also unpaired leading and trailing quotation marks --- .../html/ExtractingParseObserver.java | 64 +++++++------------ .../html/ExtractingParseObserverTest.java | 9 +-- 2 files changed, 27 insertions(+), 46 deletions(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index 45a48808..deb8c8c0 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -22,13 +22,18 @@ public class ExtractingParseObserver implements ParseObserver { protected static String cssUrlPatString = "url\\s*\$\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\$"; + protected static String cssUrlTrimPatString = + "^(?:\\\\?[\"'])+|(?:\\\\?[\"'])+$"; protected static 
String cssImportNoUrlPatString = - "@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\$'[^']+'\$)|(?:\$\"[^\"]+\"\$)|(?:\$[^)]+\$)|(?:[a-z0-9_.:/\\\\-]+))\\s*;"; + "@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\$'[^']+'\$)|(?:\$\"[^\"]+\"\$)|(?:\$[^)]+\$)|(?:[a-z0-9_.:/\\\\-]+))\\s*;"; protected static Pattern cssImportNoUrlPattern = Pattern .compile(cssImportNoUrlPatString); protected static Pattern cssUrlPattern = Pattern.compile(cssUrlPatString); + + protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString); + private final static int MAX_TEXT_LEN = 100; // private static String GLOBAL_ATTR[] = {"background"}; @@ -368,45 +373,20 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs } } } - private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) { - Matcher m = pattern.matcher(content); - int idx = 0; - int contentLen = content.length(); - if (contentLen > 100000) - // extract URLs only from the first 100 kB - contentLen = 100000; - FIND: - while((idx < contentLen) && m.find()) { - idx = m.end(); - String url = m.group(1); - if(url.length() < 2) { - continue; - } - if ((url.charAt(0) == '(') - && (url.charAt(url.length()-1) == ')')) { - url = url.substring(1, url.length() - 1); - } - CLIP: - while (url.length() > 1) { - if ((url.charAt(0) == '"' || url.charAt(0) == '\'') - && (url.charAt(url.length() - 1) == '"' - || url.charAt(url.length() - 1) == '\'')) { - if(url.length() <= 2) { - // empty URL - continue FIND; - } - url = url.substring(1, url.length() - 1); - } else if (url.charAt(0) == '\\') { - if(url.length() <= 4) { - // empty URL - continue FIND; - } - url = url.substring(2, url.length() - 2); - } else { - break CLIP; - } - } - data.addHref("path","STYLE/#text","href",url); - } - } + private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) { + Matcher m = pattern.matcher(content); + int idx = 0; + int contentLen = content.length(); + if (contentLen > 100000) 
+ // extract URLs only from the first 100 kB + contentLen = 100000; + while((idx < contentLen) && m.find()) { + idx = m.end(); + String url = m.group(1); + url = cssUrlTrimPattern.matcher(url).replaceAll(""); + if (!url.isEmpty()) { + data.addHref("path","STYLE/#text","href", url); + } + } + } } diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index 236b964b..bfbd6f02 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -20,8 +20,8 @@ public void testHandleStyleNodeExceptions() throws Exception { "url('\")", "url(')", "url('\"')", - "url('\\\"\"')", - "url(''''')" + "url('\\\"\"')", + "url(''''')" }; boolean except = false; HTMLMetaData md = new HTMLMetaData(new MetaData()); @@ -50,7 +50,8 @@ public void testHandleStyleNode() throws Exception { {"url(\\'foo.gif\\')","foo.gif"}, {"url(''foo.gif'')","foo.gif"}, {"url( foo.gif )","foo.gif"}, - {"url('''')"} + {"url('''')"}, + {"url('foo.gif'')","foo.gif"}, }; for(String[] testa : tests) { checkExtract(testa); @@ -98,7 +99,7 @@ private void checkExtract(String[] data) throws JSONException { data[i], jo.getString("href")); } } else { - assertNull(a); + assertNull("Expected no extracted link for <" + css + ">", a); } } From 038402885f85a426601d5f85936e210e4f55636f Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 27 Jan 2017 08:59:25 +0100 Subject: [PATCH 048/240] CharsetDetector: remove unnecessary check for empty string (contributed by @ldko) --- .../java/org/archive/format/text/charset/CharsetDetector.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java index 9b4c8523..690f8b99 100644 --- 
a/src/main/java/org/archive/format/text/charset/CharsetDetector.java +++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java @@ -183,8 +183,6 @@ private static String trimAttrValue(String value) { return value; } String result = value; - if (result.isEmpty()) - return result; if (result.charAt(0) == '"') { result = result.substring(1, result.length() - 1); } else if (result.charAt(0) == '\'') { From 1364716a83911369de7256aa1718a236acb75973 Mon Sep 17 00:00:00 2001 From: Lauren Ko Date: Tue, 14 Feb 2017 17:07:36 -0600 Subject: [PATCH 049/240] Logging changes for next release. --- CHANGES.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 52c40f42..fee29e16 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,11 @@ +1.1.8 +----- +* [Strip empty port via URLParser](https://github.com/iipc/webarchive-commons/pull/69/) +* [Use CharsetDetector to guess encoding of HTML documents](https://github.com/iipc/webarchive-commons/pull/68/) +* [Fix last header was lost if LF LF](https://github.com/iipc/webarchive-commons/pull/65/) +* [Make regular expression to extract URLs from CSS more restrictive](https://github.com/iipc/webarchive-commons/pull/63) +* [Remove invalid constant `PROFILE_REVISIT_URI_AGNOSTIC_IDENTICAL_DIGEST`](https://github.com/iipc/webarchive-commons/pull/62) + 1.1.7 ----- * [Make canonicalizer be able to strip session id params even if they are the first params in the query string](https://github.com/iipc/webarchive-commons/pull/54) @@ -36,10 +44,10 @@ 1.1.2 ----- -* Fixed support for reading uncompressed WARCs, along with some unit testing. 
(https://github.com/iipc/webarchive-commons/pull/12) +* [Fixed support for reading uncompressed WARCs, along with some unit testing.](https://github.com/iipc/webarchive-commons/pull/12) 1.1.1 ----- -* Renamed from commons-webarchive to webarchive-commons (https://github.com/iipc/webarchive-commons/pull/8) -* Cope with malformed GZip extra fields as produced by wget 1.14 (https://github.com/iipc/webarchive-commons/pull/10) -* Switch to httpcomponents, and add IA deployment information. (https://github.com/iipc/webarchive-commons/pull/11) +* [Renamed from commons-webarchive to webarchive-commons](https://github.com/iipc/webarchive-commons/pull/8) +* [Cope with malformed GZip extra fields as produced by wget 1.14](https://github.com/iipc/webarchive-commons/pull/10) +* [Switch to httpcomponents, and add IA deployment information.](https://github.com/iipc/webarchive-commons/pull/11) From 11579c2baab0db08f14341f70b848353eed17269 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 22 Feb 2017 13:11:13 +0100 Subject: [PATCH 050/240] Improve HTML link extraction - add extractors for more elements which can take URLs as attribute values, add missing attributes - generalize extraction of "global" attributes (`background`) - add custom data attributes frequently used for linking (`data-href`, `data-uri`) - add unit test to cover link extraction --- .../html/ExtractingParseObserver.java | 79 ++++- .../html/ExtractingParseObserverTest.java | 161 +++++++++ .../resource/html/link-extraction-test.warc | 320 ++++++++++++++++++ 3 files changed, 551 insertions(+), 9 deletions(-) create mode 100644 src/test/resources/org/archive/resource/html/link-extraction-test.warc diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index deb8c8c0..826851e0 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ 
b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -2,12 +2,17 @@ import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; +import java.util.Locale; import java.util.Map; +import java.util.Set; import java.util.Stack; +import java.util.Vector; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.archive.format.text.html.ParseObserver; +import org.htmlparser.Attribute; import org.htmlparser.nodes.RemarkNode; import org.htmlparser.nodes.TagNode; import org.htmlparser.nodes.TextNode; @@ -36,11 +41,10 @@ public class ExtractingParseObserver implements ParseObserver { private final static int MAX_TEXT_LEN = 100; -// private static String GLOBAL_ATTR[] = {"background"}; - private static final String PATH = "path"; private static final String PATH_SEPARATOR = "@/"; - private final static Map extractors; + private static final Map extractors; + private static final Set globalHrefAttributes; static { extractors = new HashMap(); extractors.put("A", new AnchorTagExtractor()); @@ -57,6 +61,22 @@ public class ExtractingParseObserver implements ParseObserver { extractors.put("META", new MetaTagExtractor()); extractors.put("OBJECT", new ObjectTagExtractor()); extractors.put("SCRIPT", new ScriptTagExtractor()); + extractors.put("Q", new QuotationLinkTagExtractor()); + extractors.put("BLOCKQUOTE", new QuotationLinkTagExtractor()); + extractors.put("DEL", new QuotationLinkTagExtractor()); + extractors.put("INS", new QuotationLinkTagExtractor()); + // HTML5: + extractors.put("BUTTON", new ButtonTagExtractor()); + extractors.put("MENUITEM", new MenuitemTagExtractor()); + extractors.put("VIDEO", new EmbedVideoTagExtractor()); + extractors.put("AUDIO", new EmbedTagExtractor()); + extractors.put("TRACK", new EmbedTagExtractor()); + extractors.put("SOURCE", new EmbedTagExtractor()); + + globalHrefAttributes = new HashSet(); + globalHrefAttributes.add("background"); + globalHrefAttributes.add("data-href"); + 
globalHrefAttributes.add("data-uri"); } @@ -84,11 +104,19 @@ public void handleTagOpen(TagNode tag) { inTitle = !tag.isEmptyXmlTag(); return; } + // first the global attributes: - // background - String v = tag.getAttribute("background"); - if(v != null) { - data.addHref(PATH,makePath(name,"background"),"url",v); + Vector attributes = tag.getAttributesEx(); + for (Attribute a : attributes) { + String attrName = a.getName(); + String attrValue = a.getValue(); + if (attrName == null || attrValue == null) { + continue; + } + attrName = attrName.toLowerCase(Locale.ROOT); + if (globalHrefAttributes.contains(attrName)) { + data.addHref(PATH,makePath(name,attrName),"url",attrValue); + } } // TODO: style attribute, BASE(href) tag, Resolve URLs @@ -296,12 +324,24 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs } } + private static class ButtonTagExtractor implements TagExtractor { + public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { + addBasicHrefs(data,node,"formaction"); + } + } + private static class EmbedTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { addBasicHrefs(data,node,"src"); } } + private static class EmbedVideoTagExtractor implements TagExtractor { + public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { + addBasicHrefs(data,node,"src","poster"); + } + } + private static class FormTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { ArrayList l = new ArrayList(); @@ -329,21 +369,26 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs addBasicHrefs(data,node,"src"); } } + private static class IFrameTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { addBasicHrefs(data,node,"src"); } } + private static class ImgTagExtractor 
implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { addHrefWithAttrs(data,node,"src","alt","title"); + addBasicHrefs(data,node,"longdesc"); } } + private static class InputTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { - addBasicHrefs(data,node,"src"); + addBasicHrefs(data,node,"src","formaction"); } } + private static class LinkTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { ArrayList l = getAttrListUrl(node,"href","rel","type"); @@ -352,6 +397,13 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs } } } + + private static class MenuitemTagExtractor implements TagExtractor { + public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { + addBasicHrefs(data,node,"icon"); + } + } + private static class MetaTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { ArrayList l = getAttrList(node,"name","rel","content","http-equiv"); @@ -360,11 +412,19 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs } } } + private static class ObjectTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { - addBasicHrefs(data,node,"codebase","cdata"); + addBasicHrefs(data,node,"codebase","cdata","data"); } } + + private static class QuotationLinkTagExtractor implements TagExtractor { + public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { + addBasicHrefs(data,node,"cite"); + } + } + private static class ScriptTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { ArrayList l = getAttrListUrl(node,"src","type"); @@ -373,6 +433,7 @@ public void extract(HTMLMetaData 
data, TagNode node, ExtractingParseObserver obs } } } + private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) { Matcher m = pattern.matcher(content); int idx = 0; diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index bfbd6f02..8f690a06 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -1,15 +1,33 @@ package org.archive.resource.html; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Logger; + +import org.archive.extract.ExtractingResourceFactoryMapper; +import org.archive.extract.ExtractingResourceProducer; +import org.archive.extract.ProducerUtils; +import org.archive.extract.ResourceFactoryMapper; import org.archive.resource.MetaData; +import org.archive.resource.Resource; +import org.archive.resource.ResourceParseException; +import org.archive.resource.ResourceProducer; import org.htmlparser.nodes.TextNode; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + import junit.framework.TestCase; public class ExtractingParseObserverTest extends TestCase { + private static final Logger LOG = + Logger.getLogger(ExtractingParseObserverTest.class.getName()); + public void testHandleStyleNodeExceptions() throws Exception { String[] tests = { "some css", @@ -103,5 +121,148 @@ private void checkExtract(String[] data) throws JSONException { } } + private void checkLink(Multimap links, String url, String path) { + assertTrue("Link with URL " + url + " not found", links.containsKey(url)); + assertTrue("Wrong path " + path + " for " + url, links.get(url).contains(path)); + } + + private void checkLinks(Resource resource, String[][] 
expectedLinks) { + assertNotNull(resource); + assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource); + MetaData md = resource.getMetaData(); + LOG.info(md.toString()); + Multimap links = ArrayListMultimap.create(); + JSONObject head = md.optJSONObject("Head"); + if (head != null) { + // + String baseUrl = (String) head.opt("Base"); + if (baseUrl != null) { + links.put(baseUrl, "__base__"); + } + // + JSONArray metas = head.optJSONArray("Metas"); + if (metas != null) { + for (int i = 0; i < metas.length(); i++) { + JSONObject o = (JSONObject) metas.optJSONObject(i); + String httpEquiv = o.optString("http-equiv"); + if (httpEquiv != null && httpEquiv.equalsIgnoreCase("Refresh")) { + String metaRefreshTarget = o.optString("content"); + if (metaRefreshTarget != null) { + metaRefreshTarget = metaRefreshTarget.replaceFirst("(?i)(?:^\\d+\\s*;)?\\s*url=", ""); + links.put(metaRefreshTarget, "__meta_refresh__"); + } + } + } + } + } + // extract outlinks + List linkArrays = new ArrayList(); + if (md.optJSONArray("Links") != null) { + linkArrays.add(md.optJSONArray("Links")); + } + try { + if (md.getJSONObject("Head") != null && md.getJSONObject("Head").getJSONArray("Link") != null) { + linkArrays.add(md.getJSONObject("Head").getJSONArray("Link")); + } + } catch (JSONException e1) { + } + for (JSONArray ldata : linkArrays) { + for (int i = 0; i < ldata.length(); i++) { + JSONObject o = (JSONObject) ldata.optJSONObject(i); + try { + String url = o.getString("url"); + links.put(url, o.getString("path")); + LOG.info(" found link: " + o.getString("url") + " " + o.getString("path")); + } catch (JSONException e) { + fail("Failed to extract URL from link: " + e.getMessage()); + } + } + } + assertEquals("Unexpected number of links", expectedLinks.length, links.size()); + for (String[] l : expectedLinks) { + checkLink(links, l[0], l[1]); + } + } + + public void testLinkExtraction() throws ResourceParseException, IOException { + String 
testFileName = "link-extraction-test.warc"; + ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath()); + ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper(); + ExtractingResourceProducer extractor = + new ExtractingResourceProducer(producer, mapper); + extractor.getNext(); // skip warcinfo record + String[][] html4links = { + {"http://www.example.com/", "__base__"}, + {"http://www.example.com/redirected.html", "__meta_refresh__"}, + {"background.jpg", "BODY@/background"}, + {"http://www.example.com/a-href.html", "A@/href"}, + {"#anchor", "A@/href"}, + {"image.png", "IMG@/src"}, + {"image.gif", "IMG@/src"}, + {"http://example.com/image-description.html#image.gif", "IMG@/longdesc"}, + {"helloworld.swf", "OBJECT@/data"}, + {"http://www.example.com/shakespeare.html", "Q@/cite"}, + {"http://www.example.com/shakespeare-long.html", "BLOCKQUOTE@/cite"} + }; + checkLinks(extractor.getNext(), html4links); + String[][] html5links = { + {"http:///www.example.com/video.html", "LINK@/href", "canonical"}, + {"video.rss", "LINK@/href", "alternate"}, + {"https://archive.org/download/WebmVp8Vorbis/webmvp8.gif", "VIDEO@/poster"}, + {"https://archive.org/download/WebmVp8Vorbis/webmvp8.webm", "SOURCE@/src"}, + {"https://archive.org/download/WebmVp8Vorbis/webmvp8_512kb.mp4", "SOURCE@/src"}, + {"https://archive.org/download/WebmVp8Vorbis/webmvp8.ogv", "SOURCE@/src"} + }; + checkLinks(extractor.getNext(), html5links); + String[][] html5links2 = { + {"http://www.example.com/", "A@/href"}, + }; + checkLinks(extractor.getNext(), html5links2); + String[][] fbVideoLinks = { + {"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"}, + {"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"}, + {"https://www.facebook.com/facebook/", "A@/href"}, + {"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"} + }; + checkLinks(extractor.getNext(), fbVideoLinks); + 
String[][] dataHrefLinks = { + {"standard.css", "LINK@/href", "stylesheet"}, + {"https://www.facebook.com/elegantthemes/videos/10153760379211923/", "DIV@/data-href"}, + {"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"}, + {"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"}, + {"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"}, + {"https://www.facebook.com/facebook/", "A@/href"}, + {"//edge.flowplayer.org/bauhaus.webm", "SOURCE@/src"}, + {"//edge.flowplayer.org/bauhaus.mp4", "SOURCE@/src"}, + {"//edge.flowplayer.org/functional.webm", "BUTTON@/data-href"}, + {"/content-page", "ARTICLE@/data-href"}, + {"/content-page", "A@/href"}, + {"/tags/content","A@/href"}, + {"/tags/headlines", "A@/href"}, + {"http://grabaperch.com", "DIV@/data-href"}, + {"green.css", "LINK@/data-href"}, + {"blue.css", "LINK@/data-href"}, + {"http://codecanyon.net/user/CodingJack", "A@/data-href"}, + {"jackbox/img/thumbs/4.jpg", "IMG@/src"}, + {"//venobox-destination", "A@/data-href"}, + {"#", "A@/href"}, + {"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0&autoplay=1", "DIV@/data-href"}, + {"#", "A@/href"}, + {"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0", "IFRAME@/src"} + }; + checkLinks(extractor.getNext(), dataHrefLinks); + String[][] fbSocialLinks = { + {"http://www.your-domain.com/your-page.html", "DIV@/data-uri"}, + {"https://developers.facebook.com/docs/plugins/comments#configurator", "DIV@/data-href"}, + {"https://www.facebook.com/zuck/posts/10102735452532991?comment_id=1070233703036185", "DIV@/data-href"}, + {"https://www.facebook.com/zuck", "DIV@/data-href"}, + {"https://developers.facebook.com/docs/plugins/", "DIV@/data-href"}, + {"https://www.facebook.com/facebook", "DIV@/data-href"}, + {"https://www.facebook.com/facebook", "BLOCKQUOTE@/cite"}, + {"https://www.facebook.com/facebook", "A@/href"}, + {"http://www.your-domain.com/your-page.html", "DIV@/data-href"} 
+ }; + checkLinks(extractor.getNext(), fbSocialLinks); + } } diff --git a/src/test/resources/org/archive/resource/html/link-extraction-test.warc b/src/test/resources/org/archive/resource/html/link-extraction-test.warc new file mode 100644 index 00000000..ab0e54c8 --- /dev/null +++ b/src/test/resources/org/archive/resource/html/link-extraction-test.warc @@ -0,0 +1,320 @@ +WARC/1.0 +WARC-Type: warcinfo +Content-Type: application/warc-fields +WARC-Date: 2017-02-20T14:00:56Z +Content-Length: 128 + +format: WARC File Format 1.0 +conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf +robots: classic + + + +WARC/1.0 +WARC-Type: response +WARC-Date: 2017-02-20T14:00:56Z +WARC-Target-URI: http://www.example.com/html4.html +Content-Type: application/http; msgtype=response +Content-Length: 1243 + +HTTP/1.1 200 OK +Date: Mon, 20 Feb 2017 14:00:56 GMT +Content-Length: 1125 +Content-Type: application/xhtml+xml + + + + + + + +Test XHTML Link Extraction + + +A@/href +

+ anchor only + IMG@/src + IMG@/longdesc + +

+ To be or not to be. +

+To be, or not to be, that is the question:
+Whether 'tis nobler in the mind to suffer
+The slings and arrows of outrageous fortune, … +

+ + + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://www.example.com/link-extraction-test-html5-video.html +WARC-Date: 2017-02-20T21:35:03Z +Content-Type: application/http; msgtype=response +Content-Length: 890 + +HTTP/1.1 200 OK +Date: Mon, 20 Feb 2017 21:35:03 GMT +Content-Length: 789 +Content-Type: text/html + + + + +Test HTML5 Video Tag + + + + + + +

+ + + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://www.example.com/poor_html5.html +WARC-Date: 2017-02-21T15:50:40Z +Content-Type: application/http; msgtype=response +Content-Length: 594 + +HTTP/1.1 200 OK +Date: Tue, 21 Feb 2017 15:50:40 GMT +Content-Length: 486 +Content-Type: text/html + + +Testing poor HTML5 + + + + + +This is valid HTML5! + + + +

header

+ +

headline

+ +

paragraph one with link. + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://www.example.com/fb-video.html +WARC-Date: 2017-02-20T16:58:50Z +Content-Type: application/http; msgtype=response +Content-Length: 1330 + +HTTP/1.1 200 OK +Date: Mon, 20 Feb 2017 16:58:50 GMT +Content-Length: 1194 +Content-Type: text/html + + + + + fb-video - Embedded Videos - Social Plugins + + + + +

+ + + +

+ How to Share With Just Friends +
How to share with just friends.
+ Posted by Facebook on Friday, December 5, 2014 +

+ + + + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://www.example.com/data-href.examples.html +WARC-Date: 2017-02-21T21:05:10Z +Content-Type: application/http; msgtype=response +Content-Length: 3160 + +HTTP/1.1 200 OK +Date: Tue, 21 Feb 2017 21:05:10 GMT +Content-Length: 3057 +Content-Type: text/html + + + + + + + + + + + + +

+ + +

+ How to Share With Just Friends +
How to share with just friends.
+ Posted by Facebook on Friday, December 5, 2014 +

+ + +

+ +

+ + +

Headline goes here.

And here goes a bit of copy about the content of the article.

+ Tags: content, headlines +

+ + +

+ + + +

+ + + +venobox + + + + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://www.example.com/fb-social-plugins.html +WARC-Date: 2017-02-22T09:33:02Z +Content-Type: application/http; msgtype=response +Content-Length: 1870 + +HTTP/1.1 200 OK +Date: Wed, 22 Feb 2017 09:33:02 GMT +Content-Length: 1767 +Content-Type: text/html + + +

+ + +

Facebook

+ + + + + From 6aa43f83a2cbc2acd0feb7f2c81d66f4ef1b13c5 Mon Sep 17 00:00:00 2001 From: Mohamed Elsayed Date: Thu, 2 Mar 2017 15:28:16 +0200 Subject: [PATCH 051/240] Fix #25: move missing unit tests over from Heritrix3 --- .../archive/io/ArchiveReaderFactoryTest.java | 94 +++ .../io/BufferedSeekInputStreamTest.java | 67 ++ .../archive/io/HeaderedArchiveRecordTest.java | 209 ++++++ .../archive/io/RecordingInputStreamTest.java | 132 ++++ .../archive/io/ReplayCharSequenceTest.java | 391 ++++++++++ .../io/RepositionableInputStreamTest.java | 70 ++ .../org/archive/io/arc/ARCWriterPoolTest.java | 122 +++ .../org/archive/io/arc/ARCWriterTest.java | 699 ++++++++++++++++++ .../org/archive/io/warc/WARCWriterTest.java | 512 +++++++++++++ .../org/archive/uid/UUIDGeneratorTest.java | 44 ++ .../java/org/archive/util/FileUtilsTest.java | 271 +++++++ .../org/archive/util/MimetypeUtilsTest.java | 63 ++ .../org/archive/util/PropertyUtilsTest.java | 45 ++ .../org/archive/util/anvl/ANVLRecordTest.java | 128 ++++ 14 files changed, 2847 insertions(+) create mode 100644 src/test/java/org/archive/io/ArchiveReaderFactoryTest.java create mode 100644 src/test/java/org/archive/io/BufferedSeekInputStreamTest.java create mode 100644 src/test/java/org/archive/io/HeaderedArchiveRecordTest.java create mode 100644 src/test/java/org/archive/io/RecordingInputStreamTest.java create mode 100644 src/test/java/org/archive/io/ReplayCharSequenceTest.java create mode 100644 src/test/java/org/archive/io/RepositionableInputStreamTest.java create mode 100644 src/test/java/org/archive/io/arc/ARCWriterPoolTest.java create mode 100644 src/test/java/org/archive/io/arc/ARCWriterTest.java create mode 100644 src/test/java/org/archive/io/warc/WARCWriterTest.java create mode 100644 src/test/java/org/archive/uid/UUIDGeneratorTest.java create mode 100644 src/test/java/org/archive/util/FileUtilsTest.java create mode 100644 src/test/java/org/archive/util/MimetypeUtilsTest.java create mode 100644 
src/test/java/org/archive/util/PropertyUtilsTest.java create mode 100644 src/test/java/org/archive/util/anvl/ANVLRecordTest.java diff --git a/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java b/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java new file mode 100644 index 00000000..2313868c --- /dev/null +++ b/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java @@ -0,0 +1,94 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.File; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Iterator; + +import org.apache.commons.lang.StringUtils; +import org.archive.io.ArchiveRecord; +import org.archive.io.arc.ARCWriterTest; +import org.archive.util.TmpDirTestCase; + +public class ArchiveReaderFactoryTest extends TmpDirTestCase { + /** + * Test local file as URL + * @throws IOException + */ + public void testGetFileURL() throws IOException { + File arc = ARCWriterTest.createARCFile(getTmpDir(), true); + ArchiveReader reader = null; + try { + reader = ArchiveReaderFactory. 
+ get(new URL("file:////" + arc.getAbsolutePath())); + for (Iterator i = reader.iterator(); i.hasNext();) { + ArchiveRecord r = (ArchiveRecord)i.next(); + assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype())); + } + } finally { + if (reader != null) { + reader.close(); + } + } + } + + /** + * Test local file as File + * @throws IOException + */ + public void testGetFile() throws IOException { + File arc = ARCWriterTest.createARCFile(getTmpDir(), true); + ArchiveReader reader = null; + try { + reader = ArchiveReaderFactory.get(arc.getAbsoluteFile()); + for (Iterator i = reader.iterator(); i.hasNext();) { + ArchiveRecord r = (ArchiveRecord)i.next(); + assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype())); + } + } finally { + if (reader != null) { + reader.close(); + } + } + } + + /** + * Test local file as String path + * @throws IOException + */ + public void testGetPath() throws IOException { + File arc = ARCWriterTest.createARCFile(getTmpDir(), true); + ArchiveReader reader = null; + try { + reader = ArchiveReaderFactory.get(arc.getAbsoluteFile().getAbsolutePath()); + for (Iterator i = reader.iterator(); i.hasNext();) { + ArchiveRecord r = (ArchiveRecord)i.next(); + assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype())); + } + } finally { + if (reader != null) { + reader.close(); + } + } + } +} diff --git a/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java b/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java new file mode 100644 index 00000000..270e45e0 --- /dev/null +++ b/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java @@ -0,0 +1,67 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. 
+ * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.util.Random; + +import junit.framework.TestCase; + + +/** + * Unit test for BufferedSeekInputStream. The tests do some random + * repositioning in the stream to make sure the buffer is always valid. + * + * @author pjack + */ +public class BufferedSeekInputStreamTest extends TestCase { + + + private static byte[] TEST_DATA = makeTestData(); + + public void testPosition() throws Exception { + Random random = new Random(); + ArraySeekInputStream asis = new ArraySeekInputStream(TEST_DATA); + BufferedSeekInputStream bsis = new BufferedSeekInputStream(asis, 11); + for (int i = 0; i < TEST_DATA.length; i++) { + byte b = (byte)bsis.read(); + assertEquals(TEST_DATA[i], b); + } + for (int i = 0; i < 1000; i++) { + int index = random.nextInt(TEST_DATA.length); + bsis.position(index); + char expected = (char)((int)TEST_DATA[index] & 0xFF); + char read = (char)(bsis.read() & 0xFF); + assertEquals(expected, read); + } + } + + + private static byte[] makeTestData() { + String s = "If the dull substance of my flesh were thought\n" + + "Injurious distance could not stop my way\n" + + "For then, despite of space, I would be brought\n" + + "From limits far remote where thou dost stay.\n"; + byte[] r = new byte[s.length()]; + for (int i = 0; i < r.length; i++) { + r[i] = (byte)s.charAt(i); +// r[i] = (byte)s.charAt(i); + } + return r; + } +} diff --git 
a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
new file mode 100644
index 00000000..9f7e2a15
--- /dev/null
+++ b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
@@ -0,0 +1,209 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+import org.apache.commons.httpclient.Header;
+import org.archive.io.arc.ARCRecord;
+import org.archive.io.warc.WARCRecord;
+
+/**
+ * Exercises {@link HeaderedArchiveRecord} over in-memory WARC and ARC
+ * records: skips the HTTP header, reads the body back and checks the
+ * content headers that were parsed on the way.
+ */
+public class HeaderedArchiveRecordTest extends TestCase {
+    /** HTTP response header block placed in front of {@link #BODY}. */
+    private static final String HTTPHEADER = "HTTP/1.1 200 OK\r\n"
+        + "Last-Modified: Sun, 28 Aug 2005 14:10:55 GMT\r\n"
+        + "Content-Length: 108\r\n" + "Connection: close\r\n"
+        + "Content-Type: text/html\r\n" + "\r\n";
+    /** Payload expected to remain once the HTTP header has been skipped. */
+    private static final String BODY = "\r\n" + " \r\n"
+        + " Neue Seite 1\r\n" + " \r\n"
+        + " \r\n" + " \r\n" + "";
+
+    /**
+     * Wraps a WARC response record and checks body, content headers and
+     * the URL reported by the record header.
+     */
+    public void testParseHttpHeadersInWARC() throws IOException {
+        final String url = "http://foo.maths.uq.edu.au/index.html";
+        final String warcHeader = "WARC/0.12\r\n"
+            + "MIME-Version: 1.0\r\n"
+            + "WARC-Record-Type: response\r\n"
+            + "WARC-Target-URI: http://foo.maths.uq.edu.au/index.html\r\n"
+            + "WARC-Date: 2006-09-19T17:20:24Z\r\n"
+            + "WARC-Digest: sha1:IT6YEX5WHKK57GOEHV2YHTTXEP5KPM6A\r\n"
+            + "WARC-IP-Address: 80.150.6.184\r\n"
+            + "Content-ID: \r\n"
+            + "Content-Type: application/http; msgtype=response\r\n"
+            + "Content-Length: " + (HTTPHEADER.length() + BODY.length()) + "\r\n"
+            + "\r\n";
+
+        final String hdr = warcHeader + HTTPHEADER + BODY;
+
+        WARCRecord r = new WARCRecord(toStream(hdr),
+            "READER_IDENTIFIER", 0, false, true);
+        HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
+
+        assertBodyAndHeaders(har);
+        // JUnit order is (message, expected, actual).
+        assertEquals("failed to retrieve Url from metadata", url,
+            har.getHeader().getUrl());
+    }
+
+    /**
+     * Wraps an ARC record built from a hand-written
+     * {@link ArchiveRecordHeader} and checks body and content headers.
+     */
+    public void testParseHttpHeadersInARC() throws IOException {
+        final int len = HTTPHEADER.length() + BODY.length();
+        final int contentLength = BODY.length();
+        final String url = "http://www.ly.gov.tw:80/accpart.htm";
+        final String hdr = HTTPHEADER + BODY;
+        // Interesting difference between ARCRecord and WARCRecord is that the
+        // stream passed the ARCRecord is supposed to be just past the
+        // ARCRecord metadata line where as stream passed WARCRecord is at
+        // record start. TODO: Add to ARCRecord constructor that doesn't
+        // take an ArchiveRecordHeader but rather parses it from the stream.
+        ArchiveRecordHeader arh = new ArchiveRecordHeader() {
+            public int getContentBegin() {
+                // TODO: In ARCs, this is where http headers end and
+                // the content begins. Need to reconcile for generic
+                // HeaderedArchiveRecord processing. In this context, it
+                // makes sense setting it to zero -- HeaderedArchiveRecord
+                // will then figure it out.
+                return 0;
+            }
+
+            public String getDate() {
+                return null;
+            }
+
+            public String getDigest() {
+                return null;
+            }
+
+            public Set getHeaderFieldKeys() {
+                return null;
+            }
+
+            public Map getHeaderFields() {
+                return null;
+            }
+
+            public Object getHeaderValue(String key) {
+                return null;
+            }
+
+            public long getLength() {
+                return len;
+            }
+
+            public long getContentLength() {
+                return contentLength;
+            }
+
+            public String getMimetype() {
+                return null;
+            }
+
+            public long getOffset() {
+                return 0;
+            }
+
+            public String getReaderIdentifier() {
+                return null;
+            }
+
+            public String getRecordIdentifier() {
+                return null;
+            }
+
+            public String getUrl() {
+                return url;
+            }
+
+            public String getVersion() {
+                return null;
+            }
+
+        };
+        ARCRecord r = new ARCRecord(toStream(hdr),
+            arh, 0, false, true, false);
+
+        HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
+        assertBodyAndHeaders(har);
+    }
+
+    /**
+     * Same as {@link #testParseHttpHeadersInARC()} but lets ARCRecord parse
+     * the ARC metadata line from the stream; also checks the header URL.
+     */
+    public void testEasierParseHttpHeadersInARC() throws IOException {
+        final String url = "http://www.archive.org/index.htm";
+        final String arcHeader = url
+            + " 192.168.0.1 20070515111004 text/html 167568\n";
+        final String hdr = arcHeader + HTTPHEADER + BODY;
+
+        ARCRecord r = new ARCRecord(toStream(hdr),
+            "READER_IDENTIFIER", 0, false, true, false);
+
+        HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
+        assertBodyAndHeaders(har);
+        // JUnit order is (message, expected, actual).
+        assertEquals("failed to retrieve Url from metadata", url,
+            har.getHeader().getUrl());
+    }
+
+    /** Encodes {@code record} with a fixed charset instead of the platform default. */
+    private static ByteArrayInputStream toStream(String record) {
+        return new ByteArrayInputStream(
+            record.getBytes(StandardCharsets.UTF_8));
+    }
+
+    /**
+     * Skips the HTTP header of {@code har}, reads {@code BODY.length()}
+     * bytes and asserts they equal {@link #BODY}; then checks the content
+     * headers against {@link #HTTPHEADER}.
+     */
+    private void assertBodyAndHeaders(HeaderedArchiveRecord har)
+    throws IOException {
+        har.skipHttpHeader();
+        byte[] b = new byte[BODY.length()];
+        har.read(b);
+        String bodyRead = new String(b, StandardCharsets.UTF_8);
+        assertEquals(BODY, bodyRead);
+        assertHeaderCorrectlyParsed(har.getContentHeaders());
+    }
+
+    /**
+     * Asserts that every parsed header appears as a {@code "Name: value"}
+     * line of {@link #HTTPHEADER}.
+     */
+    private void assertHeaderCorrectlyParsed(Header[] headers) {
+        final List<String> orgHeaders = Arrays.asList(HTTPHEADER.split("\r\n"));
+        // The split also yields the status line, hence the "+ 1".
+        assertEquals("not all HTTP header entries have been retrieved",
+            orgHeaders.size(), headers.length + 1);
+
+        for (Header header : headers) {
+            String line = header.getName() + ": " + header.getValue();
+            assertTrue("unexpected HTTP header: " + line,
+                orgHeaders.contains(line));
+        }
+    }
+
+    /** A WARC record without an HTTP header can still be wrapped strictly. */
+    public void testNoheaderWARC() throws IOException {
+        String b = "hello world";
+        String c = "WARC/0.12\r\nContent-Type: text/plain\r\n"
+            + "Content-Length: " + b.length() + "\r\n\r\n" + b;
+        WARCRecord r = new WARCRecord(toStream(c), "READER_IDENTIFIER", 0,
+            false, true);
+        HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
+        assertTrue(har.isStrict());
+    }
+}
diff --git a/src/test/java/org/archive/io/RecordingInputStreamTest.java b/src/test/java/org/archive/io/RecordingInputStreamTest.java
new file mode 100644
index 00000000..20a8b8b3
--- /dev/null
+++ b/src/test/java/org/archive/io/RecordingInputStreamTest.java
@@ -0,0 +1,132 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.PipedInputStream;
+import java.io.PipedOutputStream;
+import java.nio.charset.StandardCharsets;
+
+import org.archive.util.TmpDirTestCase;
+
+
+/**
+ * Test cases for RecordingInputStream.
+ *
+ * @author gojomo
+ */
+public class RecordingInputStreamTest extends TmpDirTestCase
+{
+    /** Input fed to the stream under test in the cutoff and timeout cases. */
+    private static final String ALPHABET = "abcdefghijklmnopqrstuvwxyz";
+
+    /*
+     * @see TmpDirTestCase#setUp()
+     */
+    protected void setUp() throws Exception
+    {
+        super.setUp();
+    }
+
+    /**
+     * Test readFullyOrUntil soft (no exception) and hard (exception)
+     * length cutoffs, timeout, and rate-throttling.
+     *
+     * @throws IOException
+     * @throws InterruptedException
+     * @throws RecorderTimeoutException
+     */
+    public void testReadFullyOrUntil() throws RecorderTimeoutException, IOException, InterruptedException
+    {
+        RecordingInputStream ris = new RecordingInputStream(16384, (new File(
+            getTmpDir(), "testReadFullyOrUntil").getAbsolutePath()));
+        ByteArrayInputStream bais = new ByteArrayInputStream(
+            ALPHABET.getBytes(StandardCharsets.US_ASCII));
+        // test soft max
+        ris.open(bais);
+        ris.setLimits(10,0,0);
+        ris.readFullyOrUntil(7);
+        ris.close();
+        ReplayInputStream res = ris.getReplayInputStream();
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        res.readFullyTo(baos);
+        assertEquals("soft max cutoff","abcdefg",
+            new String(baos.toByteArray(), StandardCharsets.US_ASCII));
+        // test hard max
+        bais.reset();
+        baos.reset();
+        ris.open(bais);
+        boolean exceptionThrown = false;
+        try {
+            ris.setLimits(10,0,0);
+            ris.readFullyOrUntil(13);
+        } catch (RecorderLengthExceededException ex) {
+            exceptionThrown = true;
+        }
+        assertTrue("hard max exception",exceptionThrown);
+        ris.close();
+        res = ris.getReplayInputStream();
+        res.readFullyTo(baos);
+        assertEquals("hard max cutoff","abcdefghijk",
+            new String(baos.toByteArray(), StandardCharsets.US_ASCII));
+        // test timeout
+        PipedInputStream pin = new PipedInputStream();
+        PipedOutputStream pout = new PipedOutputStream(pin);
+        ris.open(pin);
+        exceptionThrown = false;
+        // One byte per second, so the 5000 limit set below is hit well
+        // before the 26 bytes have been delivered.
+        trickle(ALPHABET.getBytes(StandardCharsets.US_ASCII),pout);
+        try {
+            ris.setLimits(0,5000,0);
+            ris.readFullyOrUntil(0);
+        } catch (RecorderTimeoutException ex) {
+            exceptionThrown = true;
+        }
+        assertTrue("timeout exception",exceptionThrown);
+        ris.close();
+        // test rate limit
+        // NOTE(review): 10 KiB with a limit of 2 is expected to take at
+        // least 5 s, so the third setLimits argument is presumably KB/sec.
+        bais = new ByteArrayInputStream(new byte[1024*2*5]);
+        ris.open(bais);
+        long startTime = System.currentTimeMillis();
+        ris.setLimits(0,0,2);
+        ris.readFullyOrUntil(0);
+        long endTime = System.currentTimeMillis();
+        long duration = endTime - startTime;
+        assertTrue("read too fast: "+duration,duration>=5000);
+        ris.close();
+    }
+
+    /**
+     * Writes {@code bytes} to {@code pout} from a background thread, one
+     * byte per second, then closes the pipe.
+     *
+     * The thread is a daemon so that an abandoned trickle (the reader gave
+     * up early) cannot keep the JVM alive, and the pipe is closed on every
+     * exit path.
+     *
+     * @param bytes Bytes to deliver.
+     * @param pout Pipe end to write to; closed when the thread finishes.
+     */
+    protected void trickle(final byte[] bytes, final PipedOutputStream pout) {
+        Thread trickler = new Thread("trickle") {
+            @Override
+            public void run() {
+                try {
+                    for (int i = 0; i < bytes.length; i++) {
+                        Thread.sleep(1000);
+                        pout.write(bytes[i]);
+                    }
+                } catch (IOException e) {
+                    // Nothing to do: e.g. the reading end went away.
+                } catch (InterruptedException e) {
+                    // Restore the flag rather than swallowing it.
+                    Thread.currentThread().interrupt();
+                } finally {
+                    try {
+                        pout.close();
+                    } catch (IOException ignored) {
+                        // Best effort; the pipe is being abandoned anyway.
+                    }
+                }
+            }
+        };
+        trickler.setDaemon(true);
+        trickler.start();
+    }
+}
diff --git a/src/test/java/org/archive/io/ReplayCharSequenceTest.java b/src/test/java/org/archive/io/ReplayCharSequenceTest.java
new file mode 100644
index 00000000..9208594a
--- /dev/null
+++ b/src/test/java/org/archive/io/ReplayCharSequenceTest.java
@@ -0,0 +1,391 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.text.NumberFormat;
+import java.util.Date;
+import java.util.Random;
+import java.util.logging.Logger;
+
+import org.archive.util.FileUtils;
+import org.archive.util.TmpDirTestCase;
+
+import com.google.common.base.Charsets;
+
+/**
+ * Test ReplayCharSequences.
+ *
+ * @author stack, gojomo
+ * @version $Revision$, $Date$
+ */
+public class ReplayCharSequenceTest extends TmpDirTestCase
+{
+    /**
+     * Logger.
+     */
+    private static Logger logger =
+        Logger.getLogger("org.archive.io.ReplayCharSequenceFactoryTest");
+
+
+    private static final int SEQUENCE_LENGTH = 127;
+    private static final int MULTIPLIER = 3;
+    private static final int BUFFER_SIZE = SEQUENCE_LENGTH * MULTIPLIER;
+    private static final int INCREMENT = 1;
+
+    /**
+     * Buffer of regular content.
+     */
+    private byte [] regularBuffer = null;
+
+    /*
+     * @see TestCase#setUp()
+     */
+    protected void setUp() throws Exception
+    {
+        super.setUp();
+        this.regularBuffer =
+            fillBufferWithRegularContent(new byte [BUFFER_SIZE]);
+    }
+
+    /**
+     * Decodes the same bytes directly and through a ReplayCharSequence
+     * using the "SJIS" charset and checks that both agree.
+     */
+    public void testShiftjis() throws IOException {
+
+        // Here's the bytes for the JIS encoding of the Japanese form of Nihongo
+        byte[] bytes_nihongo = {
+            (byte) 0x1B, (byte) 0x24, (byte) 0x42, (byte) 0x46,
+            (byte) 0x7C, (byte) 0x4B, (byte) 0x5C, (byte) 0x38,
+            (byte) 0x6C, (byte) 0x1B, (byte) 0x28, (byte) 0x42,
+            (byte) 0x1B, (byte) 0x28, (byte) 0x42 };
+        final String ENCODING = "SJIS";
+        // Here is nihongo converted to JVM encoding.
+        String nihongo = new String(bytes_nihongo, ENCODING);
+
+        RecordingOutputStream ros = writeTestStream(
+            bytes_nihongo,MULTIPLIER,
+            "testShiftjis",MULTIPLIER);
+        // TODO: check for existence of overflow file?
+        ReplayCharSequence rcs = getReplayCharSequence(ros,Charset.forName(ENCODING));
+
+        // Now check that start of the rcs comes back in as nihongo string.
+        String rcsStr = rcs.subSequence(0, nihongo.length()).toString();
+        assertTrue("Nihongo " + nihongo + " does not equal converted string" +
+            " from rcs " + rcsStr,
+            nihongo.equals(rcsStr));
+        // And assert next string is also properly nihongo.
+        if (rcs.length() >= (nihongo.length() * 2)) {
+            rcsStr = rcs.subSequence(nihongo.length(),
+                nihongo.length() + nihongo.length()).toString();
+            assertTrue("Nihongo " + nihongo + " does not equal converted " +
+                " string from rcs (2nd time)" + rcsStr,
+                nihongo.equals(rcsStr));
+        }
+    }
+
+    /** Walks the regular buffer through a sequence built without a charset. */
+    public void testGetReplayCharSequenceByteZeroOffset() throws IOException {
+
+        RecordingOutputStream ros = writeTestStream(
+            regularBuffer,MULTIPLIER,
+            "testGetReplayCharSequenceByteZeroOffset",MULTIPLIER);
+        ReplayCharSequence rcs = getReplayCharSequence(ros);
+
+        for (int i = 0; i < MULTIPLIER; i++) {
+            accessingCharacters(rcs);
+        }
+    }
+
+    /** Builds a sequence over {@code ros}, passing {@code null} as charset. */
+    private ReplayCharSequence getReplayCharSequence(RecordingOutputStream ros) throws IOException {
+        return getReplayCharSequence(ros,null);
+    }
+
+    /**
+     * Builds a GenericReplayCharSequence over what was recorded in
+     * {@code ros}, using half of its buffer length as prefix size.
+     */
+    private ReplayCharSequence getReplayCharSequence(RecordingOutputStream ros, Charset charset) throws IOException {
+        return new GenericReplayCharSequence(ros.getReplayInputStream(),
+            ros.getBufferLength()/2, ros.backingFilename, charset);
+    }
+
+
+    /** Walks the regular buffer through a sequence decoded as UTF-8. */
+    public void testGetReplayCharSequenceMultiByteZeroOffset()
+    throws IOException {
+
+        RecordingOutputStream ros = writeTestStream(
+            regularBuffer,MULTIPLIER,
+            "testGetReplayCharSequenceMultiByteZeroOffset",MULTIPLIER);
+        ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.UTF_8);
+
+        for (int i = 0; i < MULTIPLIER; i++) {
+            accessingCharacters(rcs);
+        }
+    }
+
+    /** Content that fits the memory buffer comes back unchanged. */
+    public void testReplayCharSequenceByteToString() throws IOException {
+        String fileContent = "Some file content";
+        byte [] buffer = fileContent.getBytes();
+        RecordingOutputStream ros = writeTestStream(
+            buffer,1,
+            "testReplayCharSequenceByteToString.txt",0);
+        ReplayCharSequence rcs = getReplayCharSequence(ros);
+        String result = rcs.toString();
+        assertEquals("Strings don't match",fileContent,result);
+    }
+
+    /**
+     * Renders the chars of {@code str} as a brace-enclosed, comma-separated
+     * list of hexadecimal values, for log output.
+     *
+     * @param str String to render; may be null or empty.
+     * @return "null" for a null argument, otherwise the hex listing.
+     */
+    private String toHexString(String str)
+    {
+        if (str == null) {
+            return "null";
+        }
+        StringBuilder buf = new StringBuilder("{ ");
+        // Starting at 0 keeps an empty string from hitting charAt(0).
+        for (int i = 0; i < str.length(); i++) {
+            if (i > 0) {
+                buf.append(", ");
+            }
+            buf.append(Integer.toString(str.charAt(i), 16));
+        }
+        buf.append(" }");
+        return buf.toString();
+    }
+
+    /**
+     * Decodes the same 16 bytes as latin1, windows-1252 and ascii, both
+     * directly and through a ReplayCharSequence, and compares the results.
+     */
+    public void testSingleByteEncodings() throws IOException {
+        byte[] bytes = {
+            (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64,
+            (byte) 0x7d, (byte) 0x7e, (byte) 0x7f, (byte) 0x80,
+            (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84,
+            (byte) 0xfc, (byte) 0xfd, (byte) 0xfe, (byte) 0xff };
+
+        String latin1String = new String(bytes, "latin1");
+        RecordingOutputStream ros = writeTestStream(
+            bytes, 1, "testSingleByteEncodings-latin1.txt", 0);
+        ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.ISO_8859_1);
+        String result = rcs.toString();
+        logger.fine("latin1[0] " + toHexString(latin1String));
+        logger.fine("latin1[1] " + toHexString(result));
+        assertEquals("latin1 strings don't match", latin1String, result);
+
+        String w1252String = new String(bytes, "windows-1252");
+        ros = writeTestStream(
+            bytes, 1, "testSingleByteEncodings-windows-1252.txt", 0);
+        rcs = getReplayCharSequence(ros,Charset.forName("windows-1252"));
+        result = rcs.toString();
+        logger.fine("windows-1252[0] " + toHexString(w1252String));
+        logger.fine("windows-1252[1] " + toHexString(result));
+        assertEquals("windows-1252 strings don't match", w1252String, result);
+
+        String asciiString = new String(bytes, "ascii");
+        ros = writeTestStream(
+            bytes, 1, "testSingleByteEncodings-ascii.txt", 0);
+        rcs = getReplayCharSequence(ros,Charset.forName("ascii"));
+        result = rcs.toString();
+        logger.fine("ascii[0] " + toHexString(asciiString));
+        logger.fine("ascii[1] " + toHexString(result));
+        assertEquals("ascii strings don't match", asciiString, result);
+    }
+
+    /** Content written once to memory and once past it reads back whole. */
+    public void testReplayCharSequenceByteToStringOverflow() throws IOException {
+        String fileContent = "Some file content. "; // ascii
+        byte [] buffer = fileContent.getBytes();
+        RecordingOutputStream ros = writeTestStream(
+            buffer,1,
+            "testReplayCharSequenceByteToStringOverflow.txt",1);
+        String expectedContent = fileContent+fileContent;
+
+        // The string is ascii which is a subset of both these encodings. Use
+        // both encodings because they exercise different code paths. UTF-8 is
+        // decoded to UTF-16 while windows-1252 is memory mapped directly. See
+        // GenericReplayCharSequence
+        ReplayCharSequence rcsUtf8 = getReplayCharSequence(ros,Charsets.UTF_8);
+        ReplayCharSequence rcs1252 = getReplayCharSequence(ros,Charset.forName("windows-1252"));
+
+        String result = rcsUtf8.toString();
+        assertEquals("Strings don't match", expectedContent, result);
+
+        result = rcs1252.toString();
+        assertEquals("Strings don't match", expectedContent, result);
+    }
+
+    /** Opens, reads and closes a sequence over the same recording 3 times. */
+    public void testReplayCharSequenceByteToStringMulti() throws IOException {
+        String fileContent = "Some file content";
+        byte [] buffer = fileContent.getBytes("UTF-8");
+        final int MULTIPLICAND = 10;
+        StringBuilder sb =
+            new StringBuilder(MULTIPLICAND * fileContent.length());
+        for (int i = 0; i < MULTIPLICAND; i++) {
+            sb.append(fileContent);
+        }
+        String expectedResult = sb.toString();
+        RecordingOutputStream ros = writeTestStream(
+            buffer,1,
+            "testReplayCharSequenceByteToStringMulti.txt",MULTIPLICAND-1);
+        for (int i = 0; i < 3; i++) {
+            ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.UTF_8);
+            String result = rcs.toString();
+            assertEquals("Strings don't match", expectedResult, result);
+            rcs.close();
+            System.gc();
+            System.runFinalization();
+        }
+    }
+
+    /**
+     * Disabled (name does not start with "test"): writes more than
+     * Integer.MAX_VALUE bytes and probes the resulting sequence.
+     */
+    public void xestHugeReplayCharSequence() throws IOException {
+        String fileContent = "01234567890123456789";
+        String characterEncoding = "ascii";
+        byte[] buffer = fileContent.getBytes(characterEncoding);
+
+        long reps = (long) Integer.MAX_VALUE / (long) buffer.length + 1000000L;
+
+        logger.info("writing " + (reps * buffer.length)
+            + " bytes to testHugeReplayCharSequence.txt");
+        RecordingOutputStream ros = writeTestStream(buffer, 0,
+            "testHugeReplayCharSequence.txt", reps);
+        ReplayCharSequence rcs = getReplayCharSequence(ros,Charset.forName(characterEncoding));
+
+        if (reps * fileContent.length() > (long) Integer.MAX_VALUE) {
+            assertTrue("ReplayCharSequence has wrong length (length()="
+                + rcs.length() + ") (should be " + Integer.MAX_VALUE + ")",
+                rcs.length() == Integer.MAX_VALUE);
+        } else {
+            assertEquals("ReplayCharSequence has wrong length (length()="
+                + rcs.length() + ") (should be "
+                + (reps * fileContent.length()) + ")",
+                reps * (long) fileContent.length(), (long) rcs.length());
+        }
+
+        // boundary cases or something
+        for (int index : new int[] { 0, rcs.length() / 4, rcs.length() / 2,
+            rcs.length() - 1, rcs.length() / 4 }) {
+            // logger.info("testing char at index=" +
+            // NumberFormat.getInstance().format(index));
+            assertEquals("Characters don't match (index="
+                + NumberFormat.getInstance().format(index) + ")",
+                fileContent.charAt(index % fileContent.length()), rcs
+                .charAt(index));
+        }
+
+        // check that out of bounds indices throw exception. The first
+        // invalid index is length() itself; length() + 1 would wrap to
+        // Integer.MIN_VALUE when length() == Integer.MAX_VALUE.
+        for (int n : new int[] { -1, Integer.MIN_VALUE, rcs.length() }) {
+            try {
+                String message = "rcs.charAt(" + n + ")=" + rcs.charAt(n)
+                    + " ?!? -- expected IndexOutOfBoundsException";
+                logger.severe(message);
+                fail(message);
+            } catch (IndexOutOfBoundsException e) {
+                logger.info("got expected exception: " + e);
+            }
+        }
+
+        // check some characters at random spots & kinda stress test the
+        // system's memory mapping facility
+        Random rand = new Random(0); // seed so we get the same ones each time
+        for (int i = 0; i < 5000; i++) {
+            int index = rand.nextInt(rcs.length());
+            // logger.info(i + ". testing char at index=" +
+            // NumberFormat.getInstance().format(index));
+            assertEquals("Characters don't match (index="
+                + NumberFormat.getInstance().format(index) + ")",
+                fileContent.charAt(index % fileContent.length()), rcs
+                .charAt(index));
+        }
+    }
+
+    /**
+     * Accessing characters test.
+     *
+     * Checks that characters in the rcs are in sequence.
+     *
+     * @param rcs The ReplayCharSequence to try out.
+     */
+    private void accessingCharacters(CharSequence rcs) {
+        long timestamp = (new Date()).getTime();
+        int seeks = 0;
+        for (int i = (INCREMENT * 2); (i + INCREMENT) < rcs.length();
+                i += INCREMENT) {
+            checkCharacter(rcs, i);
+            seeks++;
+            // Step back and re-read what precedes i.
+            for (int j = i - INCREMENT; j < i; j++) {
+                checkCharacter(rcs, j);
+                seeks++;
+            }
+        }
+        // Note that printing out below breaks cruisecontrols drawing
+        // of the xml unit test results because it outputs disallowed
+        // xml characters.
+        logger.fine(rcs + " seeks count " + seeks + " in " +
+            ((new Date().getTime()) - timestamp) + " milliseconds.");
+    }
+
+    /**
+     * Check the character read.
+     *
+     * Throws assertion if not expected result.
+     *
+     * @param rcs ReplayCharSequence to read from.
+     * @param i Character offset.
+     */
+    private void checkCharacter(CharSequence rcs, int i) {
+        int c = rcs.charAt(i);
+        assertTrue("Character " + Integer.toString(c) + " at offset " + i +
+            " unexpected.", (c % SEQUENCE_LENGTH) == (i % SEQUENCE_LENGTH));
+    }
+
+    /**
+     * Records {@code content} repeatedly into a RecordingOutputStream whose
+     * memory buffer holds {@code memReps} copies.
+     *
+     * @param content Bytes written on each repetition.
+     * @param memReps Number of copies the buffer is sized for.
+     * @param baseName Backing file name, resolved against the tmp dir.
+     * @param fileReps Number of further copies written after those.
+     * @return RecordingOutputStream, already closed.
+     * @throws IOException
+     */
+    private RecordingOutputStream writeTestStream(byte[] content,
+            int memReps, String baseName, long fileReps) throws IOException {
+        String backingFilename = FileUtils.maybeRelative(getTmpDir(),baseName).getAbsolutePath();
+        RecordingOutputStream ros = new RecordingOutputStream(
+            content.length * memReps,
+            backingFilename);
+        ros.open();
+        ros.markMessageBodyBegin();
+        for(long i = 0; i < (memReps+fileReps); i++) {
+            // fill buffer (repeat MULTIPLIER times) and
+            // overflow to disk (also MULTIPLIER times)
+            ros.write(content);
+        }
+        ros.close();
+        return ros;
+    }
+
+
+    /**
+     * Fill a buffer w/ regular progression of single-byte
+     * (and <= 127) characters.
+     * @param buffer Buffer to fill.
+     * @return The buffer we filled.
+     */
+    private byte [] fillBufferWithRegularContent(byte [] buffer) {
+        int index = 0;
+        for (int i = 0; i < buffer.length; i++) {
+            buffer[i] = (byte) (index & 0x00ff);
+            index++;
+            if (index >= SEQUENCE_LENGTH) {
+                // Reset the index.
+                index = 0;
+            }
+        }
+        return buffer;
+    }
+
+    public void testCheckParameters()
+    {
+        // TODO.
+    }
+}
diff --git a/src/test/java/org/archive/io/RepositionableInputStreamTest.java b/src/test/java/org/archive/io/RepositionableInputStreamTest.java
new file mode 100644
index 00000000..1c7cc74c
--- /dev/null
+++ b/src/test/java/org/archive/io/RepositionableInputStreamTest.java
@@ -0,0 +1,70 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.PrintWriter;
+
+import org.archive.util.TmpDirTestCase;
+
+/**
+ * Checks reading and repositioning of a RepositionableInputStream over a
+ * file made of 100 copies of {@link #LINE}.
+ */
+public class RepositionableInputStreamTest extends TmpDirTestCase {
+    private File testFile;
+    private static final String LINE = "0123456789abcdefghijklmnopqrstuv";
+
+    /** Creates the test file in the tmp dir. */
+    protected void setUp() throws Exception {
+        super.setUp();
+        this.testFile = new File(getTmpDir(), this.getClass().getName());
+        PrintWriter pw = new PrintWriter(new FileOutputStream(testFile));
+        try {
+            for (int i = 0; i < 100; i++) {
+                pw.print(LINE);
+            }
+            // PrintWriter never throws on write errors; ask explicitly so a
+            // truncated fixture fails here rather than in the test.
+            assertFalse("failed writing " + testFile, pw.checkError());
+        } finally {
+            pw.close();
+        }
+    }
+
+    protected void tearDown() throws Exception {
+        super.tearDown();
+    }
+
+    /**
+     * Reads, rewinds and re-reads through the stream, checking content and
+     * reported position after each step.
+     */
+    public void testname() throws Exception {
+        // Kept separately so the file handle can be released in finally.
+        FileInputStream in = new FileInputStream(this.testFile);
+        try {
+            // Make buffer awkward size so we run into buffers spanning issues.
+            RepositionableInputStream ris =
+                new RepositionableInputStream(in, 57);
+            int c = ris.read();
+            assertEquals(1, ris.position());
+            ris.read();
+            ris.position(0);
+            assertEquals(0, ris.position());
+            int c1 = ris.read();
+            assertEquals(c, c1);
+            ris.position(0);
+            byte [] bytes = new byte[LINE.length()];
+            long offset = 0;
+            for (int i = 0; i < 10; i++) {
+                int count = ris.read(bytes, 0, LINE.length());
+                assertEquals("short read", LINE.length(), count);
+                assertEquals(LINE, new String(bytes));
+                offset += LINE.length();
+                assertEquals(offset, ris.position());
+            }
+            long p = ris.position();
+            ris.position(p - LINE.length());
+            assertEquals(p - LINE.length(), ris.position());
+            c = ris.read();
+            // Every LINE starts with the same char as the file itself.
+            assertEquals(c1, c);
+        } finally {
+            in.close();
+        }
+    }
+}
diff --git a/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java b/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java
new file mode 100644
index 00000000..f0be6506
--- /dev/null
+++ b/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java
@@ -0,0 +1,122 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.arc;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.util.Arrays;
+
+import org.archive.io.WriterPool;
+import org.archive.io.WriterPoolMember;
+import org.archive.io.WriterPoolSettings;
+import org.archive.util.TmpDirTestCase;
+
+
+/**
+ * Test ARCWriterPool
+ */
+@SuppressWarnings("deprecation")
+public class ARCWriterPoolTest extends TmpDirTestCase {
+    private static final String PREFIX = "TEST";
+    private static final int MAX_ACTIVE = 3;
+    private static final int MAX_WAIT_MILLISECONDS = 100;
+    private static final String CONTENT = "Any old content";
+
+    /** Borrows every writer, writes through each, then returns them all. */
+    public void testARCWriterPool()
+    throws Exception {
+        cleanUpOldFiles(PREFIX);
+        WriterPool pool = newPool();
+        WriterPoolMember [] writers = new WriterPoolMember[MAX_ACTIVE];
+        ByteArrayOutputStream baos = newContentStream();
+        borrowAndWrite(pool, writers, baos);
+
+        // Pool is maxed out. New behavior is that additional requests
+        // block as long as necessary -- so no longer testing for timeout/
+        // exception
+
+        returnAllAndClose(pool, writers);
+    }
+
+    /**
+     * Invalidates one borrowed writer, returns the others, and checks the
+     * pool can again hand out and take back MAX_ACTIVE writers.
+     */
+    public void testInvalidate() throws Exception {
+        cleanUpOldFiles(PREFIX);
+        WriterPool pool = newPool();
+        WriterPoolMember [] writers = new WriterPoolMember[MAX_ACTIVE];
+        ByteArrayOutputStream baos = newContentStream();
+        borrowAndWrite(pool, writers, baos);
+
+        int last = pool.getNumActive() - 1;
+        WriterPoolMember writer2Invalidate = writers[last];
+        writers[last] = null;
+        pool.invalidateFile(writer2Invalidate);
+        for (int i = 0; i < (MAX_ACTIVE - 1); i++) {
+            if (writers[i] != null) {
+                pool.returnFile(writers[i]);
+            }
+        }
+
+        borrowAndWrite(pool, writers, baos);
+        returnAllAndClose(pool, writers);
+    }
+
+    /** Creates the pool under test with compressed-output settings. */
+    private WriterPool newPool() {
+        return new ARCWriterPool(getSettings(true),
+            MAX_ACTIVE, MAX_WAIT_MILLISECONDS);
+    }
+
+    /** Returns a stream pre-filled with the bytes of CONTENT. */
+    private ByteArrayOutputStream newContentStream() throws Exception {
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        baos.write(CONTENT.getBytes());
+        return baos;
+    }
+
+    /**
+     * Fills {@code writers} from the pool, checking the active count after
+     * each borrow and writing one record through each writer.
+     */
+    private void borrowAndWrite(WriterPool pool, WriterPoolMember [] writers,
+            ByteArrayOutputStream baos) throws Exception {
+        for (int i = 0; i < writers.length; i++) {
+            writers[i] = pool.borrowFile();
+            assertEquals("Number active", i + 1, pool.getNumActive());
+            ARCWriter writer = (ARCWriter)writers[i];
+            writer.write("http://one.two.three", "no-type",
+                "0.0.0.0", 1234567890, CONTENT.length(), baos);
+        }
+    }
+
+    /**
+     * Returns the writers last to first, checking active and idle counts
+     * after each return, then closes the pool.
+     */
+    private void returnAllAndClose(WriterPool pool,
+            WriterPoolMember [] writers) throws Exception {
+        for (int i = writers.length - 1; i >= 0; i--) {
+            pool.returnFile(writers[i]);
+            assertEquals("Number active", i, pool.getNumActive());
+            assertEquals("Number idle", MAX_ACTIVE - pool.getNumActive(),
+                pool.getNumIdle());
+        }
+        pool.close();
+    }
+
+    /** Settings writing PREFIX-named ARCs into the tmp dir. */
+    private WriterPoolSettings getSettings(final boolean isCompressed) {
+        File [] files = {getTmpDir()};
+        return new WriterPoolSettingsData(
+            PREFIX,
+            "${prefix}-${timestamp17}-${serialno}-${heritrix.hostname}",
+            ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE,
+            isCompressed,
+            Arrays.asList(files),
+            null);
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/org/archive/io/arc/ARCWriterTest.java b/src/test/java/org/archive/io/arc/ARCWriterTest.java
new file mode 100644
index 00000000..f6e2bf6a
--- /dev/null
+++ b/src/test/java/org/archive/io/arc/ARCWriterTest.java
@@ -0,0 +1,699 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.archive.io.arc; + +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.PrintStream; +import java.util.Arrays; +import java.util.Date; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.io.input.NullInputStream; +import org.apache.commons.io.output.NullOutputStream; +import org.archive.io.ArchiveRecord; +import org.archive.io.ArchiveRecordHeader; +import org.archive.io.ReplayInputStream; +import org.archive.io.WriterPoolMember; +import org.archive.io.WriterPoolSettings; +import org.archive.util.ArchiveUtils; +import org.archive.util.TmpDirTestCase; + +import com.google.common.io.Closeables; + + +/** + * Test ARCWriter class. + * + * This code exercises ARCWriter AND ARCReader. First it writes ARCs w/ + * ARCWriter. Then it validates what was written w/ ARCReader. + * + * @author stack + */ +public class ARCWriterTest +extends TmpDirTestCase implements ARCConstants { + /** + * Utility class for writing bad ARCs (with trailing junk) + */ + public class CorruptibleARCWriter extends ARCWriter { + byte[] endJunk = null; + + public CorruptibleARCWriter(AtomicInteger serial_no, WriterPoolSettings settings) { + super(serial_no, settings); + } + + @Override + protected void postWriteRecordTasks() throws IOException { + if (endJunk != null) { + this.write(endJunk); + } + super.postWriteRecordTasks(); + } + + public void setEndJunk(byte[] b) throws IOException { + this.endJunk = b; + } + } + + /** + * Suffix to use for ARC files made by JUNIT. 
+ */ + private static final String SUFFIX = "JUNIT"; + + private static final String SOME_URL = "http://www.archive.org/test/"; + + + private static final AtomicInteger SERIAL_NO = new AtomicInteger(); + + /* + * @see TestCase#setUp() + */ + protected void setUp() throws Exception { + super.setUp(); + } + + /* + * @see TestCase#tearDown() + */ + protected void tearDown() throws Exception { + super.tearDown(); + } + + protected static String getContent() { + return getContent(null); + } + + protected static String getContent(String indexStr) { + String page = (indexStr != null)? "Page #" + indexStr: "Some Page"; + return "HTTP/1.1 200 OK\r\n" + + "Content-Type: text/html\r\n\r\n" + + "" + page + + "" + + "" + page + + ""; + } + + @SuppressWarnings("deprecation") + protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index) + throws IOException { + String indexStr = Integer.toString(index); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + // Start the record with an arbitrary 14-digit date per RFC2540 + String now = ArchiveUtils.get14DigitDate(); + int recordLength = 0; + byte[] record = (getContent(indexStr)).getBytes(); + recordLength += record.length; + baos.write(record); + // Add the newline between records back in + baos.write("\n".getBytes()); + recordLength += 1; + arcWriter.write("http://www.one.net/id=" + indexStr, "text/html", + "0.1.2.3", Long.parseLong(now), recordLength, baos); + return recordLength; + } + + private File writeRecords(String baseName, boolean compress, + long maxSize, int recordCount) + throws IOException { + cleanUpOldFiles(baseName); + File [] files = {getTmpDir()}; + ARCWriter arcWriter = + new ARCWriter( + SERIAL_NO, + new WriterPoolSettingsData( + baseName, + "${prefix}-"+SUFFIX, + maxSize, + compress, + Arrays.asList(files), + null)); + assertNotNull(arcWriter); + for (int i = 0; i < recordCount; i++) { + writeRandomHTTPRecord(arcWriter, i); + } + arcWriter.close(); + assertTrue("Doesn't exist: " + + 
arcWriter.getFile().getAbsolutePath(), + arcWriter.getFile().exists()); + return arcWriter.getFile(); + } + + private void validate(File arcFile, int recordCount) + throws FileNotFoundException, IOException { + ARCReader reader = ARCReaderFactory.get(arcFile); + assertNotNull(reader); + List metaDatas = null; + if (recordCount == -1) { + metaDatas = reader.validate(); + } else { + metaDatas = reader.validate(recordCount); + } + reader.close(); + // Now, run through each of the records doing absolute get going from + // the end to start. Reopen the arc so no context between this test + // and the previous. + + for (int i = metaDatas.size() - 1; i >= 0; i--) { + reader = ARCReaderFactory.get(arcFile); + ARCRecordMetaData meta = (ARCRecordMetaData)metaDatas.get(i); + ArchiveRecord r = reader.get(meta.getOffset()); + String mimeType = r.getHeader().getMimetype(); + assertTrue("Record is bogus", + mimeType != null && mimeType.length() > 0); + reader.close(); + } + assertEquals("Metadata count not as expected",recordCount, metaDatas.size()); + for (Iterator i = metaDatas.iterator(); i.hasNext();) { + ARCRecordMetaData r = (ARCRecordMetaData)i.next(); + assertTrue("Record is empty", r.getLength() > 0); + } + } + + public void testCheckARCFileSize() + throws IOException { + runCheckARCFileSizeTest("checkARCFileSize", false); + } + + public void testCheckARCFileSizeCompressed() + throws IOException { + runCheckARCFileSizeTest("checkARCFileSize", true); + } + + public void testWriteRecord() throws IOException { + final int recordCount = 2; + File arcFile = writeRecords("writeRecord", false, + DEFAULT_MAX_ARC_FILE_SIZE, recordCount); + validate(arcFile, recordCount + 1); // Header record. + } + + public void testRandomAccess() throws IOException { + final int recordCount = 3; + File arcFile = writeRecords("writeRecord", true, + DEFAULT_MAX_ARC_FILE_SIZE, recordCount); + ARCReader reader = ARCReaderFactory.get(arcFile); + // Get to second record. Get its offset for later use. 
+ boolean readFirst = false; + String url = null; + long offset = -1; + long totalRecords = 0; + boolean readSecond = false; + for (final Iterator i = reader.iterator(); i.hasNext(); totalRecords++) { + ARCRecord ar = (ARCRecord)i.next(); + if (!readFirst) { + readFirst = true; + continue; + } + if (!readSecond) { + url = ar.getMetaData().getUrl(); + offset = ar.getMetaData().getOffset(); + readSecond = true; + } + } + reader.close(); + + reader = ARCReaderFactory.get(arcFile, offset); + ArchiveRecord ar = reader.get(); + assertEquals(ar.getHeader().getUrl(), url); + ar.close(); + reader.close(); + + // Get reader again. See how iterator works with offset + reader = ARCReaderFactory.get(arcFile, offset); + int count = 0; + for (final Iterator i = reader.iterator(); i.hasNext(); i.next()) { + count++; + } + reader.close(); + assertEquals(totalRecords - 1, count); + } + + public void testWriteRecordCompressed() throws IOException { + final int recordCount = 2; + File arcFile = writeRecords("writeRecordCompressed", true, + DEFAULT_MAX_ARC_FILE_SIZE, recordCount); + validate(arcFile, recordCount + 1 /*Header record*/); + } + + public void testWriteGiantRecord() throws IOException { + PrintStream dummyStream = new PrintStream(new NullOutputStream()); + ARCWriter arcWriter = + new ARCWriter( + SERIAL_NO, + dummyStream, + new File("dummy"), + new WriterPoolSettingsData( + "", + "", + -1, + false, + null, + null)); + assertNotNull(arcWriter); + + // Start the record with an arbitrary 14-digit date per RFC2540 + long now = System.currentTimeMillis(); + long recordLength = org.apache.commons.io.FileUtils.ONE_GB * 3; + + arcWriter.write("dummy:uri", "application/octet-stream", + "0.1.2.3", now, recordLength, new NullInputStream(recordLength)); + arcWriter.close(); + } + + private void runCheckARCFileSizeTest(String baseName, boolean compress) + throws FileNotFoundException, IOException { + File f = writeRecords(baseName, compress, 1024, 15); + validate(f, 15+1); + } + + 
protected CorruptibleARCWriter createARCWriter(String name, boolean compress) { + File [] files = {getTmpDir()}; + return new CorruptibleARCWriter( + SERIAL_NO, + new WriterPoolSettingsData( + name, + "${prefix}-"+SUFFIX, + DEFAULT_MAX_ARC_FILE_SIZE, + compress, + Arrays.asList(files), + null)); + } + + protected static ByteArrayInputStream getBais(String str) + throws IOException { + return new ByteArrayInputStream(str.getBytes()); + } + + /** + * Writes a record, suppressing normal length-checks (so that + * intentionally malformed records may be written). + */ + protected static void writeRecord(ARCWriter writer, String url, + String type, int len, ByteArrayInputStream bais) + throws IOException { + writer.write(url, type, "192.168.1.1", (new Date()).getTime(), len, + bais, false); + } + + protected int iterateRecords(ARCReader r) + throws IOException { + int count = 0; + for (Iterator i = r.iterator(); i.hasNext();) { + ARCRecord rec = (ARCRecord)i.next(); + rec.close(); + if (count != 0) { + assertTrue("Unexpected URL " + rec.getMetaData().getUrl(), + rec.getMetaData().getUrl().startsWith(SOME_URL)); + } + count++; + } + return count; + } + + protected CorruptibleARCWriter createArcWithOneRecord(String name, + boolean compressed) + throws IOException { + CorruptibleARCWriter writer = createARCWriter(name, compressed); + String content = getContent(); + writeRecord(writer, SOME_URL, "text/html", + content.length(), getBais(content)); + return writer; + } + + public void testSpaceInURL() { + String eMessage = null; + try { + holeyUrl("testSpaceInURL", false, " "); + } catch (IOException e) { + eMessage = e.getMessage(); + } + assertTrue("Didn't get expected exception: " + eMessage, + eMessage.startsWith("Metadata line doesn't match")); + } + + public void testTabInURL() { + String eMessage = null; + try { + holeyUrl("testTabInURL", false, "\t"); + } catch (IOException e) { + eMessage = e.getMessage(); + } + assertTrue("Didn't get expected exception: " + 
eMessage, + eMessage.startsWith("Metadata line doesn't match")); + } + + protected void holeyUrl(String name, boolean compress, String urlInsert) + throws IOException { + ARCWriter writer = null; + try { + writer = createArcWithOneRecord(name, compress); + // Add some bytes on the end to mess up the record. + String content = getContent(); + writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html", + content.length(), getBais(content)); + } finally { + Closeables.close(writer, true); + } + } + +// If uncompressed, length has to be right or parse will fail. +// +// public void testLengthTooShort() throws IOException { +// lengthTooShort("testLengthTooShort-" + PREFIX, false); +// } + + public void testLengthTooShortCompressed() throws IOException { + lengthTooShort("testLengthTooShortCompressed", true, false); + } + + public void testLengthTooShortCompressedStrict() + throws IOException { + String eMessage = null; + try { + lengthTooShort("testLengthTooShortCompressedStrict", + true, true); + } catch (RuntimeException e) { + eMessage = e.getMessage(); + } + assertTrue("Didn't get expected exception: " + eMessage, + eMessage.startsWith("java.io.IOException: Record STARTING at")); + } + + protected void lengthTooShort(String name, boolean compress, boolean strict) + throws IOException { + CorruptibleARCWriter writer = null; + try { + writer = createArcWithOneRecord(name, compress); + // Add some bytes on the end to mess up the record. + String content = getContent(); + ByteArrayInputStream bais = getBais(content+"SOME TRAILING BYTES"); + writeRecord(writer, SOME_URL, "text/html", + content.length(), bais); + writer.setEndJunk("SOME TRAILING BYTES".getBytes()); + writeRecord(writer, SOME_URL, "text/html", + content.length(), getBais(content)); + } finally { + Closeables.close(writer, true); + } + + // Catch System.err into a byte stream. 
+ ByteArrayOutputStream os = new ByteArrayOutputStream(); + PrintStream origErr = System.err; + ARCReader r = null; + try { + System.setErr(new PrintStream(os)); + + r = ARCReaderFactory.get(writer.getFile()); + r.setStrict(strict); + int count = iterateRecords(r); + assertTrue("Count wrong " + count, count == 4); + + // Make sure we get the warning string which complains about the + // trailing bytes. + String err = os.toString(); + assertTrue("No message " + err, err.startsWith("WARNING") && + (err.indexOf("Record STARTING at") > 0)); + r.close(); + } finally { + Closeables.close(r, true); + System.setErr(origErr); + } + } + +// If uncompressed, length has to be right or parse will fail. +// +// public void testLengthTooLong() +// throws IOException { +// lengthTooLong("testLengthTooLongCompressed-" + PREFIX, +// false, false); +// } + + public void testLengthTooLongCompressed() + throws IOException { + lengthTooLong("testLengthTooLongCompressed", + true, false); + } + + public void testLengthTooLongCompressedStrict() { + String eMessage = null; + try { + lengthTooLong("testLengthTooLongCompressed", + true, true); + } catch (IOException e) { + eMessage = e.getMessage(); + } + assertTrue("Didn't get expected exception: " + eMessage, + eMessage.startsWith("Premature EOF before end-of-record")); + } + + protected void lengthTooLong(String name, boolean compress, + boolean strict) + throws IOException { + ARCWriter writer = createArcWithOneRecord(name, compress); + // Add a record with a length that is too long. + String content = getContent(); + writeRecord(writer, SOME_URL+"2", "text/html", + content.length() + 10, getBais(content)); + writeRecord(writer, SOME_URL+"3", "text/html", + content.length(), getBais(content)); + writer.close(); + + // Catch System.err. 
+ ByteArrayOutputStream os = new ByteArrayOutputStream(); + + PrintStream origErr = System.err; + ARCReader r = null; + try { + System.setErr(new PrintStream(os)); + + r = ARCReaderFactory.get(writer.getFile()); + r.setStrict(strict); + int count = iterateRecords(r); + assertTrue("Count wrong " + count, count == 4); + + // Make sure we get the warning string which complains about the + // trailing bytes. + String err = os.toString(); + assertTrue("No message " + err, + err.startsWith("WARNING Premature EOF before end-of-record")); + } finally { + Closeables.close(r, true); + System.setErr(origErr); + } + } + + public void testGapError() throws IOException { + ARCWriter writer = createArcWithOneRecord("testGapError", true); + String content = getContent(); + // Make a 'weird' RIS that returns bad 'remaining' length + // awhen remaining should be 0 + ReplayInputStream ris = new ReplayInputStream(content.getBytes(), + content.length(), null) { + public long remaining() { + return (super.remaining()==0) ? -1 : super.remaining(); + } + }; + String message = null; + try { + writer.write(SOME_URL, "text/html", "192.168.1.1", + (new Date()).getTime(), content.length(), ris); + } catch (IOException e) { + message = e.getMessage(); + } finally { + IOUtils.closeQuietly(ris); + } + writer.close(); + assertTrue("No gap when should be", + message != null && + message.indexOf("Gap between expected and actual") >= 0); + } + + /** + * Write an arc file for other tests to use. + * @param arcdir Directory to write to. + * @param compress True if file should be compressed. + * @return ARC written. 
+ * @throws IOException + */ + public static File createARCFile(File arcdir, boolean compress) + throws IOException { + File [] files = {arcdir}; + ARCWriter writer = new ARCWriter(SERIAL_NO, + new WriterPoolSettingsData( + "", + "test", + DEFAULT_MAX_ARC_FILE_SIZE, + compress, + Arrays.asList(files), + null)); + String content = getContent(); + writeRecord(writer, SOME_URL, "text/html", content.length(), + getBais(content)); + writer.close(); + return writer.getFile(); + } + +// public void testSpeed() throws IOException { +// ARCWriter writer = createArcWithOneRecord("speed", true); +// // Add a record with a length that is too long. +// String content = getContent(); +// final int count = 100000; +// logger.info("Starting speed write of " + count + " records."); +// for (int i = 0; i < count; i++) { +// writeRecord(writer, SOME_URL, "text/html", content.length(), +// getBaos(content)); +// } +// writer.close(); +// logger.info("Finished speed write test."); +// } + + + public void testValidateMetaLine() throws Exception { + final String line = "http://www.aandw.net/images/walden2.png " + + "128.197.34.86 20060111174224 image/png 2160"; + ARCWriter w = createARCWriter("testValidateMetaLine", true); + try { + w.validateMetaLine(line); + w.validateMetaLine(line + LINE_SEPARATOR); + w.validateMetaLine(line + "\\r\\n"); + } finally { + w.close(); + } + } + + public void testArcRecordOffsetReads() throws Exception { + ARCReader r = getSingleRecordReader("testArcRecordInBufferStream"); + ARCRecord ar = getSingleRecord(r); + // Now try getting some random set of bytes out of it + // at an odd offset (used to fail because we were + // doing bad math to find where in buffer to read). 
+ final byte[] buffer = new byte[17]; + final int maxRead = 4; + int totalRead = 0; + while (totalRead < maxRead) { + totalRead = totalRead + + ar.read(buffer, 13 + totalRead, maxRead - totalRead); + assertTrue(totalRead > 0); + } + r.close(); + } + + // available should always be >= 0; extra read()s should all give EOF + public void testArchiveRecordAvailableConsistent() throws Exception { + // first test reading byte-at-a-time via no-param read() + ARCReader r = getSingleRecordReader("testArchiveRecordAvailableConsistent"); + ARCRecord record = getSingleRecord(r); + int c = record.read(); + while(c>=0) { + c = record.read(); + } + // consecutive reads after EOR should always give -1, still show zero available() + for (int i=0; i<5; i++) { + assertTrue("available negative:"+record.available(), record.available()>=0); + assertEquals(-1, record.read()); + } + r.close(); + } + + // should always give -1 on repeated reads past EOR + public void testArchiveRecordEORConsistent() throws Exception { + ARCReader r = getSingleRecordReader("testArchiveRecordEORConsistent"); + ARCRecord record = getSingleRecord(r); + this.readToEOS(record); + // consecutive reads after EOR should always give -1 + for (int i=0; i<5; i++) { + assertEquals(-1, record.read(new byte[1])); + } + r.close(); + } + + // should not throw premature EOF when wrapped with BufferedInputStream + // [HER-1450] showed this was the case using Apache Tika + public void testArchiveRecordMarkSupport() throws Exception { + ARCReader r = getSingleRecordReader("testArchiveRecordMarkSupport"); + ARCRecord record = getSingleRecord(r); + record.setStrict(true); + // ensure mark support + InputStream stream = new BufferedInputStream(record); + if (stream.markSupported()) { + for (int i=0; i<3; i++) { + this.readToEOS(stream); + stream.mark(stream.available()); + stream.reset(); + } + stream.close(); + } + r.close(); + } + + /** + * Test a particular style of using the reader iterator. 
(Should + * possibly be on a reader-centric test class, but the best setup + * functionality is here.) + * + * @throws IOException + */ + public void testReadIterator() throws IOException { + final int recordCount = 3; + File arcFile = writeRecords("writeRecord", true, + DEFAULT_MAX_ARC_FILE_SIZE, recordCount); + ARCReader reader = ARCReaderFactory.get(arcFile); + Iterator it = reader.iterator(); + while (it.hasNext()) { + ArchiveRecord next = it.next(); + next.close(); + } + reader.close(); + } + + protected void readToEOS(InputStream in) throws Exception { + byte [] buf = new byte[1024]; + int read = 0; + while (read >= 0) { + read = in.read(buf); + // System.out.println("readToEOS read " + read + " bytes"); + } + } + + protected ARCReader getSingleRecordReader(String name) throws Exception { + // Get an ARC with one record. + WriterPoolMember w = createArcWithOneRecord(name, true); + w.close(); + // Get reader on said ARC. + ARCReader r = ARCReaderFactory.get(w.getFile()); + return r; + } + + protected ARCRecord getSingleRecord(ARCReader r) { + final Iterator i = r.iterator(); + // Skip first ARC meta record. + i.next(); + i.hasNext(); + // Now we're at first and only record in ARC. + return (ARCRecord) i.next(); + } +} diff --git a/src/test/java/org/archive/io/warc/WARCWriterTest.java b/src/test/java/org/archive/io/warc/WARCWriterTest.java new file mode 100644 index 00000000..35c68714 --- /dev/null +++ b/src/test/java/org/archive/io/warc/WARCWriterTest.java @@ -0,0 +1,512 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.warc; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.URI; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +import org.archive.io.ArchiveRecord; +import org.archive.io.ArchiveRecordHeader; +import org.archive.io.UTF8Bytes; +import org.archive.io.WriterPoolMember; +import org.archive.uid.RecordIDGenerator; +import org.archive.uid.UUIDGenerator; +import org.archive.util.ArchiveUtils; +import org.archive.util.TmpDirTestCase; +import org.archive.util.anvl.ANVLRecord; + +/** + * Test Writer and Reader. + * @author stack + * @version $Date: 2006-08-29 19:35:48 -0700 (Tue, 29 Aug 2006) $ $Version$ + */ +public class WARCWriterTest +extends TmpDirTestCase implements WARCConstants { + + private static final AtomicInteger SERIAL_NO = new AtomicInteger(); + + RecordIDGenerator generator = new UUIDGenerator(); + + /** + * Prefix to use for ARC files made by JUNIT. 
+ */ + private static final String SUFFIX = "JUNIT"; + + private static final String SOME_URL = "http://www.archive.org/test/"; + + @SuppressWarnings("unchecked") + public void testCheckHeaderLineValue() throws Exception { + WARCWriter writer = new WARCWriter( + SERIAL_NO, + new WARCWriterPoolSettingsData( + "","test",1,false,Collections.EMPTY_LIST,Collections.EMPTY_LIST,generator)); + writer.checkHeaderValue("one"); + IllegalArgumentException exception = null; + try { + writer.checkHeaderValue("with space"); + } catch(IllegalArgumentException e) { + exception = e; + } + assertNotNull(exception); + exception = null; + try { + writer.checkHeaderValue("with\0x0000controlcharacter"); + } catch(IllegalArgumentException e) { + exception = e; + } + writer.close(); + assertNotNull(exception); + } + + @SuppressWarnings("unchecked") + public void testMimetypes() throws IOException { + WARCWriter writer = new WARCWriter(SERIAL_NO, + new WARCWriterPoolSettingsData( + "m","testM",1,false,Collections.EMPTY_LIST,Collections.EMPTY_LIST,generator)); + writer.checkHeaderLineMimetypeParameter("text/xml"); + writer.checkHeaderLineMimetypeParameter("text/xml+rdf"); + assertEquals(writer.checkHeaderLineMimetypeParameter( + "text/plain; charset=SHIFT-JIS"), "text/plain; charset=SHIFT-JIS"); + assertEquals(writer.checkHeaderLineMimetypeParameter( + "multipart/mixed; \r\n boundary=\"simple boundary\""), + "multipart/mixed; boundary=\"simple boundary\""); + } + + public void testWriteRecord() throws IOException { + File [] files = {getTmpDir()}; + + // Write uncompressed. + WARCWriter writer = + new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData( + this.getClass().getName(), "templateWR1", -1, false, Arrays.asList(files), null, generator)); + + writeFile(writer); + writer.close(); + + // Write compressed. 
+ writer = new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData( + this.getClass().getName(), "templateWR2", -1, true, Arrays.asList(files), null, generator)); + + writeFile(writer); + writer.close(); + } + + private void writeFile(final WARCWriter writer) + throws IOException { + try { + writeWarcinfoRecord(writer); + writeBasicRecords(writer); + } finally { + writer.close(); + writer.getFile().delete(); + } + } + + private void writeWarcinfoRecord(WARCWriter writer) + throws IOException { + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.warcinfo); + recordInfo.setUrl(null); + recordInfo.setCreate14DigitDate(ArchiveUtils.getLog14Date()); + recordInfo.setMimetype(ANVLRecord.MIMETYPE); + recordInfo.setExtraHeaders(null); + recordInfo.setEnforceLength(true); + + ANVLRecord meta = new ANVLRecord(); + meta.addLabelValue("size", "1G"); + meta.addLabelValue("operator", "igor"); + byte [] bytes = meta.getUTF8Bytes(); + recordInfo.setContentStream(new ByteArrayInputStream(bytes)); + recordInfo.setContentLength((long) bytes.length); + + final URI recordid = writer.generateRecordId(WARCWriter.TYPE, WARCRecordType.warcinfo.toString()); + recordInfo.setRecordId(recordid); + + writer.writeRecord(recordInfo); + } + + protected void writeBasicRecords(final WARCWriter writer) + throws IOException { + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.metadata); + recordInfo.setUrl("http://www.archive.org/"); + recordInfo.setCreate14DigitDate(ArchiveUtils.get14DigitDate()); + recordInfo.setMimetype("no/type"); + recordInfo.setEnforceLength(true); + + ANVLRecord headerFields = new ANVLRecord(); + headerFields.addLabelValue("x", "y"); + headerFields.addLabelValue("a", "b"); + recordInfo.setExtraHeaders(headerFields); + + URI rid = (new UUIDGenerator()).getQualifiedRecordID(TYPE, WARCRecordType.metadata.toString()); + recordInfo.setRecordId(rid); + + final String content = "Any old content."; + for (int i = 
0; i < 10; i++) { + String body = i + ". " + content; + byte [] bodyBytes = body.getBytes(UTF8Bytes.UTF8); + recordInfo.setContentStream(new ByteArrayInputStream(bodyBytes)); + recordInfo.setContentLength((long)bodyBytes.length); + writer.writeRecord(recordInfo); + } + } + + /** + * @return Generic HTML Content. + */ + protected static String getContent() { + return getContent(null); + } + + /** + * @return Generic HTML Content with mention of passed indexStr + * in title and body. + */ + protected static String getContent(String indexStr) { + String page = (indexStr != null)? "Page #" + indexStr: "Some Page"; + return "HTTP/1.1 200 OK\r\n" + + "Content-Type: text/html\r\n\r\n" + + "" + page + + "" + + "" + page + + ""; + } + + /** + * Write random HTML Record. + * @param w Where to write. + * @param index An index to put into content. + * @return Length of record written. + * @throws IOException + */ + protected int writeRandomHTTPRecord(WARCWriter w, int index) + throws IOException { + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.resource); + recordInfo.setCreate14DigitDate(ArchiveUtils.get14DigitDate()); + recordInfo.setMimetype("text/html; charset=UTF-8"); + recordInfo.setRecordId(w.generateRecordId(null)); + recordInfo.setEnforceLength(true); + + String indexStr = Integer.toString(index); + recordInfo.setUrl("http://www.one.net/id=" + indexStr); + + byte[] record = (getContent(indexStr)).getBytes(); + recordInfo.setContentLength((long) record.length); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + baos.write(record); + recordInfo.setContentStream(new ByteArrayInputStream(baos.toByteArray())); + + // Add named fields for ip, checksum, and relate the metadata + // and request to the resource field. + recordInfo.addExtraHeader(NAMED_FIELD_IP_LABEL, "127.0.0.1"); + + w.writeRecord(recordInfo); + return record.length; + } + + /** + * Fill a WARC with HTML Records. + * @param baseName WARC basename. 
+ * @param compress Whether to compress or not. + * @param maxSize Maximum WARC size. + * @param recordCount How many records. + * @return The written file. + * @throws IOException + */ + private File writeRecords(String baseName, boolean compress, + int maxSize, int recordCount) + throws IOException { + cleanUpOldFiles(baseName); + File [] files = {getTmpDir()}; + WARCWriter w = new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData( + baseName + '-' + SUFFIX, "${prefix}", maxSize, compress, Arrays.asList(files), null, generator)); + + assertNotNull(w); + for (int i = 0; i < recordCount; i++) { + writeRandomHTTPRecord(w, i); + } + w.close(); + assertTrue("Doesn't exist: " + w.getFile().getAbsolutePath(), + w.getFile().exists()); + return w.getFile(); + } + + /** + * Run validation of passed file. + * @param f File to validate. + * @param recordCount Expected count of records. + * @throws FileNotFoundException + * @throws IOException + */ + private void validate(File f, int recordCount) + throws FileNotFoundException, IOException { + WARCReader reader = WARCReaderFactory.get(f); + assertNotNull(reader); + List headers = null; + if (recordCount == -1) { + headers = reader.validate(); + } else { + headers = reader.validate(recordCount); + } + reader.close(); + + // Now, run through each of the records doing absolute get going from + // the end to start. Reopen the arc so no context between this test + // and the previous. 
+ + for (int i = headers.size() - 1; i >= 0; i--) { + reader = WARCReaderFactory.get(f); + ArchiveRecordHeader h = (ArchiveRecordHeader)headers.get(i); + ArchiveRecord r = reader.get(h.getOffset()); + String mimeType = r.getHeader().getMimetype(); + assertTrue("Record is bogus", + mimeType != null && mimeType.length() > 0); + reader.close(); + } + + assertTrue("Metadatas not equal", headers.size() == recordCount); + for (Iterator i = headers.iterator(); i.hasNext();) { + ArchiveRecordHeader r = (ArchiveRecordHeader)i.next(); + assertTrue("Record is empty", r.getLength() > 0); + } + } + + public void testWriteRecords() throws IOException { + final int recordCount = 2; + File f = writeRecords("writeRecords", false, DEFAULT_MAX_WARC_FILE_SIZE, + recordCount); + validate(f, recordCount + 1); // Header record. + } + + public void testRandomAccess() throws IOException { + final int recordCount = 3; + File f = writeRecords("randomAccess", true, DEFAULT_MAX_WARC_FILE_SIZE, + recordCount); + WARCReader reader = WARCReaderFactory.get(f); + // Get to second record. Get its offset for later use. + boolean readFirst = false; + String url = null; + long offset = -1; + long totalRecords = 0; + boolean readSecond = false; + for (final Iterator i = reader.iterator(); i.hasNext(); + totalRecords++) { + WARCRecord ar = (WARCRecord)i.next(); + if (!readFirst) { + readFirst = true; + continue; + } + if (!readSecond) { + url = ar.getHeader().getUrl(); + offset = ar.getHeader().getOffset(); + readSecond = true; + } + } + reader.close(); + + reader = WARCReaderFactory.get(f, offset); + ArchiveRecord ar = reader.get(); + assertEquals(ar.getHeader().getUrl(), url); + ar.close(); + reader.close(); + + // Get reader again. 
See how iterator works with offset + reader = WARCReaderFactory.get(f, offset); + int count = 0; + for (final Iterator i = reader.iterator(); i.hasNext(); i.next()) { + count++; + } + reader.close(); + assertEquals(totalRecords - 1, count); + } + + public void testWriteRecordCompressed() throws IOException { + final int recordCount = 2; + File arcFile = writeRecords("writeRecordCompressed", true, + DEFAULT_MAX_WARC_FILE_SIZE, recordCount); + validate(arcFile, recordCount + 1 /*Header record*/); + } + + protected WARCWriter createWARCWriter(String name, + boolean compress) { + File [] files = {getTmpDir()}; + return new WARCWriter(SERIAL_NO, + new WARCWriterPoolSettingsData( + name, + "${prefix}-"+SUFFIX, + DEFAULT_MAX_WARC_FILE_SIZE, + compress, + Arrays.asList(files), + null, + generator)); + } + + protected static ByteArrayOutputStream getBaos(String str) + throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + baos.write(str.getBytes()); + return baos; + } + + protected static void writeRecord(WARCWriter w, String url, + String mimetype, int len, ByteArrayOutputStream baos) + throws IOException { + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.resource); + recordInfo.setUrl(url); + recordInfo.setCreate14DigitDate(ArchiveUtils.get14DigitDate()); + recordInfo.setMimetype(mimetype); + recordInfo.setRecordId(w.generateRecordId(null)); + recordInfo.setExtraHeaders(null); + recordInfo.setContentStream(new ByteArrayInputStream(baos.toByteArray())); + recordInfo.setContentLength((long) len); + recordInfo.setEnforceLength(true); + + w.writeRecord(recordInfo); + } + + protected int iterateRecords(WARCReader r) + throws IOException { + int count = 0; + for (Iterator i = r.iterator(); i.hasNext();) { + ArchiveRecord ar = i.next(); + ar.close(); + if (count != 0) { + assertTrue("Unexpected URL " + ar.getHeader().getUrl(), + ar.getHeader().getUrl().equals(SOME_URL)); + } + count++; + } + return count; + } + + 
protected WARCWriter createWithOneRecord(String name, + boolean compressed) + throws IOException { + WARCWriter writer = createWARCWriter(name, compressed); + String content = getContent(); + writeRecord(writer, SOME_URL, "text/html", + content.length(), getBaos(content)); + return writer; + } + + public void testSpaceInURL() throws IOException { + long bytesWritten = holeyUrl("testSpaceInURL", false, " "); + assertEquals("Unexpected successful writing occurred",0,bytesWritten); + } + + public void testTabInURL() throws IOException { + long bytesWritten = holeyUrl("testTabInURL", false, "\t"); + assertEquals("Unexpected successful writing occurred",0,bytesWritten); + } + + protected long holeyUrl(String name, boolean compress, String urlInsert) + throws IOException { + WARCWriter writer = createWithOneRecord(name, compress); + // Add some bytes on the end to mess up the record. + long startPos = writer.getPosition(); + String content = getContent(); + ByteArrayOutputStream baos = getBaos(content); + writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html", + content.length(), baos); + long endPos = writer.getPosition(); + writer.close(); + return endPos-startPos; + } + + /** + * Write an arc file for other tests to use. + * @param arcdir Directory to write to. + * @param compress True if file should be compressed. + * @return ARC written. 
+ * @throws IOException + */ + public static File createWARCFile(File arcdir, boolean compress) + throws IOException { + File [] files = {arcdir}; + WARCWriter writer = + new WARCWriter(SERIAL_NO, + new WARCWriterPoolSettingsData( + "", + "test", + DEFAULT_MAX_WARC_FILE_SIZE, + compress, + Arrays.asList(files), + null, + new UUIDGenerator())); + String content = getContent(); + writeRecord(writer, SOME_URL, "text/html", content.length(), + getBaos(content)); + writer.close(); + return writer.getFile(); + } + +// public void testSpeed() throws IOException { +// ARCWriter writer = createArcWithOneRecord("speed", true); +// // Add a record with a length that is too long. +// String content = getContent(); +// final int count = 100000; +// logger.info("Starting speed write of " + count + " records."); +// for (int i = 0; i < count; i++) { +// writeRecord(writer, SOME_URL, "text/html", content.length(), +// getBaos(content)); +// } +// writer.close(); +// logger.info("Finished speed write test."); +// } + + public void testArcRecordOffsetReads() throws Exception { + // Get an ARC with one record. + WriterPoolMember w = + createWithOneRecord("testArcRecordInBufferStream", true); + w.close(); + // Get reader on said ARC. + WARCReader r = WARCReaderFactory.get(w.getFile()); + final Iterator i = r.iterator(); + // Skip first ARC meta record. + ArchiveRecord ar = i.next(); + i.hasNext(); + // Now we're at first and only record in ARC. + ar = (WARCRecord) i.next(); + // Now try getting some random set of bytes out of it + // at an odd offset (used to fail because we were + // doing bad math to find where in buffer to read). 
+ final byte[] buffer = new byte[17]; + final int maxRead = 4; + int totalRead = 0; + while (totalRead < maxRead) { + totalRead = totalRead + + ar.read(buffer, 13 + totalRead, maxRead - totalRead); + assertTrue(totalRead > 0); + } + } +} \ No newline at end of file diff --git a/src/test/java/org/archive/uid/UUIDGeneratorTest.java b/src/test/java/org/archive/uid/UUIDGeneratorTest.java new file mode 100644 index 00000000..79e98fb6 --- /dev/null +++ b/src/test/java/org/archive/uid/UUIDGeneratorTest.java @@ -0,0 +1,44 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.uid; + +import java.net.URI; +import java.net.URISyntaxException; +import java.util.HashMap; +import java.util.Map; + +import junit.framework.TestCase; + +/** + * @author stack + * @version $Revision$ $Date$ + */ +public class UUIDGeneratorTest extends TestCase { + public void testQualifyRecordID() throws URISyntaxException { + RecordIDGenerator g = new UUIDGenerator(); + URI uri = g.getRecordID(); + Map qualifiers = new HashMap(); + qualifiers.put("a", "b"); + URI nuURI = g.qualifyRecordID(uri, qualifiers); + assertNotSame(uri, nuURI); + qualifiers.put("c", "d"); + nuURI = g.qualifyRecordID(nuURI, qualifiers); + assertNotSame(uri, nuURI); + } +} diff --git a/src/test/java/org/archive/util/FileUtilsTest.java b/src/test/java/org/archive/util/FileUtilsTest.java new file mode 100644 index 00000000..19271435 --- /dev/null +++ b/src/test/java/org/archive/util/FileUtilsTest.java @@ -0,0 +1,271 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.util; + +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.math.LongRange; + + +/** + * FileUtils tests. 
+ * + * @contributor stack + * @contributor gojomo + * @version $Date$, $Revision$ + */ +public class FileUtilsTest extends TmpDirTestCase { + private String srcDirName = FileUtilsTest.class.getName() + ".srcdir"; + private File srcDirFile = null; + private String tgtDirName = FileUtilsTest.class.getName() + ".tgtdir"; + private File tgtDirFile = null; + + protected File zeroLengthLinesUnix; + protected File zeroLengthLinesWindows; + + protected File smallLinesUnix; + protected File smallLinesWindows; + protected File largeLinesUnix; + protected File largeLinesWindows; + protected File nakedLastLineUnix; + protected File nakedLastLineWindows; + + + protected void setUp() throws Exception { + super.setUp(); + this.srcDirFile = new File(getTmpDir(), srcDirName); + FileUtils.ensureWriteableDirectory(srcDirFile); + this.tgtDirFile = new File(getTmpDir(), tgtDirName); + FileUtils.ensureWriteableDirectory(tgtDirFile); + addFiles(); + + zeroLengthLinesUnix = setUpLinesFile("zeroLengthLinesUnix",0,0,400,IOUtils.LINE_SEPARATOR_UNIX); + zeroLengthLinesWindows = setUpLinesFile("zeroLengthLinesUnix",0,0,400,IOUtils.LINE_SEPARATOR_WINDOWS); + + smallLinesUnix = setUpLinesFile("smallLinesUnix", 0, 25, 400, IOUtils.LINE_SEPARATOR_UNIX); + smallLinesWindows = setUpLinesFile("smallLinesWindows", 0, 25, 400, IOUtils.LINE_SEPARATOR_WINDOWS); + largeLinesUnix = setUpLinesFile("largeLinesUnix", 128, 256, 5, IOUtils.LINE_SEPARATOR_UNIX); + largeLinesWindows = setUpLinesFile("largeLinesWindows", 128, 256, 4096, IOUtils.LINE_SEPARATOR_WINDOWS); + + nakedLastLineUnix = setUpLinesFile("nakedLastLineUnix", 0, 50, 401, IOUtils.LINE_SEPARATOR_UNIX); + org.apache.commons.io.FileUtils.writeStringToFile(nakedLastLineUnix,"a"); + nakedLastLineWindows = setUpLinesFile("nakedLastLineWindows", 0, 50, 401, IOUtils.LINE_SEPARATOR_WINDOWS); + org.apache.commons.io.FileUtils.writeStringToFile(nakedLastLineWindows,"a"); + } + + private void addFiles() throws IOException { + addFiles(3, this.getName()); + 
} + + private void addFiles(final int howMany, final String baseName) + throws IOException { + for (int i = 0; i < howMany; i++) { + File.createTempFile(baseName, null, this.srcDirFile); + } + } + + private File setUpLinesFile(String name, int minLineSize, int maxLineSize, int lineCount, String lineEnding) throws IOException { + List lines = new LinkedList(); + StringBuilder sb = new StringBuilder(maxLineSize); + for(int i = 0; i< lineSize; j++) { + sb.append("-"); + } + lines.add(sb.toString()); + } + File file = File.createTempFile(name, null); + org.apache.commons.io.FileUtils.writeLines(file, lines, lineEnding); + return file; + + } + + protected void tearDown() throws Exception { + super.tearDown(); + org.apache.commons.io.FileUtils.deleteQuietly(this.srcDirFile); + org.apache.commons.io.FileUtils.deleteQuietly(this.tgtDirFile); + org.apache.commons.io.FileUtils.deleteQuietly(zeroLengthLinesUnix); + org.apache.commons.io.FileUtils.deleteQuietly(zeroLengthLinesWindows); + org.apache.commons.io.FileUtils.deleteQuietly(smallLinesUnix); + org.apache.commons.io.FileUtils.deleteQuietly(smallLinesWindows); + org.apache.commons.io.FileUtils.deleteQuietly(largeLinesUnix); + org.apache.commons.io.FileUtils.deleteQuietly(largeLinesWindows); + org.apache.commons.io.FileUtils.deleteQuietly(nakedLastLineUnix); + org.apache.commons.io.FileUtils.deleteQuietly(nakedLastLineWindows); + + } + + public void testCopyFile() { + // Test exception copying nonexistent file. 
+ File [] srcFiles = this.srcDirFile.listFiles(); + srcFiles[0].delete(); + IOException e = null; + try { + FileUtils.copyFile(srcFiles[0], + new File(this.tgtDirFile, srcFiles[0].getName())); + } catch (IOException ioe) { + e = ioe; + } + assertNotNull("Didn't get expected IOE", e); + } + + public void testTailLinesZeroLengthUnix() throws IOException { + verifyTailLines(zeroLengthLinesUnix); + } + + public void testTailLinesZeroLengthWindows() throws IOException { + verifyTailLines(zeroLengthLinesWindows); + } + + public void testTailLinesSmallUnix() throws IOException { + verifyTailLines(smallLinesUnix); + } + + public void testTailLinesLargeUnix() throws IOException { + verifyTailLines(largeLinesUnix); + } + + public void testTailLinesSmallWindows() throws IOException { + verifyTailLines(smallLinesWindows); + } + + public void testTailLinesLargeWindows() throws IOException { + verifyTailLines(largeLinesWindows); + } + + public void testTailLinesNakedUnix() throws IOException { + verifyTailLines(nakedLastLineUnix); + } + + public void testTailLinesNakedWindows() throws IOException { + verifyTailLines(nakedLastLineWindows); + } + + @SuppressWarnings("unchecked") + private void verifyTailLines(File file) throws IOException { + List lines = org.apache.commons.io.FileUtils.readLines(file); + verifyTailLines(file, lines, 1, 80); + verifyTailLines(file, lines, 5, 80); + verifyTailLines(file, lines, 10, 80); + verifyTailLines(file, lines, 20, 80); + verifyTailLines(file, lines, 100, 80); + verifyTailLines(file, lines, 1, 1); + verifyTailLines(file, lines, 5, 1); + verifyTailLines(file, lines, 10, 1); + verifyTailLines(file, lines, 20, 1); + verifyTailLines(file, lines, 100, 1); + } + + + private void verifyTailLines(File file, List lines, int count, int estimate) throws IOException { + List testLines; + testLines = getTestTailLines(file,count,estimate); + assertEquals("line counts not equal:"+file.getName()+" "+count+" "+estimate,lines.size(),testLines.size()); + 
assertEquals("lines not equal: "+file.getName()+" "+count+" "+estimate,lines,testLines); + } + + private List getTestTailLines(File file, int count, int estimate) throws IOException { + long pos = -1; + List testLines = new LinkedList(); + do { + List returnedLines = new LinkedList(); + LongRange range = FileUtils.pagedLines(file,pos,-count,returnedLines,estimate); + Collections.reverse(returnedLines); + testLines.addAll(returnedLines); + pos = range.getMinimumLong()-1; + } while (pos>=0); + Collections.reverse(testLines); + return testLines; + } + + public void testHeadLinesZeroLengthUnix() throws IOException { + verifyHeadLines(zeroLengthLinesUnix); + } + + public void testHeadLinesZeroLengthWindows() throws IOException { + verifyHeadLines(zeroLengthLinesWindows); + } + + public void testHeadLinesSmallUnix() throws IOException { + verifyHeadLines(smallLinesUnix); + } + + public void testHeadLinesLargeUnix() throws IOException { + verifyHeadLines(largeLinesUnix); + } + + public void testHeadLinesSmallWindows() throws IOException { + verifyHeadLines(smallLinesWindows); + } + + public void testHeadLinesLargeWindows() throws IOException { + verifyHeadLines(largeLinesWindows); + } + + public void testHeadLinesNakedUnix() throws IOException { + verifyHeadLines(nakedLastLineUnix); + } + + public void testHeadLinesNakedWindows() throws IOException { + verifyHeadLines(nakedLastLineWindows); + } + + + @SuppressWarnings("unchecked") + private void verifyHeadLines(File file) throws IOException { + List lines = org.apache.commons.io.FileUtils.readLines(file); + verifyHeadLines(file, lines, 1, 80); + verifyHeadLines(file, lines, 5, 80); + verifyHeadLines(file, lines, 10, 80); + verifyHeadLines(file, lines, 20, 80); + verifyHeadLines(file, lines, 100, 80); + verifyHeadLines(file, lines, 1, 1); + verifyHeadLines(file, lines, 5, 1); + verifyHeadLines(file, lines, 10, 1); + verifyHeadLines(file, lines, 20, 1); + verifyHeadLines(file, lines, 100, 1); + } + + + private void 
verifyHeadLines(File file, List lines, int count, int estimate) throws IOException { + List testLines; + testLines = getTestHeadLines(file,count,estimate); + assertEquals("line counts not equal:"+file.getName()+" "+count+" "+estimate,lines.size(),testLines.size()); + assertEquals("lines not equal: "+file.getName()+" "+count+" "+estimate,lines,testLines); + } + + private List getTestHeadLines(File file, int count, int estimate) throws IOException { + long pos = 0; + List testLines = new LinkedList(); + do { + LongRange range = FileUtils.pagedLines(file,pos,count,testLines,estimate); + pos = range.getMaximumLong(); + } while (pos m = am.asMap(); + logger.fine(m.toString()); + } + + public void testEmptyRecord() throws Exception { + byte [] b = ANVLRecord.EMPTY_ANVL_RECORD.getUTF8Bytes(); + assertEquals(b.length, 2); + assertEquals(b[0], '\r'); + assertEquals(b[1], '\n'); + } + + public void testFolding() throws Exception { + ANVLRecord am = new ANVLRecord(); + Exception e = null; + try { + am.addLabel("Label with \n in it"); + } catch (IllegalArgumentException iae) { + e = iae; + } + assertTrue(e != null && e instanceof IllegalArgumentException); + am.addLabelValue("label", "value with \n in it"); + } + + public void testParse() throws UnsupportedEncodingException, IOException { + String record = " a: b\r\n#c#\r\nc:d\r\n \t\t\r\t\n\te" + + "\r\nx:\r\n # z\r\n\r\n"; + ANVLRecord r = ANVLRecord.load(new ByteArrayInputStream( + record.getBytes("ISO-8859-1"))); + logger.fine(r.toString()); + assertEquals(r.get(0).toString(), "a: b"); + record = " a: b\r\n\r\nsdfsdsdfds"; + r = ANVLRecord.load(new ByteArrayInputStream( + record.getBytes("ISO-8859-1"))); + logger.fine(r.toString()); + record = "x:\r\n # z\r\ny:\r\n\r\n"; + r = ANVLRecord.load(new ByteArrayInputStream( + record.getBytes("ISO-8859-1"))); + logger.fine(r.toString()); + assertEquals(r.get(0).toString(), "x:"); + } + + public void testExampleParse() + throws UnsupportedEncodingException, IOException { + final 
String sample = "entry:\t\t\r\n# first ###draft\r\n" + + "who:\tGilbert, W.S. | Sullivan, Arthur\r\n" + + "what:\tThe Yeoman of\r\n" + + "\t\tthe Guard\r\n" + + "when/created:\t 1888\r\n\r\n"; + ANVLRecord r = ANVLRecord.load(new ByteArrayInputStream( + sample.getBytes("ISO-8859-1"))); + logger.fine(r.toString()); + } + + public void testPoundLabel() + throws UnsupportedEncodingException, IOException { + final String sample = "ent#ry:\t\t\r\n# first ###draft\r\n" + + "who:\tGilbert, W.S. | Sullivan, Arthur\r\n" + + "what:\tThe Yeoman of\r\n" + + "\t\tthe Guard\r\n" + + "when/created:\t 1888\r\n\r\n"; + ANVLRecord r = ANVLRecord.load(sample); + logger.fine(r.toString()); + } + + public void testNewlineLabel() + throws UnsupportedEncodingException, IOException { + final String sample = "ent\nry:\t\t\r\n# first ###draft\r\n" + + "who:\tGilbert, W.S. | Sullivan, Arthur\r\n" + + "what:\tThe Yeoman of\r\n" + + "\t\tthe Guard\r\n" + + "when/created:\t 1888\r\n\r\n"; + IllegalArgumentException iae = null; + try { + ANVLRecord.load(sample); + } catch(IllegalArgumentException e) { + iae = e; + } + assertTrue(iae != null); + } +} From b04f5d82604245461b6a802f1962d86e3d899e98 Mon Sep 17 00:00:00 2001 From: Lauren Ko Date: Thu, 9 Mar 2017 11:32:03 -0600 Subject: [PATCH 052/240] Updating CHANGES.md --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index fee29e16..767881ec 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,6 @@ 1.1.8 ----- +* [Move unit tests over from heritrix3 to webarchive-commons](https://github.com/iipc/webarchive-commons/issues/25) * [Strip empty port via URLParser](https://github.com/iipc/webarchive-commons/pull/69/) * [Use CharsetDetector to guess encoding of HTML documents](https://github.com/iipc/webarchive-commons/pull/68/) * [Fix last header was lost if LF LF](https://github.com/iipc/webarchive-commons/pull/65/) From b655796770eb967c931d656b1c80d4967f91e7fc Mon Sep 17 00:00:00 2001 From: Lauren Ko Date: Tue, 21 
Mar 2017 14:20:54 -0500 Subject: [PATCH 053/240] Updating change log. --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 767881ec..ccdc1ce7 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,6 @@ 1.1.8 ----- +* [Improve HTML link extraction](https://github.com/iipc/webarchive-commons/pull/72) * [Move unit tests over from heritrix3 to webarchive-commons](https://github.com/iipc/webarchive-commons/issues/25) * [Strip empty port via URLParser](https://github.com/iipc/webarchive-commons/pull/69/) * [Use CharsetDetector to guess encoding of HTML documents](https://github.com/iipc/webarchive-commons/pull/68/) From aee6ff55bfcaa5a9e15092f8c3b1e40ec9faaf87 Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Tue, 2 May 2017 12:25:28 +0200 Subject: [PATCH 054/240] [maven-release-plugin] prepare release webarchive-commons-1.1.8 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 24780063..63909b90 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.8-SNAPSHOT + 1.1.8 jar webarchive-commons From dfe1f62e416f6a881fe15a2544449fff44dd1e51 Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Tue, 2 May 2017 12:25:35 +0200 Subject: [PATCH 055/240] [maven-release-plugin] prepare for next development iteration --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 63909b90..23953c06 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.8 + 1.1.9-SNAPSHOT jar webarchive-commons From cf34a3e13c09cfa4a1412492cfcf3503df698931 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 28 Apr 2017 22:41:56 +0200 Subject: [PATCH 056/240] Do not add value of preceding HTTP header field if there is no value (or only white space) --- .../archive/format/http/HttpHeaderParser.java | 4 ++-- .../format/http/HttpResponseParserTest.java | 24 +++++++++++++++++++ 2 
files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/format/http/HttpHeaderParser.java b/src/main/java/org/archive/format/http/HttpHeaderParser.java index d63ec405..bee3c28b 100755 --- a/src/main/java/org/archive/format/http/HttpHeaderParser.java +++ b/src/main/java/org/archive/format/http/HttpHeaderParser.java @@ -301,8 +301,9 @@ public ParseState handleByte(byte b, HttpHeaderParser parser) throws HttpParseEx if(isLWSP(b)) { return parser.postColonState; } + // reset previous value also in case the header value is empty + parser.setValueStartIdx(); if(b == CR) { - // TODO: THINK more... parser.valuePreCRState = parser.postColonState; return parser.valuePostCRState; } @@ -310,7 +311,6 @@ public ParseState handleByte(byte b, HttpHeaderParser parser) throws HttpParseEx // TODO: this is lax, is LFLF an OK terminator? return parser.lineStartState; } - parser.setValueStartIdx(); parser.addValueByte(b); return parser.valueState; } diff --git a/src/test/java/org/archive/format/http/HttpResponseParserTest.java b/src/test/java/org/archive/format/http/HttpResponseParserTest.java index c0d13230..ea076a69 100644 --- a/src/test/java/org/archive/format/http/HttpResponseParserTest.java +++ b/src/test/java/org/archive/format/http/HttpResponseParserTest.java @@ -57,4 +57,28 @@ public void testParseWithLf() throws IOException { } + public void testParseEmptyHeaderField() throws IOException { + + HttpResponseParser parser = new HttpResponseParser(); + String message = "200 OK\r\nContent-Type: text/plain\r\nServer: \r\n\r\nHi there"; + try { + HttpResponse response = + parser.parse(new ByteArrayInputStream(message.getBytes(IAUtils.UTF8))); + assertNotNull(response); + HttpHeaders headers = response.getHeaders(); + assertNotNull(headers); + assertEquals(2, headers.size()); + HttpHeader header = headers.get(1); + assertEquals("Server",header.getName()); + System.err.println(header.getValue()); + assertFalse("text/plain".equals(header.getValue())); + 
TestUtils.assertStreamEquals(response, "Hi there".getBytes(IAUtils.UTF8)); + + } catch (HttpParseException e) { + e.printStackTrace(); + fail(); + } + + } + } From bd08143577ea35cb48047a08b2bb67e806992cc2 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 29 Sep 2016 11:44:18 +0200 Subject: [PATCH 057/240] Extract also `property` attributes of HTML meta elements, this fixes #67 --- .../java/org/archive/resource/html/ExtractingParseObserver.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index 826851e0..52989455 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -406,7 +406,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs private static class MetaTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { - ArrayList l = getAttrList(node,"name","rel","content","http-equiv"); + ArrayList l = getAttrList(node,"name","rel","content","http-equiv","property"); if(l != null) { data.addMeta(l); } From 4077670acca3f0d2958d926692cdb3a6b29428ca Mon Sep 17 00:00:00 2001 From: Lauren Ko Date: Tue, 2 May 2017 15:15:06 -0500 Subject: [PATCH 058/240] Fix HTTP-Response-Metadata for wget WARCs. 
Changes came from https://github.com/commoncrawl/ia-web-commons/commit/58e85a60d75707da55ed499f836e57d49347484a --- .../org/archive/extract/ExtractingResourceFactoryMapper.java | 5 ++++- src/main/java/org/archive/format/warc/WARCConstants.java | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java index ad10be40..0afe16fb 100644 --- a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java +++ b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java @@ -153,7 +153,10 @@ private boolean isWARCInfoResource(MetaData envelope) { private boolean isHTTPResponseWARCResource(MetaData envelope) { return childFieldEquals(envelope,WARC_HEADER_METADATA, WARCConstants.CONTENT_TYPE, - WARCConstants.HTTP_RESPONSE_MIMETYPE); + WARCConstants.HTTP_RESPONSE_MIMETYPE) + || childFieldEquals(envelope,WARC_HEADER_METADATA, + WARCConstants.CONTENT_TYPE, + WARCConstants.HTTP_RESPONSE_MIMETYPE_NS); } private boolean isWARCJSONResource(MetaData envelope) { return childFieldEquals(envelope,WARC_HEADER_METADATA, diff --git a/src/main/java/org/archive/format/warc/WARCConstants.java b/src/main/java/org/archive/format/warc/WARCConstants.java index 93a81f96..504dc380 100644 --- a/src/main/java/org/archive/format/warc/WARCConstants.java +++ b/src/main/java/org/archive/format/warc/WARCConstants.java @@ -209,7 +209,9 @@ enum WARCRecordType { "application/http; msgtype=request"; public static final String HTTP_RESPONSE_MIMETYPE = "application/http; msgtype=response"; - + public static final String HTTP_RESPONSE_MIMETYPE_NS = + "application/http;msgtype=response"; // wget does this + public static final String FTP_CONTROL_CONVERSATION_MIMETYPE = "text/x-ftp-control-conversation"; From 3bba7e489b7d946eea83344e2150faebe0b35ed2 Mon Sep 17 00:00:00 2001 From: Lauren Ko Date: Tue, 2 May 2017 15:41:23 -0500 Subject: 
[PATCH 059/240] Update with fixes for 1.1.9 --- CHANGES.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index ccdc1ce7..1ba5c1de 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,9 @@ +1.1.9 +----- +* [Extract `property` attributes of HTML meta elements](https://github.com/iipc/webarchive-commons/pull/75) +* [Do not add value of preceding HTTP header field if there is no value](https://github.com/iipc/webarchive-commons/pull/74) +* [Fix WAT records corresponding to response records of Wget generated WARCs](https://github.com/iipc/webarchive-commons/pull/74) + 1.1.8 ----- * [Improve HTML link extraction](https://github.com/iipc/webarchive-commons/pull/72) From 4101f7e39cbdcc508a936faf8b519e68258b9639 Mon Sep 17 00:00:00 2001 From: Naomi Dushay Date: Tue, 8 Aug 2017 16:08:43 -0700 Subject: [PATCH 060/240] use commons-collections v3.2.2 to avoid v3.2.1 vulnerability --- pom.xml | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 23953c06..8373cdad 100644 --- a/pom.xml +++ b/pom.xml @@ -72,7 +72,7 @@ guava 17.0 - + org.json json @@ -89,12 +89,12 @@ juniversalchardet 1.0.3 - + commons-httpclient commons-httpclient 3.1 - + org.apache.hadoop @@ -128,12 +128,12 @@ tomcat jasper-compiler - + hsqldb hsqldb - - + + @@ -160,7 +160,7 @@ libidn 1.15 - + it.unimi.dsi dsiutils 2.0.12 @@ -170,13 +170,26 @@ ch.qos.logback logback-classic + + + commons-collections + commons-collections + + + + + commons-collections + commons-collections + 3.2.2 + + org.apache.httpcomponents httpcore 4.3 - + joda-time joda-time From 988bec707c27a01333becfc3bd502af4441ea1e1 Mon Sep 17 00:00:00 2001 From: Lauren Ko Date: Wed, 9 Aug 2017 10:57:28 -0500 Subject: [PATCH 061/240] Update CHANGES.md for PR 77 --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 1ba5c1de..dcb598d9 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,6 @@ 1.1.9 ----- +* [Use 
commons-collections v3.2.2 to avoid v3.2.1 vulnerability](https://github.com/iipc/webarchive-commons/pull/77) * [Extract `property` attributes of HTML meta elements](https://github.com/iipc/webarchive-commons/pull/75) * [Do not add value of preceding HTTP header field if there is no value](https://github.com/iipc/webarchive-commons/pull/74) * [Fix WAT records corresponding to response records of Wget generated WARCs](https://github.com/iipc/webarchive-commons/pull/74) From 2e8cdea3d245c11e1ea3a2a6153c0038479aef12 Mon Sep 17 00:00:00 2001 From: nruest Date: Tue, 7 May 2019 13:23:28 -0400 Subject: [PATCH 062/240] [maven-release-plugin] prepare release webarchive-commons-1.1.9 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 8373cdad..833f42c3 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.9-SNAPSHOT + 1.1.9 jar webarchive-commons From da029db2ba89205b93a5291ed18b9c69155271bb Mon Sep 17 00:00:00 2001 From: nruest Date: Tue, 7 May 2019 13:23:34 -0400 Subject: [PATCH 063/240] [maven-release-plugin] prepare for next development iteration --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 833f42c3..1cbeb99a 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.9 + 1.1.10-SNAPSHOT jar webarchive-commons From 723b18a35f8be786cb073282b5ea88b5d8c643ce Mon Sep 17 00:00:00 2001 From: nruest Date: Tue, 7 May 2019 13:56:18 -0400 Subject: [PATCH 064/240] Update TravisCI config; resolves #82. - Test Oracle Java 8 - Test OpenJDK Java 8 - Use trusty - Require sudo for OpenJDK7 - Remove Oracle Java 7 (it's gone!) 
- Remove mvn site from the build process since there is no javadoc site (at least that I can tell) --- .travis.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 0dfd3f7f..54daf83b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,13 @@ +dist: trusty language: java +# sudo required for OpenJDK7 support per: +# https://github.com/travis-ci/travis-ci/issues/7884#issuecomment-309689557 +sudo: required jdk: - - oraclejdk7 + - openjdk7 + - oraclejdk8 + - openjdk8 before_install: - "git clone https://github.com/iipc/travis.git target/travis" @@ -11,8 +17,8 @@ before_script: - "export MAVEN_OPTS=-Xmx512m" - "ulimit -u 2048" -script: - - "target/travis/deploy-if.sh" +script: + - mvn install -B -V # whitelist in the master branch only branches: @@ -23,4 +29,3 @@ env: global: - secure: "qDKjVdoe4Qcz4WfXiQydU7tyl51T62FUJrjqu4FUPBcgeQhFQiggwhpaE6xCOzOpxbsuBi2R1c8gMQf5esE5iDL5jZMu+kz++dYbuzMTd13ttvZWMW5wRPH0H8iHk609FP/RDtVKKBr7WO0JvvIAZEhWNHZrLXBrrKgdTey171g=" - secure: "FXGBKJNP9X7ePJfS4eYTZtoFo4RT1sxor34XxncSJr7uV6ggtZb4B4WNd16IlLcDk6E32sx8YoWdltaOGwQ5Vg/kux5Ko/wKZCoccS018Ln1bRT86dD1KoPY34rGoNJVQxe7J/1MPqpBKwmi2XCKfzpsEh3W7bbIqg8w9MEOOZA=" - From 79aed910b44510294367a4acf4f3e6376b1c62c0 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 23 Aug 2017 17:04:52 +0200 Subject: [PATCH 065/240] ExtractingParseObserver: get links from onClick attributes - extract links from JavaScript code snippets in onClick attributes of INPUT and DIV elements --- .../html/ExtractingParseObserver.java | 40 +++++++++++++++++- .../html/ExtractingParseObserverTest.java | 10 +++++ .../resource/html/link-extraction-test.warc | 42 +++++++++++++++++++ 3 files changed, 91 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index 52989455..e4fa83c7 100644 --- 
a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -39,6 +39,15 @@ public class ExtractingParseObserver implements ParseObserver { protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString); + protected static String jsOnClickUrl1PatString = + "(?i)^(?:javascript:)?(?:(?:window|top|document|self|parent)\\.)?location(?:\\.href)?\\s*=\\s*('|')([^'\"]{3,256})\\1$"; + protected static String jsOnClickUrl2PatString = + "(?i)^(?:javascript:)?(?:window|parent)\\.open\\((['\"]|')([^\"']{3,256}?)\\1[,)]"; + protected static Pattern[] jsOnClickUrlPatterns = { + Pattern.compile(jsOnClickUrl1PatString), + Pattern.compile(jsOnClickUrl2PatString) + }; + private final static int MAX_TEXT_LEN = 100; private static final String PATH = "path"; @@ -51,6 +60,7 @@ public class ExtractingParseObserver implements ParseObserver { extractors.put("APPLET", new AppletTagExtractor()); extractors.put("AREA", new AreaTagExtractor()); extractors.put("BASE", new BaseTagExtractor()); + extractors.put("DIV", new DivTagExtractor()); extractors.put("EMBED", new EmbedTagExtractor()); extractors.put("FORM", new FormTagExtractor()); extractors.put("FRAME", new FrameTagExtractor()); @@ -268,7 +278,20 @@ private static void addHrefWithAttrs(HTMLMetaData data, TagNode node, if(l != null) { data.addHref(l); } - } + } + + private static void addHrefsOnclick(HTMLMetaData data, TagNode node) { + String onclick = node.getAttribute("onclick"); + if (onclick != null) { + String path = makePath(node.getTagName(), "onclick"); + for (Pattern pattern : jsOnClickUrlPatterns) { + String url = patternJSExtract(pattern, onclick); + if (url != null) { + data.addHref(PATH, path, "url", url); + } + } + } + } private interface TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs); @@ -330,6 +353,12 @@ public void extract(HTMLMetaData data, TagNode node, 
ExtractingParseObserver obs } } + private static class DivTagExtractor implements TagExtractor { + public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { + addHrefsOnclick(data,node); + } + } + private static class EmbedTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { addBasicHrefs(data,node,"src"); @@ -386,6 +415,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs private static class InputTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { addBasicHrefs(data,node,"src","formaction"); + addHrefsOnclick(data,node); } } @@ -450,4 +480,12 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten } } } + + private static String patternJSExtract(Pattern pattern, String content) { + Matcher m = pattern.matcher(content); + if (m.find()) { + return m.group(2); + } + return null; + } } diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index 8f690a06..4828ad64 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -263,6 +263,16 @@ public void testLinkExtraction() throws ResourceParseException, IOException { {"http://www.your-domain.com/your-page.html", "DIV@/data-href"} }; checkLinks(extractor.getNext(), fbSocialLinks); + String[][] onClickLinks = { + {"webpage.html", "DIV@/onclick"}, + {"index.html", "INPUT@/onclick"}, + {"http://www.x.com/", "INPUT@/onclick"}, + {"button-child.php", "INPUT@/onclick"}, + {"http://example.com/", "INPUT@/onclick"}, + {"http://example.com/location/href/1.html", "INPUT@/onclick"}, + {"http://example.com/location/href/2.html", "INPUT@/onclick"} + }; + checkLinks(extractor.getNext(), 
onClickLinks); } } diff --git a/src/test/resources/org/archive/resource/html/link-extraction-test.warc b/src/test/resources/org/archive/resource/html/link-extraction-test.warc index ab0e54c8..1a30598e 100644 --- a/src/test/resources/org/archive/resource/html/link-extraction-test.warc +++ b/src/test/resources/org/archive/resource/html/link-extraction-test.warc @@ -318,3 +318,45 @@ Content-Type: text/html +WARC/1.0 +WARC-Type: response +WARC-Date: 2017-08-23T13:54:59Z +Content-Type: application/http;msgtype=response +Content-Length: 1279 + +HTTP/1.1 200 OK +Date: Wed, 23 Aug 2017 13:54:59 GMT +Server: Apache/2.4.18 (Ubuntu) +Last-Modified: Wed, 23 Aug 2017 13:54:03 GMT +ETag: "3ca-5576c0b718ab3" +Accept-Ranges: bytes +Content-Length: 971 +Vary: Accept-Encoding +Keep-Alive: timeout=5, max=100 +Connection: Keep-Alive +Content-Type: text/html + + + +Test Extraction of URLs from INPUT onClick Attributes + + + + +

Click to load webpage

+ + + + + + + + From 26b1e7af27abec102ab36faf6a786dfedf9436fd Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 23 Aug 2017 14:48:05 +0200 Subject: [PATCH 066/240] ExtractingParseObserver: extract rel, hreflang and type attributes - add "rel" attribute to A and AREA links - add attributes "hreflang" and "type" (MIME type) to A@/href links --- .../html/ExtractingParseObserver.java | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index 52989455..a487fd34 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -284,7 +284,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs l.add(makePath("A","href")); l.add("url"); l.add(url); - for(String a : new String[] {"target","alt","title"}) { + for(String a : new String[] {"target","alt","title","rel","hreflang","type"}) { String v = node.getAttribute(a); if(v != null) { l.add(a); @@ -311,7 +311,22 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs private static class AreaTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { - addBasicHrefs(data,node,"href"); + String url = node.getAttribute("href"); + if(url != null) { + ArrayList l = new ArrayList(); + l.add(PATH); + l.add(makePath("AREA","href")); + l.add("url"); + l.add(url); + for(String a : new String[] {"rel"}) { + String v = node.getAttribute(a); + if(v != null) { + l.add(a); + l.add(v); + } + } + data.addHref(l); + } } } From a2cc42cac2777d06ab40e09811cdc883773775b9 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 11 Jun 2020 14:24:03 +0200 Subject: [PATCH 067/240] WAT extractor: do not fail on missing WARC-Filename in warcinfo record, 
fixes #88 - do not throw IOException if there is no WARC-Filename in warcinfo record - write metadata record (corresponding to warcinfo) without WARC-Target-URI --- src/main/java/org/archive/extract/WATExtractorOutput.java | 2 +- src/main/java/org/archive/format/warc/WARCRecordWriter.java | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index 3bcfa924..4b5f72ed 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -151,7 +151,7 @@ private void writeWARC(OutputStream recOut, MetaData md) throws IOException { String warcType = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Type"); String targetURI; if(warcType.equals("warcinfo")) { - targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Filename"); + targetURI = JSONUtils.extractSingle(md, "Envelope.WARC-Header-Metadata.WARC-Filename"); } else { targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI"); } diff --git a/src/main/java/org/archive/format/warc/WARCRecordWriter.java b/src/main/java/org/archive/format/warc/WARCRecordWriter.java index 0aab83b7..3278b289 100644 --- a/src/main/java/org/archive/format/warc/WARCRecordWriter.java +++ b/src/main/java/org/archive/format/warc/WARCRecordWriter.java @@ -88,7 +88,10 @@ public void writeJSONMetadataRecord( OutputStream out, { HttpHeaders headers = new HttpHeaders(); headers.add(HEADER_KEY_TYPE, WARCRecordType.metadata.name()); - headers.add(HEADER_KEY_URI, targetURI); + if (targetURI != null) { + // WARC-Target-URI is optional in metadata records + headers.add(HEADER_KEY_URI, targetURI); + } headers.add(HEADER_KEY_DATE, DateUtils.getLog14Date(originalDate)); headers.add(HEADER_KEY_ID, makeRecordId()); headers.add(HEADER_KEY_REFERS_TO, origRecordId); From 04e10397b9137a36812c17276826bc60d1a37ede Mon Sep 17 00:00:00 
2001 From: Sebastian Nagel Date: Mon, 15 Jun 2020 13:29:25 +0200 Subject: [PATCH 068/240] Update change log to include #85, #86 and #89 --- CHANGES.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index dcb598d9..bf985ada 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,10 @@ +1.1.10 +------ +* [WAT extractor: do not fail on missing WARC-Filename in warcinfo record](https://github.com/iipc/webarchive-commons/pull/89) +* [ExtractingParseObserver: extract rel, hreflang and type attributes](https://github.com/iipc/webarchive-commons/pull/86) +* [ExtractingParseObserver: extract links from onClick attributes](https://github.com/iipc/webarchive-commons/pull/85) +* [Update TravisCI config](https://github.com/iipc/webarchive-commons/pull/83) + 1.1.9 ----- * [Use commons-collections v3.2.2 to avoid v3.2.1 vulnerability](https://github.com/iipc/webarchive-commons/pull/77) From 9041ff4e96f6554658742affe490223dc0241d06 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 13 Oct 2020 01:28:48 +0000 Subject: [PATCH 069/240] Bump junit from 3.8.1 to 4.13.1 Bumps [junit](https://github.com/junit-team/junit4) from 3.8.1 to 4.13.1. 
- [Release notes](https://github.com/junit-team/junit4/releases) - [Changelog](https://github.com/junit-team/junit4/blob/main/doc/ReleaseNotes4.13.1.md) - [Commits](https://github.com/junit-team/junit4/commits/r4.13.1) Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 1cbeb99a..5ca7e1a3 100644 --- a/pom.xml +++ b/pom.xml @@ -64,7 +64,7 @@ junit junit - 3.8.1 + 4.13.1 From c2530d77b73838c31f4e83f2be941ec61032ebb2 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 16 Mar 2021 11:58:11 +0100 Subject: [PATCH 070/240] Fix InterruptibleCharSequenceTest (testInterruptibility) to run on JDK 11 - if thread running the regexp matching is already finished after the initial/current sleeping time, rerun the test again with a shorter sleeping time until the expected RuntimeException is hit --- .../util/InterruptibleCharSequenceTest.java | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java b/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java index a3a5f180..8b5c5d1b 100644 --- a/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java +++ b/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java @@ -107,14 +107,24 @@ public void testNoninterruptible() throws InterruptedException { } public void testInterruptibility() throws InterruptedException { - BlockingQueue