+
+
From 6aa43f83a2cbc2acd0feb7f2c81d66f4ef1b13c5 Mon Sep 17 00:00:00 2001
From: Mohamed Elsayed
Date: Thu, 2 Mar 2017 15:28:16 +0200
Subject: [PATCH 029/216] Fix #25: move missing unit tests over from Heritrix3
---
.../archive/io/ArchiveReaderFactoryTest.java | 94 +++
.../io/BufferedSeekInputStreamTest.java | 67 ++
.../archive/io/HeaderedArchiveRecordTest.java | 209 ++++++
.../archive/io/RecordingInputStreamTest.java | 132 ++++
.../archive/io/ReplayCharSequenceTest.java | 391 ++++++++++
.../io/RepositionableInputStreamTest.java | 70 ++
.../org/archive/io/arc/ARCWriterPoolTest.java | 122 +++
.../org/archive/io/arc/ARCWriterTest.java | 699 ++++++++++++++++++
.../org/archive/io/warc/WARCWriterTest.java | 512 +++++++++++++
.../org/archive/uid/UUIDGeneratorTest.java | 44 ++
.../java/org/archive/util/FileUtilsTest.java | 271 +++++++
.../org/archive/util/MimetypeUtilsTest.java | 63 ++
.../org/archive/util/PropertyUtilsTest.java | 45 ++
.../org/archive/util/anvl/ANVLRecordTest.java | 128 ++++
14 files changed, 2847 insertions(+)
create mode 100644 src/test/java/org/archive/io/ArchiveReaderFactoryTest.java
create mode 100644 src/test/java/org/archive/io/BufferedSeekInputStreamTest.java
create mode 100644 src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
create mode 100644 src/test/java/org/archive/io/RecordingInputStreamTest.java
create mode 100644 src/test/java/org/archive/io/ReplayCharSequenceTest.java
create mode 100644 src/test/java/org/archive/io/RepositionableInputStreamTest.java
create mode 100644 src/test/java/org/archive/io/arc/ARCWriterPoolTest.java
create mode 100644 src/test/java/org/archive/io/arc/ARCWriterTest.java
create mode 100644 src/test/java/org/archive/io/warc/WARCWriterTest.java
create mode 100644 src/test/java/org/archive/uid/UUIDGeneratorTest.java
create mode 100644 src/test/java/org/archive/util/FileUtilsTest.java
create mode 100644 src/test/java/org/archive/util/MimetypeUtilsTest.java
create mode 100644 src/test/java/org/archive/util/PropertyUtilsTest.java
create mode 100644 src/test/java/org/archive/util/anvl/ANVLRecordTest.java
diff --git a/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java b/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java
new file mode 100644
index 00000000..2313868c
--- /dev/null
+++ b/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java
@@ -0,0 +1,94 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Iterator;
+
+import org.apache.commons.lang.StringUtils;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.arc.ARCWriterTest;
+import org.archive.util.TmpDirTestCase;
+
+public class ArchiveReaderFactoryTest extends TmpDirTestCase {
+    /**
+     * Test local file as URL
+     * (a {@code file:} URL built from the ARC's absolute path).
+     *
+     * @throws IOException if the ARC cannot be written or read back
+     */
+    public void testGetFileURL() throws IOException {
+        File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
+        URL arcUrl = new URL("file:////" + arc.getAbsolutePath());
+        assertEveryRecordHasMimetype(ArchiveReaderFactory.get(arcUrl));
+    }
+
+    /**
+     * Test local file as File
+     *
+     * @throws IOException if the ARC cannot be written or read back
+     */
+    public void testGetFile() throws IOException {
+        File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
+        assertEveryRecordHasMimetype(
+                ArchiveReaderFactory.get(arc.getAbsoluteFile()));
+    }
+
+    /**
+     * Test local file as String path
+     *
+     * @throws IOException if the ARC cannot be written or read back
+     */
+    public void testGetPath() throws IOException {
+        File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
+        String arcPath = arc.getAbsoluteFile().getAbsolutePath();
+        assertEveryRecordHasMimetype(ArchiveReaderFactory.get(arcPath));
+    }
+
+    /**
+     * Shared verification for the three lookup flavours: walks every record
+     * the reader yields and asserts that its header reports a non-blank
+     * mimetype. The reader is closed when the walk finishes or fails, so no
+     * test leaves its reader open.
+     *
+     * @param reader reader returned by one of the
+     *        {@code ArchiveReaderFactory.get} overloads
+     * @throws IOException if closing the reader fails
+     */
+    private void assertEveryRecordHasMimetype(ArchiveReader reader)
+            throws IOException {
+        try {
+            // Raw Iterator plus cast, exactly as each test did inline before.
+            for (Iterator i = reader.iterator(); i.hasNext();) {
+                ArchiveRecord r = (ArchiveRecord) i.next();
+                assertTrue("mime unread",
+                        StringUtils.isNotBlank(r.getHeader().getMimetype()));
+            }
+        } finally {
+            // Same null guard the inline versions had before close().
+            if (reader != null) {
+                reader.close();
+            }
+        }
+    }
+}
diff --git a/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java b/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java
new file mode 100644
index 00000000..270e45e0
--- /dev/null
+++ b/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java
@@ -0,0 +1,67 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.util.Random;
+
+import junit.framework.TestCase;
+
+
+/**
+ * Unit test for BufferedSeekInputStream. The tests do some random
+ * repositioning in the stream to make sure the buffer is always valid.
+ *
+ * @author pjack
+ */
+public class BufferedSeekInputStreamTest extends TestCase {
+
+    /** Fixture bytes, built once from ASCII text by makeTestData(). */
+    private static final byte[] TEST_DATA = makeTestData();
+
+    /** Reads the stream sequentially, then checks the byte found at 1000 random offsets; failures name the offset. */
+    public void testPosition() throws Exception {
+        Random random = new Random();
+        ArraySeekInputStream asis = new ArraySeekInputStream(TEST_DATA);
+        BufferedSeekInputStream bsis = new BufferedSeekInputStream(asis, 11);
+        for (int i = 0; i < TEST_DATA.length; i++) {
+            byte b = (byte)bsis.read();
+            assertEquals("sequential read at offset " + i, TEST_DATA[i], b);
+        }
+        for (int i = 0; i < 1000; i++) {
+            int index = random.nextInt(TEST_DATA.length);
+            bsis.position(index);
+            char expected = (char)((int)TEST_DATA[index] & 0xFF);
+            char read = (char)(bsis.read() & 0xFF);
+            assertEquals("read after position(" + index + ")", expected, read);
+        }
+    }
+
+    /** Narrows each character of the (ASCII-only) fixture text to one byte. */
+    private static byte[] makeTestData() {
+        String s = "If the dull substance of my flesh were thought\n"
+            + "Injurious distance could not stop my way\n"
+            + "For then, despite of space, I would be brought\n"
+            + "From limits far remote where thou dost stay.\n";
+        byte[] r = new byte[s.length()];
+        for (int i = 0; i < r.length; i++) {
+            r[i] = (byte)s.charAt(i);
+        }
+        return r;
+    }
+}
diff --git a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
new file mode 100644
index 00000000..9f7e2a15
--- /dev/null
+++ b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
@@ -0,0 +1,209 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+import org.apache.commons.httpclient.Header;
+import org.archive.io.arc.ARCRecord;
+import org.archive.io.warc.WARCRecord;
+
+public class HeaderedArchiveRecordTest extends TestCase {
+ private static final String HTTPHEADER = "HTTP/1.1 200 OK\r\n" // status line, followed by four header fields
+ + "Last-Modified: Sun, 28 Aug 2005 14:10:55 GMT\r\n"
+ + "Content-Length: 108\r\n" + "Connection: close\r\n"
+ + "Content-Type: text/html\r\n" + "\r\n"; // the empty line terminates the HTTP header section
+ private static final String BODY = "\r\n" + " \r\n" // payload appended after HTTPHEADER in the fixture records
+ + " Neue Seite 1\r\n" + " \r\n"
+ + " \r\n" + " \r\n" + "";
+
+ /**
+  * Parses a WARC/0.12 response record whose payload is HTTPHEADER + BODY and
+  * checks the body bytes, the parsed HTTP headers and the target URI.
+  *
+  * @throws IOException if the record cannot be read
+  */
+ public void testParseHttpHeadersInWARC() throws IOException {
+     final String url = "http://foo.maths.uq.edu.au/index.html";
+     // final String warcHeader = "WARC/0.10 000000000486 response " +
+     // url + " 20070315152520 " +
+     // "urn:uuid:d8b342a8-dba4-4d7f-a551-1d8184f2ff58 " +
+     // "application/http; msgtype=response\r\n" +
+     // "Checksum: sha1:IT6YEX5WHKK57GOEHV2YHTTXEP5KPM6A\r\n" +
+     // "IP-Address: 80.150.6.184\r\n" +
+     // "\r\n";
+     final String warcHeader = "WARC/0.12\r\n"
+             + "MIME-Version: 1.0\r\n"
+             + "WARC-Record-Type: response\r\n"
+             + "WARC-Target-URI: " + url + "\r\n" // same URI the final assertion expects
+             + "WARC-Date: 2006-09-19T17:20:24Z\r\n"
+             + "WARC-Digest: sha1:IT6YEX5WHKK57GOEHV2YHTTXEP5KPM6A\r\n"
+             + "WARC-IP-Address: 80.150.6.184\r\n"
+             + "Content-ID: \r\n"
+             + "Content-Type: application/http; msgtype=response\r\n"
+             + "Content-Length: " + (HTTPHEADER.length() + BODY.length()) + "\r\n"
+             + "\r\n";
+     final String hdr = warcHeader + HTTPHEADER + BODY;
+     WARCRecord r = new WARCRecord(new ByteArrayInputStream(hdr.getBytes()),
+             "READER_IDENTIFIER", 0, false, true);
+     HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
+     har.skipHttpHeader();
+     byte[] b = new byte[BODY.length()];
+     assertEquals("short read of record body", b.length, har.read(b)); // read(byte[]) may legally return fewer bytes
+     assertEquals(BODY, new String(b));
+     assertHeaderCorrectlyParsed(har.getContentHeaders());
+     assertEquals("failed to retrieve Url from metadata", url, // expected first, then actual
+             har.getHeader().getUrl());
+ }
+
+ public void testParseHttpHeadersInARC() throws IOException {
+ final int len = HTTPHEADER.length() + BODY.length();
+ final int contentLength = BODY.length();
+ final String url = "http://www.ly.gov.tw:80/accpart.htm";
+ final String hdr = HTTPHEADER + BODY;
+ // Interesting difference between ARCRecord and WARCRecord is that the
+ // stream passed the ARCRecord is supposed to be just past the
+ // ARCRecord metadata line where as stream passed WARCRecord is at
+ // record start. TODO: Add to ARCRecord constructor that doesn't
+ // take an ArchiveRecordHeader but rather parses it from the stream.
+ ArchiveRecordHeader arh = new ArchiveRecordHeader() {
+ public int getContentBegin() {
+ // TODO: In ARCs, this is where http headers end and
+ // the content begins. Need to reconcile for generic
+ // HeaderedArchiveRecord processing. In this context, it
+ // makes sense setting it to zero -- HeaderedArchiveRecord
+ // will then figure it out.
+ return 0;
+ }
+
+ public String getDate() {
+ return null;
+ }
+
+ public String getDigest() {
+ return null;
+ }
+
+ public Set getHeaderFieldKeys() {
+ return null;
+ }
+
+ public Map getHeaderFields() {
+ return null;
+ }
+
+ public Object getHeaderValue(String key) {
+ return null;
+ }
+
+ public long getLength() {
+ return len;
+ }
+
+ public long getContentLength() {
+ return contentLength;
+ }
+
+ public String getMimetype() {
+ return null;
+ }
+
+ public long getOffset() {
+ return 0;
+ }
+
+ public String getReaderIdentifier() {
+ return null;
+ }
+
+ public String getRecordIdentifier() {
+ return null;
+ }
+
+ public String getUrl() {
+ return url;
+ }
+
+ public String getVersion() {
+ return null;
+ }
+
+ };
+ ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes()),
+ arh, 0, false, true, false);
+
+ HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
+ har.skipHttpHeader();
+ byte[] b = new byte[BODY.length()];
+ har.read(b);
+ String bodyRead = new String(b);
+ assertEquals(BODY, bodyRead);
+ assertHeaderCorrectlyParsed(har.getContentHeaders());
+ }
+
+ /**
+  * ARC variant that passes the metadata line in the stream (presumably parsed by ARCRecord itself).
+  */
+ public void testEasierParseHttpHeadersInARC() throws IOException {
+     final String url = "http://www.archive.org/index.htm";
+     final String arcHeader = url
+             + " 192.168.0.1 20070515111004 text/html 167568\n";
+     final String hdr = arcHeader + HTTPHEADER + BODY;
+     ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes()),
+             "READER_IDENTIFIER", 0, false, true, false);
+     HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
+     har.skipHttpHeader();
+     byte[] b = new byte[BODY.length()];
+     assertEquals("short read of record body", b.length, har.read(b)); // read(byte[]) may legally return fewer bytes
+     assertEquals(BODY, new String(b));
+     assertHeaderCorrectlyParsed(har.getContentHeaders());
+     assertEquals("failed to retrieve Url from metadata", url, // expected first, then actual
+             har.getHeader().getUrl());
+ }
+
+ /** Asserts that headers are exactly the header lines of HTTPHEADER; the "+ 1" covers its status line. */
+ private void assertHeaderCorrectlyParsed(Header[] headers) {
+     final List<String> orgHeaders = Arrays.asList(HTTPHEADER.split("\r\n"));
+     assertEquals("not all HTTP header entries have been retrieved",
+             orgHeaders.size(), headers.length + 1);
+     for (Header header : headers) {
+         String line = header.getName() + ": " + header.getValue();
+         assertTrue("unexpected HTTP header: " + line, orgHeaders.contains(line));
+     }
+ }
+
+ public void testNoheaderWARC() throws IOException {
+ String b = "hello world";
+ String c = "WARC/0.12\r\nContent-Type: text/plain\r\n"
+ + "Content-Length: " + b.length() + "\r\n\r\n" + b;
+ org.archive.io.warc.WARCRecord r = new org.archive.io.warc.WARCRecord(
+ new ByteArrayInputStream(c.getBytes()), "READER_IDENTIFIER", 0,
+ false, true);
+ HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
+ assertTrue(har.isStrict());
+ }
+}
diff --git a/src/test/java/org/archive/io/RecordingInputStreamTest.java b/src/test/java/org/archive/io/RecordingInputStreamTest.java
new file mode 100644
index 00000000..20a8b8b3
--- /dev/null
+++ b/src/test/java/org/archive/io/RecordingInputStreamTest.java
@@ -0,0 +1,132 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.PipedInputStream;
+import java.io.PipedOutputStream;
+
+import org.archive.util.TmpDirTestCase;
+
+
+/**
+ * Test cases for RecordingInputStream.
+ *
+ * @author gojomo
+ */
+public class RecordingInputStreamTest extends TmpDirTestCase
+{
+
+
+ /*
+ * Only delegates to the inherited setUp(); this class adds no fixture state of its own.
+ */
+ protected void setUp() throws Exception
+ {
+ super.setUp();
+ }
+
+ /**
+ * Test readFullyOrUntil soft (no exception) and hard (exception)
+ * length cutoffs, timeout, and rate-throttling. Wall-clock dependent:
+ * the final phase asserts that at least 5000 ms elapsed.
+ * @throws IOException on stream errors that no phase expects
+ * @throws InterruptedException declared by the recorder calls; not expected here
+ * @throws RecorderTimeoutException if a phase other than the timeout one times out
+ */
+ public void testReadFullyOrUntil() throws RecorderTimeoutException, IOException, InterruptedException
+ {
+ RecordingInputStream ris = new RecordingInputStream(16384, (new File(
+ getTmpDir(), "testReadFullyOrUntil").getAbsolutePath()));
+ ByteArrayInputStream bais = new ByteArrayInputStream(
+ "abcdefghijklmnopqrstuvwxyz".getBytes());
+ // test soft max: asking for 7 bytes returns quietly with exactly those 7 recorded
+ ris.open(bais);
+ ris.setLimits(10,0,0);
+ ris.readFullyOrUntil(7);
+ ris.close();
+ ReplayInputStream res = ris.getReplayInputStream();
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ res.readFullyTo(baos);
+ assertEquals("soft max cutoff","abcdefg",new String(baos.toByteArray()));
+ // test hard max: asking for 13 under setLimits(10,0,0) must throw; 11 bytes end up recorded
+ bais.reset(); // rewind the alphabet input for this phase
+ baos.reset(); // discard the bytes copied out by the previous phase
+ ris.open(bais);
+ boolean exceptionThrown = false;
+ try {
+ ris.setLimits(10,0,0);
+ ris.readFullyOrUntil(13);
+ } catch (RecorderLengthExceededException ex) {
+ exceptionThrown = true;
+ }
+ assertTrue("hard max exception",exceptionThrown);
+ ris.close();
+ res = ris.getReplayInputStream();
+ res.readFullyTo(baos);
+ assertEquals("hard max cutoff","abcdefghijk",
+ new String(baos.toByteArray()));
+ // test timeout: input arrives one byte per second (see trickle), slower than the 5000 limit -- presumably milliseconds
+ PipedInputStream pin = new PipedInputStream();
+ PipedOutputStream pout = new PipedOutputStream(pin);
+ ris.open(pin);
+ exceptionThrown = false;
+ trickle("abcdefghijklmnopqrstuvwxyz".getBytes(),pout);
+ try {
+ ris.setLimits(0,5000,0);
+ ris.readFullyOrUntil(0);
+ } catch (RecorderTimeoutException ex) {
+ exceptionThrown = true;
+ }
+ assertTrue("timeout exception",exceptionThrown);
+ ris.close();
+ // test rate limit: 10240 zero bytes under setLimits(0,0,2) -- presumably KB/s -- must take at least 5000 ms
+ bais = new ByteArrayInputStream(new byte[1024*2*5]);
+ ris.open(bais);
+ long startTime = System.currentTimeMillis();
+ ris.setLimits(0,0,2);
+ ris.readFullyOrUntil(0);
+ long endTime = System.currentTimeMillis();
+ long duration = endTime - startTime;
+ assertTrue("read too fast: "+duration,duration>=5000);
+ ris.close();
+ }
+
+ /** Starts a thread that writes bytes to pout, one per second, and closes pout after the last byte. */
+ protected void trickle(final byte[] bytes, final PipedOutputStream pout) {
+     new Thread() {
+         public void run() {
+             try {
+                 for (int i = 0; i < bytes.length; i++) {
+                     Thread.sleep(1000);
+                     pout.write(bytes[i]);
+                 }
+                 pout.close();
+             } catch (IOException e) {
+                 // pipe no longer writable (presumably the reading side gave up): just stop
+             } catch (InterruptedException e) {
+                 Thread.currentThread().interrupt(); // stop early but keep the interrupt visible
+             }
+         }
+     }.start();
+ }
+}
diff --git a/src/test/java/org/archive/io/ReplayCharSequenceTest.java b/src/test/java/org/archive/io/ReplayCharSequenceTest.java
new file mode 100644
index 00000000..9208594a
--- /dev/null
+++ b/src/test/java/org/archive/io/ReplayCharSequenceTest.java
@@ -0,0 +1,391 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.text.NumberFormat;
+import java.util.Date;
+import java.util.Random;
+import java.util.logging.Logger;
+
+import org.archive.util.FileUtils;
+import org.archive.util.TmpDirTestCase;
+
+import com.google.common.base.Charsets;
+
+/**
+ * Test ReplayCharSequences.
+ *
+ * @author stack, gojomo
+ * @version $Revision$, $Date$
+ */
+public class ReplayCharSequenceTest extends TmpDirTestCase
+{
+ /**
+ * Logger. NOTE(review): named "...ReplayCharSequenceFactoryTest", not after this class -- confirm intended.
+ */
+ private static Logger logger =
+ Logger.getLogger("org.archive.io.ReplayCharSequenceFactoryTest");
+
+
+ private static final int SEQUENCE_LENGTH = 127; // period of the byte pattern produced by fillBufferWithRegularContent
+ private static final int MULTIPLIER = 3; // pattern repetitions in the buffer; also used as repetition count by the tests
+ private static final int BUFFER_SIZE = SEQUENCE_LENGTH * MULTIPLIER; // size of regularBuffer
+ private static final int INCREMENT = 1; // step width used by accessingCharacters
+
+ /**
+ * Buffer of regular content; re-created in setUp().
+ */
+ private byte [] regularBuffer = null;
+
+ /*
+  * Runs the inherited set-up, then rebuilds the patterned byte buffer.
+  */
+ protected void setUp() throws Exception {
+     super.setUp();
+     byte[] blank = new byte[BUFFER_SIZE];
+     // fillBufferWithRegularContent fills in place and returns its argument
+     regularBuffer = fillBufferWithRegularContent(blank);
+ }
+
+ /**
+  * Replays recorded bytes through the "SJIS" charset and compares the
+  * result with the JVM's own decoding of the same bytes.
+  */
+ public void testShiftjis() throws IOException {
+     // Here's the bytes for the JIS encoding of the Japanese form of Nihongo
+     final byte[] jisBytes = {
+         (byte) 0x1B, (byte) 0x24, (byte) 0x42, (byte) 0x46,
+         (byte) 0x7C, (byte) 0x4B, (byte) 0x5C, (byte) 0x38,
+         (byte) 0x6C, (byte) 0x1B, (byte) 0x28, (byte) 0x42,
+         (byte) 0x1B, (byte) 0x28, (byte) 0x42 };
+     final String charsetName = "SJIS";
+     final String nihongo = new String(jisBytes, charsetName);
+     final int span = nihongo.length();
+
+     RecordingOutputStream recorded =
+             writeTestStream(jisBytes, MULTIPLIER, "testShiftjis", MULTIPLIER);
+     // TODO: check for existence of overflow file?
+     ReplayCharSequence rcs = getReplayCharSequence(recorded, Charset.forName(charsetName));
+     // The first span characters must decode to the reference string.
+     String head = rcs.subSequence(0, span).toString();
+     assertTrue("Nihongo " + nihongo + " does not equal converted string" +
+             " from rcs " + head,
+             nihongo.equals(head));
+     // The repetition that follows is only checked when the sequence is long enough.
+     if (rcs.length() >= (span * 2)) {
+         String second = rcs.subSequence(span, span + span).toString();
+         assertTrue("Nihongo " + nihongo + " does not equal converted " +
+                 " string from rcs (2nd time)" + second,
+                 nihongo.equals(second));
+     }
+ }
+
+ public void testGetReplayCharSequenceByteZeroOffset() throws IOException {
+
+ RecordingOutputStream ros = writeTestStream(
+ regularBuffer,MULTIPLIER,
+ "testGetReplayCharSequenceByteZeroOffset",MULTIPLIER);
+ ReplayCharSequence rcs = getReplayCharSequence(ros);
+
+ for (int i = 0; i < MULTIPLIER; i++) {
+ accessingCharacters(rcs);
+ }
+ }
+
+ private ReplayCharSequence getReplayCharSequence(RecordingOutputStream ros) throws IOException { // convenience overload: passes a null charset
+ return getReplayCharSequence(ros,null);
+ }
+ // Wraps the stream's replay input in a GenericReplayCharSequence; charset may be null (what null selects is not visible here).
+ private ReplayCharSequence getReplayCharSequence(RecordingOutputStream ros, Charset charset) throws IOException {
+ return new GenericReplayCharSequence(ros.getReplayInputStream(),
+ ros.getBufferLength()/2, ros.backingFilename, charset); // NOTE(review): half the recording buffer length is passed -- rationale not visible here
+ }
+
+
+ public void testGetReplayCharSequenceMultiByteZeroOffset()
+ throws IOException {
+
+ RecordingOutputStream ros = writeTestStream(
+ regularBuffer,MULTIPLIER,
+ "testGetReplayCharSequenceMultiByteZeroOffset",MULTIPLIER);
+ ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.UTF_8);
+
+ for (int i = 0; i < MULTIPLIER; i++) {
+ accessingCharacters(rcs);
+ }
+ }
+
+ /** Records the text once (no extra repetitions) and expects toString() to return it unchanged. */
+ public void testReplayCharSequenceByteToString() throws IOException {
+     String fileContent = "Some file content";
+     byte [] buffer = fileContent.getBytes();
+     RecordingOutputStream ros = writeTestStream(
+             buffer, 1, "testReplayCharSequenceByteToString.txt", 0);
+     ReplayCharSequence rcs = getReplayCharSequence(ros);
+     // expected first, actual second, so a failure reports them the right way round
+     assertEquals("Strings don't match", fileContent, rcs.toString());
+ }
+
+ /**
+  * Renders the UTF-16 code units of str in hex, e.g. "{ 61, 62 }", for log output.
+  * Returns "null" for null and "{  }" for the empty string (which used to throw).
+  */
+ private String toHexString(String str) {
+     if (str == null) {
+         return "null";
+     }
+     StringBuilder buf = new StringBuilder("{ ");
+     for (int i = 0; i < str.length(); i++) {
+         buf.append(i == 0 ? "" : ", ");
+         buf.append(Integer.toString(str.charAt(i), 16));
+     }
+     return buf.append(" }").toString();
+ }
+
+ /** Replays the same sixteen bytes as latin1, windows-1252 and ascii; each must equal String's own decoding. */
+ public void testSingleByteEncodings() throws IOException {
+     byte[] bytes = {
+         (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64,
+         (byte) 0x7d, (byte) 0x7e, (byte) 0x7f, (byte) 0x80,
+         (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84,
+         (byte) 0xfc, (byte) 0xfd, (byte) 0xfe, (byte) 0xff };
+     String latin1String = new String(bytes, "latin1");
+     RecordingOutputStream ros = writeTestStream(
+             bytes, 1, "testSingleByteEncodings-latin1.txt", 0);
+     ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.ISO_8859_1);
+     String result = rcs.toString();
+     logger.fine("latin1[0] " + toHexString(latin1String));
+     logger.fine("latin1[1] " + toHexString(result));
+     assertEquals("latin1 strings don't match", latin1String, result);
+     // same bytes again, windows-1252 (expected value first in each assertion)
+     String w1252String = new String(bytes, "windows-1252");
+     ros = writeTestStream(
+             bytes, 1, "testSingleByteEncodings-windows-1252.txt", 0);
+     rcs = getReplayCharSequence(ros,Charset.forName("windows-1252"));
+     result = rcs.toString();
+     logger.fine("windows-1252[0] " + toHexString(w1252String));
+     logger.fine("windows-1252[1] " + toHexString(result));
+     assertEquals("windows-1252 strings don't match", w1252String, result);
+     // same bytes again, ascii
+     String asciiString = new String(bytes, "ascii");
+     ros = writeTestStream(
+             bytes, 1, "testSingleByteEncodings-ascii.txt", 0);
+     rcs = getReplayCharSequence(ros,Charset.forName("ascii"));
+     result = rcs.toString();
+     logger.fine("ascii[0] " + toHexString(asciiString));
+     logger.fine("ascii[1] " + toHexString(result));
+     assertEquals("ascii strings don't match", asciiString, result);
+ }
+
+ /**
+  * Writes the text twice (one buffer repetition plus one further repetition)
+  * and checks that a UTF-8 and a windows-1252 replay both return it doubled.
+  */
+ public void testReplayCharSequenceByteToStringOverflow() throws IOException {
+     final String once = "Some file content. "; // ascii
+     RecordingOutputStream recorded = writeTestStream(once.getBytes(), 1,
+             "testReplayCharSequenceByteToStringOverflow.txt", 1);
+     final String twice = once + once;
+
+     // The text is ascii, a subset of both encodings below. Both are used
+     // because they exercise different code paths: UTF-8 is decoded to
+     // UTF-16 while windows-1252 is memory mapped directly. See
+     // GenericReplayCharSequence
+     ReplayCharSequence viaUtf8 = getReplayCharSequence(recorded, Charsets.UTF_8);
+     ReplayCharSequence via1252 =
+             getReplayCharSequence(recorded, Charset.forName("windows-1252"));
+
+     assertEquals("Strings don't match", twice, viaUtf8.toString());
+     assertEquals("Strings don't match", twice, via1252.toString());
+ }
+
+ /** Opens, checks and closes three successive UTF-8 replays of one recording holding ten copies of the text. */
+ public void testReplayCharSequenceByteToStringMulti() throws IOException {
+     String fileContent = "Some file content";
+     byte [] buffer = fileContent.getBytes("UTF-8");
+     final int MULTIPLICAND = 10;
+     StringBuilder sb = new StringBuilder(MULTIPLICAND * fileContent.length());
+     for (int i = 0; i < MULTIPLICAND; i++) {
+         sb.append(fileContent);
+     }
+     String expectedResult = sb.toString();
+     RecordingOutputStream ros = writeTestStream(
+             buffer,1,
+             "testReplayCharSequenceByteToStringMulti.txt",MULTIPLICAND-1);
+     for (int i = 0; i < 3; i++) {
+         ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.UTF_8);
+         String result = rcs.toString();
+         assertEquals("Strings don't match", expectedResult, result); // expected first, then actual
+         rcs.close();
+         System.gc(); // NOTE(review): presumably nudges release of resources held by the closed sequence -- confirm
+         System.runFinalization();
+     }
+ }
+
+ /**
+  * Not picked up as a test while named "xest...". Records more than Integer.MAX_VALUE bytes, then
+  * checks length(), a few boundary offsets, out-of-range offsets and 5000 seeded random offsets.
+  */
+ public void xestHugeReplayCharSequence() throws IOException {
+     String fileContent = "01234567890123456789";
+     String characterEncoding = "ascii";
+     byte[] buffer = fileContent.getBytes(characterEncoding);
+     long reps = (long) Integer.MAX_VALUE / (long) buffer.length + 1000000L;
+     logger.info("writing " + (reps * buffer.length)
+             + " bytes to testHugeReplayCharSequence.txt");
+     RecordingOutputStream ros = writeTestStream(buffer, 0,
+             "testHugeReplayCharSequence.txt", reps);
+     ReplayCharSequence rcs = getReplayCharSequence(ros,Charset.forName(characterEncoding));
+     if (reps * fileContent.length() > (long) Integer.MAX_VALUE) {
+         assertTrue("ReplayCharSequence has wrong length (length()="
+                 + rcs.length() + ") (should be " + Integer.MAX_VALUE + ")",
+                 rcs.length() == Integer.MAX_VALUE);
+     } else {
+         assertEquals("ReplayCharSequence has wrong length (length()="
+                 + rcs.length() + ") (should be "
+                 + (reps * fileContent.length()) + ")",
+                 reps * (long) fileContent.length(), (long) rcs.length());
+     }
+     // boundary cases or something
+     for (int index : new int[] { 0, rcs.length() / 4, rcs.length() / 2,
+             rcs.length() - 1, rcs.length() / 4 }) {
+         // logger.info("testing char at index=" +
+         // NumberFormat.getInstance().format(index));
+         assertEquals("Characters don't match (index="
+                 + NumberFormat.getInstance().format(index) + ")",
+                 fileContent.charAt(index % fileContent.length()), rcs
+                 .charAt(index));
+     }
+     // check that out of bounds indices throw exception; length() itself is probed because
+     // length() + 1 overflows to Integer.MIN_VALUE once length() == Integer.MAX_VALUE
+     for (int n : new int[] { -1, Integer.MIN_VALUE, rcs.length() }) {
+         try {
+             String message = "rcs.charAt(" + n + ")=" + rcs.charAt(n)
+                     + " ?!? -- expected IndexOutOfBoundsException";
+             logger.severe(message);
+             fail(message);
+         } catch (IndexOutOfBoundsException e) {
+             logger.info("got expected exception: " + e);
+         }
+     }
+
+     // check some characters at random spots & kinda stress test the
+     // system's memory mapping facility
+     Random rand = new Random(0); // seed so we get the same ones each time
+     for (int i = 0; i < 5000; i++) {
+         int index = rand.nextInt(rcs.length());
+         // logger.info(i + ". testing char at index=" +
+         // NumberFormat.getInstance().format(index));
+         assertEquals("Characters don't match (index="
+                 + NumberFormat.getInstance().format(index) + ")",
+                 fileContent.charAt(index % fileContent.length()), rcs
+                 .charAt(index));
+     }
+ }
+
+ /**
+ * Accessing characters test.
+ *
+ * Checks that characters in the rcs are in sequence.
+ *
+ * @param rcs The ReplayCharSequence to try out.
+ */
+ private void accessingCharacters(CharSequence rcs) {
+ long timestamp = (new Date()).getTime();
+ int seeks = 0;
+ for (int i = (INCREMENT * 2); (i + INCREMENT) < rcs.length();
+ i += INCREMENT) {
+ checkCharacter(rcs, i);
+ seeks++;
+ for (int j = i - INCREMENT; j < i; j++) {
+ checkCharacter(rcs, j);
+ seeks++;
+ }
+ }
+ // Note that printing out below breaks cruisecontrols drawing
+ // of the xml unit test results because it outputs disallowed
+ // xml characters.
+ logger.fine(rcs + " seeks count " + seeks + " in " +
+ ((new Date().getTime()) - timestamp) + " milliseconds.");
+ }
+
+ /**
+ * Check the character read.
+ *
+ * Throws assertion if not expected result.
+ *
+ * @param rcs ReplayCharSequence to read from.
+ * @param i Character offset.
+ */
+ private void checkCharacter(CharSequence rcs, int i) {
+ int c = rcs.charAt(i);
+ assertTrue("Character " + Integer.toString(c) + " at offset " + i +
+ " unexpected.", (c % SEQUENCE_LENGTH) == (i % SEQUENCE_LENGTH));
+ }
+
+ /**
+ * @param baseName
+ * @return RecordingOutputStream
+ * @throws IOException
+ */
+ private RecordingOutputStream writeTestStream(byte[] content,
+ int memReps, String baseName, long fileReps) throws IOException {
+ String backingFilename = FileUtils.maybeRelative(getTmpDir(),baseName).getAbsolutePath();
+ RecordingOutputStream ros = new RecordingOutputStream(
+ content.length * memReps,
+ backingFilename);
+ ros.open();
+ ros.markMessageBodyBegin();
+ for(long i = 0; i < (memReps+fileReps); i++) {
+ // fill buffer (repeat MULTIPLIER times) and
+ // overflow to disk (also MULTIPLIER times)
+ ros.write(content);
+ }
+ ros.close();
+ return ros;
+ }
+
+
+ /**
+  * Overwrite a buffer with the repeating byte progression
+  * 0, 1, ... SEQUENCE_LENGTH - 1, 0, 1, ... (single-byte, <= 127).
+  *
+  * @param buffer Buffer to fill.
+  * @return The buffer we filled.
+  */
+ private byte [] fillBufferWithRegularContent(byte [] buffer) {
+     int next = 0;
+     int slot = 0;
+     while (slot < buffer.length) {
+         buffer[slot] = (byte) (next & 0x00ff);
+         slot++;
+         // wrap around once a full sequence has been written
+         next = (next + 1 >= SEQUENCE_LENGTH) ? 0 : next + 1;
+     }
+     return buffer;
+ }
+
+ public void testCheckParameters()
+ {
+ // TODO: nothing is exercised yet, so this test currently passes vacuously.
+ }
+}
diff --git a/src/test/java/org/archive/io/RepositionableInputStreamTest.java b/src/test/java/org/archive/io/RepositionableInputStreamTest.java
new file mode 100644
index 00000000..1c7cc74c
--- /dev/null
+++ b/src/test/java/org/archive/io/RepositionableInputStreamTest.java
@@ -0,0 +1,70 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.io;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.PrintWriter;
+
+import org.archive.util.TmpDirTestCase;
+
+/** Exercises position tracking and repositioning of RepositionableInputStream over a file. */
+public class RepositionableInputStreamTest extends TmpDirTestCase {
+    private File testFile;
+    private static final String LINE = "0123456789abcdefghijklmnopqrstuv";
+    /** Creates the test file in the tmp dir and fills it with 100 copies of LINE. */
+    protected void setUp() throws Exception {
+        super.setUp();
+        this.testFile = new File(getTmpDir(), this.getClass().getName());
+        PrintWriter pw = new PrintWriter(new FileOutputStream(testFile));
+        for (int i = 0; i < 100; i++) {
+            pw.print(LINE);
+        }
+        pw.close();
+    }
+    /** Reads, rewinds and re-reads; the awkward buffer size (57) makes reads span buffer boundaries. */
+    public void testname() throws Exception {
+        RepositionableInputStream ris = new RepositionableInputStream(new FileInputStream(this.testFile), 57);
+        try {
+            int c = ris.read();
+            assertEquals(1, ris.position());
+            ris.read();
+            ris.position(0);
+            assertEquals(0, ris.position());
+            int c1 = ris.read();
+            assertEquals(c, c1);
+            ris.position(0);
+            byte [] bytes = new byte[LINE.length()];
+            long offset = 0;
+            for (int i = 0; i < 10; i++) {
+                assertEquals(LINE.length(), ris.read(bytes, 0, LINE.length())); // a short read would leave stale bytes
+                assertEquals(LINE, new String(bytes));
+                offset += LINE.length();
+                assertEquals(offset, ris.position());
+            }
+            long p = ris.position();
+            ris.position(p - LINE.length());
+            assertEquals(p - LINE.length(), ris.position());
+            assertEquals(c1, ris.read());
+        } finally {
+            ris.close(); // release the file handle even when an assertion fails
+        }
+    }
+}
diff --git a/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java b/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java
new file mode 100644
index 00000000..f0be6506
--- /dev/null
+++ b/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java
@@ -0,0 +1,122 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.arc;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.util.Arrays;
+
+import org.archive.io.WriterPool;
+import org.archive.io.WriterPoolMember;
+import org.archive.io.WriterPoolSettings;
+import org.archive.util.TmpDirTestCase;
+
+
+/**
+ * Test ARCWriterPool: borrowing, returning and invalidating pooled writers.
+ */
+@SuppressWarnings("deprecation")
+public class ARCWriterPoolTest extends TmpDirTestCase {
+    private static final String PREFIX = "TEST";
+    private static final int MAX_ACTIVE = 3;
+    private static final int MAX_WAIT_MILLISECONDS = 100;
+    private static final String CONTENT = "Any old content";
+
+    /** Borrows MAX_ACTIVE writers, writes a record through each, then returns them all. */
+    public void testARCWriterPool()
+    throws Exception {
+        cleanUpOldFiles(PREFIX);
+        WriterPool pool = new ARCWriterPool(getSettings(true),
+            MAX_ACTIVE, MAX_WAIT_MILLISECONDS);
+        try {
+            WriterPoolMember [] writers = new WriterPoolMember[MAX_ACTIVE];
+            borrowAndWriteAll(pool, writers, newContentStream());
+            // Pool is maxed out. New behavior is that additional requests
+            // block as long as necessary -- so no longer testing for timeout/
+            // exception
+            returnAll(pool, writers);
+        } finally {
+            pool.close(); // close even when an assertion above fails
+        }
+    }
+
+    /** Invalidates one borrowed writer, returns the rest, then borrows and returns MAX_ACTIVE again. */
+    public void testInvalidate() throws Exception {
+        cleanUpOldFiles(PREFIX);
+        WriterPool pool = new ARCWriterPool(getSettings(true),
+            MAX_ACTIVE, MAX_WAIT_MILLISECONDS);
+        try {
+            WriterPoolMember [] writers = new WriterPoolMember[MAX_ACTIVE];
+            ByteArrayOutputStream baos = newContentStream();
+            borrowAndWriteAll(pool, writers, baos);
+            WriterPoolMember writer2Invalidate = writers[pool.getNumActive() - 1];
+            writers[pool.getNumActive() - 1] = null;
+            pool.invalidateFile(writer2Invalidate);
+            for (int i = 0; i < (MAX_ACTIVE - 1); i++) {
+                if (writers[i] != null) {
+                    pool.returnFile(writers[i]);
+                }
+            }
+            borrowAndWriteAll(pool, writers, baos);
+            returnAll(pool, writers);
+        } finally {
+            pool.close(); // close even when an assertion above fails
+        }
+    }
+
+    /** Returns a stream pre-loaded with the bytes of CONTENT. */
+    private static ByteArrayOutputStream newContentStream() throws Exception {
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        baos.write(CONTENT.getBytes());
+        return baos;
+    }
+
+    /** Borrows MAX_ACTIVE writers into the array, writing one record via each and checking the active count. */
+    private void borrowAndWriteAll(WriterPool pool, WriterPoolMember [] writers,
+            ByteArrayOutputStream baos) throws Exception {
+        for (int i = 0; i < MAX_ACTIVE; i++) {
+            writers[i] = pool.borrowFile();
+            assertEquals("Number active", i + 1, pool.getNumActive());
+            ((ARCWriter)writers[i]).write("http://one.two.three", "no-type",
+                "0.0.0.0", 1234567890, CONTENT.length(), baos);
+        }
+    }
+
+    /** Returns all writers to the pool, last first, checking the active and idle counts after each. */
+    private void returnAll(WriterPool pool, WriterPoolMember [] writers) throws Exception {
+        for (int i = (MAX_ACTIVE - 1); i >= 0; i--) {
+            pool.returnFile(writers[i]);
+            assertEquals("Number active", i, pool.getNumActive());
+            assertEquals("Number idle", MAX_ACTIVE - pool.getNumActive(),
+                pool.getNumIdle());
+        }
+    }
+
+    private WriterPoolSettings getSettings(final boolean isCompressed) {
+        File [] files = {getTmpDir()};
+        return new WriterPoolSettingsData(
+            PREFIX,
+            "${prefix}-${timestamp17}-${serialno}-${heritrix.hostname}",
+            ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE,
+            isCompressed,
+            Arrays.asList(files),
+            null);
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/org/archive/io/arc/ARCWriterTest.java b/src/test/java/org/archive/io/arc/ARCWriterTest.java
new file mode 100644
index 00000000..f6e2bf6a
--- /dev/null
+++ b/src/test/java/org/archive/io/arc/ARCWriterTest.java
@@ -0,0 +1,699 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.arc;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.NullInputStream;
+import org.apache.commons.io.output.NullOutputStream;
+import org.archive.io.ArchiveRecord;
+import org.archive.io.ArchiveRecordHeader;
+import org.archive.io.ReplayInputStream;
+import org.archive.io.WriterPoolMember;
+import org.archive.io.WriterPoolSettings;
+import org.archive.util.ArchiveUtils;
+import org.archive.util.TmpDirTestCase;
+
+import com.google.common.io.Closeables;
+
+
+/**
+ * Test ARCWriter class.
+ *
+ * This code exercises ARCWriter AND ARCReader. First it writes ARCs w/
+ * ARCWriter. Then it validates what was written w/ ARCReader.
+ *
+ * @author stack
+ */
+public class ARCWriterTest
+extends TmpDirTestCase implements ARCConstants {
+    /**
+     * Utility ARCWriter for writing bad ARCs: when end junk has been set, those
+     * bytes are written at the start of postWriteRecordTasks().
+     */
+    public class CorruptibleARCWriter extends ARCWriter {
+        byte[] endJunk = null;
+        public CorruptibleARCWriter(AtomicInteger serial_no, WriterPoolSettings settings) {
+            super(serial_no, settings);
+        }
+        /** Sets the junk bytes to append; null (the initial value) appends nothing. */
+        public void setEndJunk(byte[] b) throws IOException {
+            this.endJunk = b;
+        }
+        @Override
+        protected void postWriteRecordTasks() throws IOException {
+            final byte[] junk = this.endJunk;
+            if (junk != null) {
+                this.write(junk);
+            }
+            super.postWriteRecordTasks();
+        }
+    }
+
+ /**
+ * Suffix to use for ARC files made by JUNIT.
+ */
+ private static final String SUFFIX = "JUNIT";
+
+ private static final String SOME_URL = "http://www.archive.org/test/";
+
+
+ private static final AtomicInteger SERIAL_NO = new AtomicInteger();
+
+ /*
+ * Delegates to the superclass setup; see TestCase#setUp().
+ */
+ protected void setUp() throws Exception {
+ super.setUp();
+ }
+
+ /*
+ * Delegates to the superclass teardown; see TestCase#tearDown().
+ */
+ protected void tearDown() throws Exception {
+ super.tearDown();
+ }
+
+ protected static String getContent() {
+ return getContent(null); // null index yields the generic "Some Page" content
+ }
+
+ protected static String getContent(String indexStr) {
+ String page = (indexStr != null)? "Page #" + indexStr: "Some Page"; // page label: indexed, or generic when null
+ return "HTTP/1.1 200 OK\r\n" + // minimal HTTP response: status line, one header, blank line, then the body
+ "Content-Type: text/html\r\n\r\n" +
+ "" + page + // NOTE(review): the empty literals around page may be HTML markup lost in extraction -- confirm against upstream
+ "" +
+ "" + page +
+ "";
+ }
+
+ @SuppressWarnings("deprecation")
+ protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index)
+ throws IOException {
+ String indexStr = Integer.toString(index);
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ // Start the record with an arbitrary 14-digit date per RFC2540
+ String now = ArchiveUtils.get14DigitDate();
+ int recordLength = 0;
+ byte[] record = (getContent(indexStr)).getBytes();
+ recordLength += record.length;
+ baos.write(record);
+ // Add the newline between records back in
+ baos.write("\n".getBytes());
+ recordLength += 1;
+ arcWriter.write("http://www.one.net/id=" + indexStr, "text/html",
+ "0.1.2.3", Long.parseLong(now), recordLength, baos);
+ return recordLength;
+ }
+
+ private File writeRecords(String baseName, boolean compress,
+ long maxSize, int recordCount)
+ throws IOException {
+ cleanUpOldFiles(baseName);
+ File [] files = {getTmpDir()};
+ ARCWriter arcWriter =
+ new ARCWriter(
+ SERIAL_NO,
+ new WriterPoolSettingsData(
+ baseName,
+ "${prefix}-"+SUFFIX,
+ maxSize,
+ compress,
+ Arrays.asList(files),
+ null));
+ assertNotNull(arcWriter);
+ for (int i = 0; i < recordCount; i++) {
+ writeRandomHTTPRecord(arcWriter, i);
+ }
+ arcWriter.close();
+ assertTrue("Doesn't exist: " +
+ arcWriter.getFile().getAbsolutePath(),
+ arcWriter.getFile().exists());
+ return arcWriter.getFile();
+ }
+
+    /**
+     * Validates the ARC, then re-reads every record by absolute offset; a recordCount of -1 means no expected count.
+     */
+    private void validate(File arcFile, int recordCount)
+    throws FileNotFoundException, IOException {
+        ARCReader reader = ARCReaderFactory.get(arcFile);
+        assertNotNull(reader);
+        List metaDatas = (recordCount == -1)
+            ? reader.validate() : reader.validate(recordCount);
+        reader.close();
+        // Now, run through each of the records doing absolute get going from
+        // the end to start. Reopen the arc so no context between this test
+        // and the previous.
+        for (int i = metaDatas.size() - 1; i >= 0; i--) {
+            reader = ARCReaderFactory.get(arcFile);
+            ARCRecordMetaData meta = (ARCRecordMetaData)metaDatas.get(i);
+            ArchiveRecord r = reader.get(meta.getOffset());
+            String mimeType = r.getHeader().getMimetype();
+            assertTrue("Record is bogus", mimeType != null && mimeType.length() > 0);
+            reader.close();
+        }
+        // Comparing against -1 could never succeed, so only check a real expected count.
+        if (recordCount != -1) {
+            assertEquals("Metadata count not as expected",recordCount, metaDatas.size());
+        }
+        for (Iterator i = metaDatas.iterator(); i.hasNext();) {
+            ARCRecordMetaData r = (ARCRecordMetaData)i.next();
+            assertTrue("Record is empty", r.getLength() > 0);
+        }
+    }
+
+ public void testCheckARCFileSize()
+ throws IOException {
+ runCheckARCFileSizeTest("checkARCFileSize", false); // uncompressed variant
+ }
+
+ public void testCheckARCFileSizeCompressed()
+ throws IOException {
+ runCheckARCFileSizeTest("checkARCFileSize", true); // compressed variant; same base name as the uncompressed test
+ }
+
+ public void testWriteRecord() throws IOException {
+ final int recordCount = 2; // records written by writeRecords
+ File arcFile = writeRecords("writeRecord", false,
+ DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
+ validate(arcFile, recordCount + 1); // +1 for the ARC header record.
+ }
+
+    /** Fetches the second record through an offset-positioned reader, then iterates from that offset. */
+    public void testRandomAccess() throws IOException {
+        final int recordCount = 3;
+        File arcFile = writeRecords("writeRecord", true,
+            DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
+        ARCReader reader = ARCReaderFactory.get(arcFile);
+        // Walk every record, remembering the URL and offset of the second one.
+        String url = null;
+        long offset = -1;
+        long totalRecords = 0;
+        final Iterator all = reader.iterator();
+        while (all.hasNext()) {
+            ARCRecord ar = (ARCRecord)all.next();
+            if (totalRecords == 1) {
+                url = ar.getMetaData().getUrl();
+                offset = ar.getMetaData().getOffset();
+            }
+            totalRecords++;
+        }
+        reader.close();
+
+        // A reader opened at that offset must hand back the same record.
+        reader = ARCReaderFactory.get(arcFile, offset);
+        ArchiveRecord ar = reader.get();
+        assertEquals(ar.getHeader().getUrl(), url);
+        ar.close();
+        reader.close();
+
+        // Get reader again. See how iterator works with offset
+        reader = ARCReaderFactory.get(arcFile, offset);
+        int count = 0;
+        final Iterator fromOffset = reader.iterator();
+        while (fromOffset.hasNext()) {
+            fromOffset.next();
+            count++;
+        }
+        reader.close();
+        // Iterating from the second record is expected to skip exactly one record.
+        assertEquals(totalRecords - 1, count);
+    }
+
+ public void testWriteRecordCompressed() throws IOException {
+ final int recordCount = 2; // records written by writeRecords
+ File arcFile = writeRecords("writeRecordCompressed", true,
+ DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
+ validate(arcFile, recordCount + 1 /*Header record*/); // compressed variant of testWriteRecord
+ }
+
+    /**
+     * Writes one record of 3 * ONE_GB bytes from a NullInputStream to an ARCWriter whose output is a NullOutputStream.
+     */
+    public void testWriteGiantRecord() throws IOException {
+        final PrintStream sink = new PrintStream(new NullOutputStream());
+        final WriterPoolSettingsData settings = new WriterPoolSettingsData(
+                "",
+                "",
+                -1,
+                false,
+                null,
+                null);
+        ARCWriter arcWriter = new ARCWriter(SERIAL_NO, sink, new File("dummy"), settings);
+        assertNotNull(arcWriter);
+
+        // The record date passed to the writer is the current time in milliseconds.
+        final long now = System.currentTimeMillis();
+        // Declared length and supplied stream length are the same.
+        final long recordLength = org.apache.commons.io.FileUtils.ONE_GB * 3;
+
+        arcWriter.write("dummy:uri", "application/octet-stream",
+            "0.1.2.3", now, recordLength, new NullInputStream(recordLength));
+        arcWriter.close();
+    }
+
+ private void runCheckARCFileSizeTest(String baseName, boolean compress)
+ throws FileNotFoundException, IOException {
+ File f = writeRecords(baseName, compress, 1024, 15);
+ validate(f, 15+1);
+ }
+
+ protected CorruptibleARCWriter createARCWriter(String name, boolean compress) {
+ File [] files = {getTmpDir()};
+ return new CorruptibleARCWriter(
+ SERIAL_NO,
+ new WriterPoolSettingsData(
+ name,
+ "${prefix}-"+SUFFIX,
+ DEFAULT_MAX_ARC_FILE_SIZE,
+ compress,
+ Arrays.asList(files),
+ null));
+ }
+
+ protected static ByteArrayInputStream getBais(String str)
+ throws IOException {
+ return new ByteArrayInputStream(str.getBytes()); // wraps the string's bytes (platform default charset)
+ }
+
+ /**
+ * Writes a record, suppressing normal length-checks (so that
+ * intentionally malformed records may be written).
+ */
+ protected static void writeRecord(ARCWriter writer, String url,
+ String type, int len, ByteArrayInputStream bais)
+ throws IOException {
+ writer.write(url, type, "192.168.1.1", (new Date()).getTime(), len,
+ bais, false);
+ }
+
+ protected int iterateRecords(ARCReader r)
+ throws IOException {
+ int count = 0;
+ for (Iterator i = r.iterator(); i.hasNext();) {
+ ARCRecord rec = (ARCRecord)i.next();
+ rec.close();
+ if (count != 0) {
+ assertTrue("Unexpected URL " + rec.getMetaData().getUrl(),
+ rec.getMetaData().getUrl().startsWith(SOME_URL));
+ }
+ count++;
+ }
+ return count;
+ }
+
+ protected CorruptibleARCWriter createArcWithOneRecord(String name,
+ boolean compressed)
+ throws IOException {
+ CorruptibleARCWriter writer = createARCWriter(name, compressed);
+ String content = getContent();
+ writeRecord(writer, SOME_URL, "text/html",
+ content.length(), getBais(content));
+ return writer;
+ }
+
+    /** Expects an IOException starting "Metadata line doesn't match" for a URL with a space; fails cleanly (no NPE) if none is thrown. */
+    public void testSpaceInURL() {
+        String eMessage = null;
+        try {
+            holeyUrl("testSpaceInURL", false, " ");
+        } catch (IOException e) {
+            eMessage = e.getMessage();
+        }
+        assertTrue("Didn't get expected exception: " + eMessage, eMessage != null && eMessage.startsWith("Metadata line doesn't match"));
+    }
+
+    /** Expects an IOException starting "Metadata line doesn't match" for a URL with a tab; fails cleanly (no NPE) if none is thrown. */
+    public void testTabInURL() {
+        String eMessage = null;
+        try {
+            holeyUrl("testTabInURL", false, "\t");
+        } catch (IOException e) {
+            eMessage = e.getMessage();
+        }
+        assertTrue("Didn't get expected exception: " + eMessage, eMessage != null && eMessage.startsWith("Metadata line doesn't match"));
+    }
+
+ protected void holeyUrl(String name, boolean compress, String urlInsert)
+ throws IOException {
+ ARCWriter writer = null;
+ try {
+ writer = createArcWithOneRecord(name, compress);
+ // Add some bytes on the end to mess up the record.
+ String content = getContent();
+ writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html",
+ content.length(), getBais(content));
+ } finally {
+ Closeables.close(writer, true);
+ }
+ }
+
+// If uncompressed, length has to be right or parse will fail.
+//
+// public void testLengthTooShort() throws IOException {
+// lengthTooShort("testLengthTooShort-" + PREFIX, false);
+// }
+
+ public void testLengthTooShortCompressed() throws IOException {
+ lengthTooShort("testLengthTooShortCompressed", true, false); // compressed ARC, non-strict reader
+ }
+
+    /** Strict reading of a too-short record must raise a RuntimeException; fails cleanly (no NPE) if none is thrown. */
+    public void testLengthTooShortCompressedStrict()
+    throws IOException {
+        String eMessage = null;
+        try {
+            lengthTooShort("testLengthTooShortCompressedStrict", true, true);
+        } catch (RuntimeException e) {
+            eMessage = e.getMessage();
+        }
+        assertTrue("Didn't get expected exception: " + eMessage,
+            eMessage != null && eMessage.startsWith("java.io.IOException: Record STARTING at"));
+    }
+
+ protected void lengthTooShort(String name, boolean compress, boolean strict)
+ throws IOException {
+ CorruptibleARCWriter writer = null;
+ try {
+ writer = createArcWithOneRecord(name, compress);
+ // Second record: the stream holds more bytes than the declared length.
+ String content = getContent();
+ ByteArrayInputStream bais = getBais(content+"SOME TRAILING BYTES");
+ writeRecord(writer, SOME_URL, "text/html",
+ content.length(), bais);
+ writer.setEndJunk("SOME TRAILING BYTES".getBytes()); // junk is emitted by the writer's post-write hook from here on
+ writeRecord(writer, SOME_URL, "text/html",
+ content.length(), getBais(content));
+ } finally {
+ Closeables.close(writer, true);
+ }
+
+ // Capture System.err in a byte stream so the reader's warning can be inspected.
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
+ PrintStream origErr = System.err;
+ ARCReader r = null;
+ try {
+ System.setErr(new PrintStream(os));
+
+ r = ARCReaderFactory.get(writer.getFile());
+ r.setStrict(strict);
+ int count = iterateRecords(r);
+ assertTrue("Count wrong " + count, count == 4); // presumably the header record plus the three written above
+
+ // Make sure we get the warning string which complains about the
+ // trailing bytes.
+ String err = os.toString();
+ assertTrue("No message " + err, err.startsWith("WARNING") &&
+ (err.indexOf("Record STARTING at") > 0));
+ r.close(); // closed again (quietly) in the finally block
+ } finally {
+ Closeables.close(r, true);
+ System.setErr(origErr); // always restore the real System.err
+ }
+ }
+
+// If uncompressed, length has to be right or parse will fail.
+//
+// public void testLengthTooLong()
+// throws IOException {
+// lengthTooLong("testLengthTooLongCompressed-" + PREFIX,
+// false, false);
+// }
+
+ public void testLengthTooLongCompressed()
+ throws IOException {
+ lengthTooLong("testLengthTooLongCompressed",
+ true, false); // compressed ARC, non-strict reader
+ }
+
+    /** Strict reading of an over-long declared length must raise an IOException; fails cleanly (no NPE) if none is thrown. */
+    public void testLengthTooLongCompressedStrict() {
+        String eMessage = null;
+        try {
+            lengthTooLong("testLengthTooLongCompressed", true, true);
+        } catch (IOException e) {
+            eMessage = e.getMessage();
+        }
+        assertTrue("Didn't get expected exception: " + eMessage,
+            eMessage != null && eMessage.startsWith("Premature EOF before end-of-record"));
+    }
+
+ protected void lengthTooLong(String name, boolean compress,
+ boolean strict)
+ throws IOException {
+ ARCWriter writer = createArcWithOneRecord(name, compress);
+ // Add a record with a length that is too long.
+ String content = getContent();
+ writeRecord(writer, SOME_URL+"2", "text/html",
+ content.length() + 10, getBais(content));
+ writeRecord(writer, SOME_URL+"3", "text/html",
+ content.length(), getBais(content));
+ writer.close(); // NOTE(review): not in a finally block, unlike lengthTooShort
+
+ // Capture System.err so the reader's warning can be inspected.
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
+
+ PrintStream origErr = System.err;
+ ARCReader r = null;
+ try {
+ System.setErr(new PrintStream(os));
+
+ r = ARCReaderFactory.get(writer.getFile());
+ r.setStrict(strict);
+ int count = iterateRecords(r);
+ assertTrue("Count wrong " + count, count == 4); // presumably the header record plus the three written above
+
+ // Make sure we get the warning string which complains about the
+ // premature EOF.
+ String err = os.toString();
+ assertTrue("No message " + err,
+ err.startsWith("WARNING Premature EOF before end-of-record"));
+ } finally {
+ Closeables.close(r, true);
+ System.setErr(origErr); // always restore the real System.err
+ }
+ }
+
+ public void testGapError() throws IOException {
+ ARCWriter writer = createArcWithOneRecord("testGapError", true);
+ String content = getContent();
+ // Make a 'weird' RIS that returns bad 'remaining' length
+ // when remaining should be 0
+ ReplayInputStream ris = new ReplayInputStream(content.getBytes(),
+ content.length(), null) {
+ public long remaining() {
+ return (super.remaining()==0) ? -1 : super.remaining(); // report -1 in place of 0
+ }
+ };
+ String message = null;
+ try {
+ writer.write(SOME_URL, "text/html", "192.168.1.1",
+ (new Date()).getTime(), content.length(), ris);
+ } catch (IOException e) {
+ message = e.getMessage(); // the expected outcome: the write is rejected
+ } finally {
+ IOUtils.closeQuietly(ris);
+ }
+ writer.close();
+ assertTrue("No gap when should be",
+ message != null &&
+ message.indexOf("Gap between expected and actual") >= 0);
+ }
+
+    /**
+     * Write an arc file for other tests to use. The file holds a single
+     * text/html record for SOME_URL.
+     * @param arcdir Directory to write to.
+     * @param compress True if file should be compressed.
+     * @return ARC written.
+     * @throws IOException
+     */
+    public static File createARCFile(File arcdir, boolean compress)
+    throws IOException {
+        final File [] outputDirs = {arcdir};
+        final WriterPoolSettingsData settings = new WriterPoolSettingsData(
+                "",
+                "test",
+                DEFAULT_MAX_ARC_FILE_SIZE,
+                compress,
+                Arrays.asList(outputDirs),
+                null);
+        final ARCWriter arcWriter = new ARCWriter(SERIAL_NO, settings);
+        final String body = getContent();
+        writeRecord(arcWriter, SOME_URL, "text/html", body.length(), getBais(body));
+        arcWriter.close();
+        return arcWriter.getFile();
+    }
+
+// public void testSpeed() throws IOException {
+// ARCWriter writer = createArcWithOneRecord("speed", true);
+// // Add a record with a length that is too long.
+// String content = getContent();
+// final int count = 100000;
+// logger.info("Starting speed write of " + count + " records.");
+// for (int i = 0; i < count; i++) {
+// writeRecord(writer, SOME_URL, "text/html", content.length(),
+// getBaos(content));
+// }
+// writer.close();
+// logger.info("Finished speed write test.");
+// }
+
+
+ public void testValidateMetaLine() throws Exception {
+ final String line = "http://www.aandw.net/images/walden2.png " +
+ "128.197.34.86 20060111174224 image/png 2160";
+ ARCWriter w = createARCWriter("testValidateMetaLine", true);
+ try {
+ w.validateMetaLine(line); // bare line, no terminator
+ w.validateMetaLine(line + LINE_SEPARATOR);
+ w.validateMetaLine(line + "\\r\\n"); // NOTE(review): this appends literal backslash characters, not CR LF -- confirm intended
+ } finally {
+ w.close();
+ }
+ }
+
+    public void testArcRecordOffsetReads() throws Exception {
+        ARCReader r = getSingleRecordReader("testArcRecordInBufferStream");
+        ARCRecord ar = getSingleRecord(r);
+        // Now try getting some random set of bytes out of it
+        // at an odd offset (used to fail because we were
+        // doing bad math to find where in buffer to read).
+        final byte[] buffer = new byte[17];
+        final int maxRead = 4;
+        int totalRead = 0;
+        while (totalRead < maxRead) {
+            final int read = ar.read(buffer, 13 + totalRead, maxRead - totalRead);
+            assertTrue("Read returned " + read + " after " + totalRead + " bytes", read > 0); // never fold -1/0 into the total
+            totalRead += read;
+        }
+        r.close();
+    }
+
+    // available should always be >= 0; extra read()s should all give EOF
+    public void testArchiveRecordAvailableConsistent() throws Exception {
+        final ARCReader reader = getSingleRecordReader("testArchiveRecordAvailableConsistent");
+        final ARCRecord record = getSingleRecord(reader);
+        // Drain the record one byte at a time via the no-param read().
+        int c;
+        do {
+            c = record.read();
+        } while (c >= 0);
+        // Past end-of-record every read() must give -1 and available() must stay non-negative.
+        for (int attempt = 0; attempt < 5; attempt++) {
+            assertTrue("available negative:"+record.available(), record.available()>=0);
+            assertEquals(-1, record.read());
+        }
+        reader.close();
+    }
+
+    // should always give -1 on repeated reads past EOR
+    public void testArchiveRecordEORConsistent() throws Exception {
+        final ARCReader reader = getSingleRecordReader("testArchiveRecordEORConsistent");
+        final ARCRecord record = getSingleRecord(reader);
+        this.readToEOS(record);
+        for (int attempt = 1; attempt <= 5; attempt++) {
+            final byte[] oneByte = new byte[1];
+            assertEquals(-1, record.read(oneByte)); // consecutive reads after EOR should always give -1
+        }
+        reader.close();
+    }
+
+    // should not throw premature EOF when wrapped with BufferedInputStream
+    // [HER-1450] showed this was the case using Apache Tika
+    public void testArchiveRecordMarkSupport() throws Exception {
+        final ARCReader reader = getSingleRecordReader("testArchiveRecordMarkSupport");
+        final ARCRecord record = getSingleRecord(reader);
+        record.setStrict(true);
+        // Wrap the record so that mark/reset is available.
+        final InputStream buffered = new BufferedInputStream(record);
+        if (buffered.markSupported()) {
+            for (int pass = 1; pass <= 3; pass++) {
+                this.readToEOS(buffered);
+                buffered.mark(buffered.available());
+                buffered.reset();
+            }
+            buffered.close();
+        }
+        reader.close();
+    }
+
+    /**
+     * Test a particular style of using the reader iterator: assigning each
+     * element to an ArchiveRecord without a cast, which requires the typed
+     * iterator. (Should possibly be on a reader-centric test class, but the
+     * best setup functionality is here.)
+     * @throws IOException
+     */
+    public void testReadIterator() throws IOException {
+        final int recordCount = 3;
+        File arcFile = writeRecords("writeRecord", true,
+            DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
+        ARCReader reader = ARCReaderFactory.get(arcFile);
+        Iterator<ArchiveRecord> it = reader.iterator();
+        while (it.hasNext()) {
+            ArchiveRecord next = it.next();
+            next.close();
+        }
+        reader.close();
+    }
+
+ protected void readToEOS(InputStream in) throws Exception {
+ byte [] buf = new byte[1024];
+ int read = 0;
+ while (read >= 0) {
+ read = in.read(buf);
+ // System.out.println("readToEOS read " + read + " bytes");
+ }
+ }
+
+ protected ARCReader getSingleRecordReader(String name) throws Exception {
+ // Get an ARC with one record.
+ WriterPoolMember w = createArcWithOneRecord(name, true);
+ w.close();
+ // Get reader on said ARC.
+ ARCReader r = ARCReaderFactory.get(w.getFile());
+ return r;
+ }
+
+ protected ARCRecord getSingleRecord(ARCReader r) {
+ final Iterator i = r.iterator();
+ // Skip first ARC meta record.
+ i.next();
+ i.hasNext(); // result ignored; NOTE(review): presumably called for its effect on the iterator -- confirm
+ // Now we're at first and only record in ARC.
+ return (ARCRecord) i.next();
+ }
+}
diff --git a/src/test/java/org/archive/io/warc/WARCWriterTest.java b/src/test/java/org/archive/io/warc/WARCWriterTest.java
new file mode 100644
index 00000000..35c68714
--- /dev/null
+++ b/src/test/java/org/archive/io/warc/WARCWriterTest.java
@@ -0,0 +1,512 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.io.warc;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.net.URI;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.archive.io.ArchiveRecord;
+import org.archive.io.ArchiveRecordHeader;
+import org.archive.io.UTF8Bytes;
+import org.archive.io.WriterPoolMember;
+import org.archive.uid.RecordIDGenerator;
+import org.archive.uid.UUIDGenerator;
+import org.archive.util.ArchiveUtils;
+import org.archive.util.TmpDirTestCase;
+import org.archive.util.anvl.ANVLRecord;
+
+/**
+ * Test Writer and Reader.
+ * @author stack
+ * @version $Date: 2006-08-29 19:35:48 -0700 (Tue, 29 Aug 2006) $ $Version$
+ */
+public class WARCWriterTest
+extends TmpDirTestCase implements WARCConstants {
+
+ private static final AtomicInteger SERIAL_NO = new AtomicInteger();
+
+ RecordIDGenerator generator = new UUIDGenerator();
+
+ /**
+ * Marker appended to the name prefix/template of WARC files made by JUNIT.
+ */
+ private static final String SUFFIX = "JUNIT";
+
+ private static final String SOME_URL = "http://www.archive.org/test/";
+
+ @SuppressWarnings("unchecked")
+ public void testCheckHeaderLineValue() throws Exception {
+     // Verifies that checkHeaderValue accepts a plain token and rejects values
+     // containing a space or a control character.
+     WARCWriter writer = new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData(
+         "","test",1,false,Collections.EMPTY_LIST,Collections.EMPTY_LIST,generator));
+     try { // close the writer even when one of the expectations fails
+         writer.checkHeaderValue("one");
+         try {
+             writer.checkHeaderValue("with space");
+             fail("Expected IllegalArgumentException for a value containing a space");
+         } catch(IllegalArgumentException expected) {
+             // Rejection is the behaviour under test.
+         }
+         try {
+             // "\0" is an octal escape for NUL; the "x0000" that follows is literal text.
+             writer.checkHeaderValue("with\0x0000controlcharacter");
+             fail("Expected IllegalArgumentException for a value containing a control character");
+         } catch(IllegalArgumentException expected) {
+             // Rejection is the behaviour under test.
+         }
+     } finally { writer.close(); }
+ }
+
+ @SuppressWarnings("unchecked")
+ public void testMimetypes() throws IOException {
+     // Mimetype values must be accepted as-is, and a folded (CRLF + space) parameter unfolded.
+     WARCWriter writer = new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData(
+         "m","testM",1,false,Collections.EMPTY_LIST,Collections.EMPTY_LIST,generator));
+     try { // the writer used to be left open; close it even when an assertion fails
+         writer.checkHeaderLineMimetypeParameter("text/xml");
+         writer.checkHeaderLineMimetypeParameter("text/xml+rdf");
+         assertEquals("text/plain; charset=SHIFT-JIS", writer.checkHeaderLineMimetypeParameter("text/plain; charset=SHIFT-JIS"));
+         assertEquals("multipart/mixed; boundary=\"simple boundary\"", // expected first, then actual
+             writer.checkHeaderLineMimetypeParameter("multipart/mixed; \r\n boundary=\"simple boundary\""));
+     } finally { writer.close(); }
+ }
+
+ public void testWriteRecord() throws IOException {
+ File [] files = {getTmpDir()};
+
+ // Write uncompressed.
+ WARCWriter writer =
+ new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData(
+ this.getClass().getName(), "templateWR1", -1, false, Arrays.asList(files), null, generator));
+
+ writeFile(writer);
+ writer.close();
+
+ // Write compressed.
+ writer = new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData(
+ this.getClass().getName(), "templateWR2", -1, true, Arrays.asList(files), null, generator));
+
+ writeFile(writer);
+ writer.close();
+ }
+
+ private void writeFile(final WARCWriter writer)
+ throws IOException {
+ try {
+ writeWarcinfoRecord(writer);
+ writeBasicRecords(writer);
+ } finally {
+ writer.close();
+ writer.getFile().delete();
+ }
+ }
+
+ private void writeWarcinfoRecord(WARCWriter writer)
+ throws IOException {
+ WARCRecordInfo recordInfo = new WARCRecordInfo();
+ recordInfo.setType(WARCRecordType.warcinfo);
+ recordInfo.setUrl(null);
+ recordInfo.setCreate14DigitDate(ArchiveUtils.getLog14Date());
+ recordInfo.setMimetype(ANVLRecord.MIMETYPE);
+ recordInfo.setExtraHeaders(null);
+ recordInfo.setEnforceLength(true);
+
+ ANVLRecord meta = new ANVLRecord();
+ meta.addLabelValue("size", "1G");
+ meta.addLabelValue("operator", "igor");
+ byte [] bytes = meta.getUTF8Bytes();
+ recordInfo.setContentStream(new ByteArrayInputStream(bytes));
+ recordInfo.setContentLength((long) bytes.length);
+
+ final URI recordid = writer.generateRecordId(WARCWriter.TYPE, WARCRecordType.warcinfo.toString());
+ recordInfo.setRecordId(recordid);
+
+ writer.writeRecord(recordInfo);
+ }
+
+ protected void writeBasicRecords(final WARCWriter writer)
+ throws IOException {
+ WARCRecordInfo recordInfo = new WARCRecordInfo();
+ recordInfo.setType(WARCRecordType.metadata);
+ recordInfo.setUrl("http://www.archive.org/");
+ recordInfo.setCreate14DigitDate(ArchiveUtils.get14DigitDate());
+ recordInfo.setMimetype("no/type");
+ recordInfo.setEnforceLength(true);
+
+ ANVLRecord headerFields = new ANVLRecord();
+ headerFields.addLabelValue("x", "y");
+ headerFields.addLabelValue("a", "b");
+ recordInfo.setExtraHeaders(headerFields);
+
+ URI rid = (new UUIDGenerator()).getQualifiedRecordID(TYPE, WARCRecordType.metadata.toString());
+ recordInfo.setRecordId(rid);
+
+ final String content = "Any old content.";
+ for (int i = 0; i < 10; i++) {
+ String body = i + ". " + content;
+ byte [] bodyBytes = body.getBytes(UTF8Bytes.UTF8);
+ recordInfo.setContentStream(new ByteArrayInputStream(bodyBytes));
+ recordInfo.setContentLength((long)bodyBytes.length);
+ writer.writeRecord(recordInfo);
+ }
+ }
+
+ /**
+ * @return Generic HTML Content.
+ */
+ protected static String getContent() {
+ return getContent(null);
+ }
+
+ /**
+ * @param indexStr label to mention in the page, or null for the generic "Some Page".
+ * @return HTTP response head followed by HTML with the label in title and body.
+ */
+ protected static String getContent(String indexStr) {
+     String page = (indexStr != null)? "Page #" + indexStr: "Some Page";
+     return "HTTP/1.1 200 OK\r\n" +
+         "Content-Type: text/html\r\n\r\n" +
+         "<html><head><title>" + page +
+         "</title></head>" +
+         "<body>" + page +
+         "</body></html>";
+ }
+
+ /**
+ * Write random HTML Record.
+ * @param w Where to write.
+ * @param index An index to put into content.
+ * @return Length of record written.
+ * @throws IOException
+ */
+ protected int writeRandomHTTPRecord(WARCWriter w, int index)
+ throws IOException {
+ WARCRecordInfo recordInfo = new WARCRecordInfo();
+ recordInfo.setType(WARCRecordType.resource);
+ recordInfo.setCreate14DigitDate(ArchiveUtils.get14DigitDate());
+ recordInfo.setMimetype("text/html; charset=UTF-8");
+ recordInfo.setRecordId(w.generateRecordId(null));
+ recordInfo.setEnforceLength(true);
+
+ String indexStr = Integer.toString(index);
+ recordInfo.setUrl("http://www.one.net/id=" + indexStr);
+
+ byte[] record = (getContent(indexStr)).getBytes();
+ recordInfo.setContentLength((long) record.length);
+
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ baos.write(record);
+ recordInfo.setContentStream(new ByteArrayInputStream(baos.toByteArray()));
+
+ // Add named fields for ip, checksum, and relate the metadata
+ // and request to the resource field.
+ recordInfo.addExtraHeader(NAMED_FIELD_IP_LABEL, "127.0.0.1");
+
+ w.writeRecord(recordInfo);
+ return record.length;
+ }
+
+ /**
+ * Fill a WARC with HTML Records.
+ * @param baseName WARC basename.
+ * @param compress Whether to compress or not.
+ * @param maxSize Maximum WARC size.
+ * @param recordCount How many records.
+ * @return The written file.
+ * @throws IOException
+ */
+ private File writeRecords(String baseName, boolean compress,
+ int maxSize, int recordCount)
+ throws IOException {
+ cleanUpOldFiles(baseName);
+ File [] files = {getTmpDir()};
+ WARCWriter w = new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData(
+ baseName + '-' + SUFFIX, "${prefix}", maxSize, compress, Arrays.asList(files), null, generator));
+
+ assertNotNull(w);
+ for (int i = 0; i < recordCount; i++) {
+ writeRandomHTTPRecord(w, i);
+ }
+ w.close();
+ assertTrue("Doesn't exist: " + w.getFile().getAbsolutePath(),
+ w.getFile().exists());
+ return w.getFile();
+ }
+
+ /**
+ * Run validation of passed file.
+ * @param f File to validate.
+ * @param recordCount Expected count of records, or -1 to skip the count check.
+ * @throws FileNotFoundException
+ * @throws IOException
+ */
+ private void validate(File f, int recordCount)
+ throws FileNotFoundException, IOException {
+     WARCReader reader = WARCReaderFactory.get(f);
+     assertNotNull(reader);
+     List headers = null;
+     if (recordCount == -1) {
+         headers = reader.validate();
+     } else {
+         headers = reader.validate(recordCount);
+     }
+     reader.close();
+
+     // Now, run through each of the records doing absolute get going from
+     // the end to start. Reopen the WARC so no context between this test
+     // and the previous.
+     for (int i = headers.size() - 1; i >= 0; i--) {
+         reader = WARCReaderFactory.get(f);
+         ArchiveRecordHeader h = (ArchiveRecordHeader)headers.get(i);
+         ArchiveRecord r = reader.get(h.getOffset());
+         String mimeType = r.getHeader().getMimetype();
+         assertTrue("Record is bogus",
+             mimeType != null && mimeType.length() > 0);
+         reader.close();
+     }
+     // -1 means "no expected count" (see the branch above), so only compare sizes
+     // when a real count was supplied; comparing against -1 could never succeed.
+     if (recordCount != -1) { assertEquals("Metadatas not equal", recordCount, headers.size()); }
+     for (Iterator i = headers.iterator(); i.hasNext();) {
+         ArchiveRecordHeader r = (ArchiveRecordHeader)i.next();
+         assertTrue("Record is empty", r.getLength() > 0);
+     }
+ }
+
+ public void testWriteRecords() throws IOException {
+ final int recordCount = 2;
+ File f = writeRecords("writeRecords", false, DEFAULT_MAX_WARC_FILE_SIZE,
+ recordCount);
+ validate(f, recordCount + 1); // Header record.
+ }
+
+ public void testRandomAccess() throws IOException {
+ final int recordCount = 3;
+ File f = writeRecords("randomAccess", true, DEFAULT_MAX_WARC_FILE_SIZE,
+ recordCount);
+ WARCReader reader = WARCReaderFactory.get(f);
+ // Get to second record. Get its offset for later use.
+ boolean readFirst = false;
+ String url = null;
+ long offset = -1;
+ long totalRecords = 0;
+ boolean readSecond = false;
+ for (final Iterator i = reader.iterator(); i.hasNext();
+ totalRecords++) {
+ WARCRecord ar = (WARCRecord)i.next();
+ if (!readFirst) {
+ readFirst = true;
+ continue;
+ }
+ if (!readSecond) {
+ url = ar.getHeader().getUrl();
+ offset = ar.getHeader().getOffset();
+ readSecond = true;
+ }
+ }
+ reader.close();
+
+ reader = WARCReaderFactory.get(f, offset);
+ ArchiveRecord ar = reader.get();
+ assertEquals(ar.getHeader().getUrl(), url);
+ ar.close();
+ reader.close();
+
+ // Get reader again. See how iterator works with offset
+ reader = WARCReaderFactory.get(f, offset);
+ int count = 0;
+ for (final Iterator i = reader.iterator(); i.hasNext(); i.next()) {
+ count++;
+ }
+ reader.close();
+ assertEquals(totalRecords - 1, count);
+ }
+
+ public void testWriteRecordCompressed() throws IOException {
+ final int recordCount = 2;
+ File arcFile = writeRecords("writeRecordCompressed", true,
+ DEFAULT_MAX_WARC_FILE_SIZE, recordCount);
+ validate(arcFile, recordCount + 1 /*Header record*/);
+ }
+
+ protected WARCWriter createWARCWriter(String name,
+ boolean compress) {
+ File [] files = {getTmpDir()};
+ return new WARCWriter(SERIAL_NO,
+ new WARCWriterPoolSettingsData(
+ name,
+ "${prefix}-"+SUFFIX,
+ DEFAULT_MAX_WARC_FILE_SIZE,
+ compress,
+ Arrays.asList(files),
+ null,
+ generator));
+ }
+
+ protected static ByteArrayOutputStream getBaos(String str)
+ throws IOException {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ baos.write(str.getBytes());
+ return baos;
+ }
+
+ protected static void writeRecord(WARCWriter w, String url,
+ String mimetype, int len, ByteArrayOutputStream baos)
+ throws IOException {
+ WARCRecordInfo recordInfo = new WARCRecordInfo();
+ recordInfo.setType(WARCRecordType.resource);
+ recordInfo.setUrl(url);
+ recordInfo.setCreate14DigitDate(ArchiveUtils.get14DigitDate());
+ recordInfo.setMimetype(mimetype);
+ recordInfo.setRecordId(w.generateRecordId(null));
+ recordInfo.setExtraHeaders(null);
+ recordInfo.setContentStream(new ByteArrayInputStream(baos.toByteArray()));
+ recordInfo.setContentLength((long) len);
+ recordInfo.setEnforceLength(true);
+
+ w.writeRecord(recordInfo);
+ }
+
+ protected int iterateRecords(WARCReader r)
+ throws IOException {
+     int count = 0;
+     // Typed iterator: a raw Iterator's next() returns Object and would not compile below.
+     for (Iterator<ArchiveRecord> i = r.iterator(); i.hasNext(); count++) {
+         ArchiveRecord ar = i.next();
+         ar.close();
+         // Record 0 is exempt from the URL check; every later record must carry SOME_URL.
+         if (count != 0) {
+             assertEquals("Unexpected URL", SOME_URL, ar.getHeader().getUrl());
+         }
+     }
+     return count; // total number of records seen, including record 0
+ }
+
+ protected WARCWriter createWithOneRecord(String name,
+ boolean compressed)
+ throws IOException {
+ WARCWriter writer = createWARCWriter(name, compressed);
+ String content = getContent();
+ writeRecord(writer, SOME_URL, "text/html",
+ content.length(), getBaos(content));
+ return writer;
+ }
+
+ public void testSpaceInURL() throws IOException {
+ long bytesWritten = holeyUrl("testSpaceInURL", false, " ");
+ assertEquals("Unexpected successful writing occurred",0,bytesWritten);
+ }
+
+ public void testTabInURL() throws IOException {
+ long bytesWritten = holeyUrl("testTabInURL", false, "\t");
+ assertEquals("Unexpected successful writing occurred",0,bytesWritten);
+ }
+
+ protected long holeyUrl(String name, boolean compress, String urlInsert)
+ throws IOException {
+ WARCWriter writer = createWithOneRecord(name, compress);
+ // Attempt a second record whose URL embeds urlInsert; the result is how far the writer position moved.
+ long startPos = writer.getPosition();
+ String content = getContent();
+ ByteArrayOutputStream baos = getBaos(content);
+ writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html",
+ content.length(), baos);
+ long endPos = writer.getPosition();
+ writer.close();
+ return endPos-startPos;
+ }
+
+ /**
+ * Write a WARC file for other tests to use.
+ * @param arcdir Directory to write to.
+ * @param compress True if file should be compressed.
+ * @return WARC written.
+ * @throws IOException
+ */
+ public static File createWARCFile(File arcdir, boolean compress)
+ throws IOException {
+ File [] files = {arcdir};
+ WARCWriter writer =
+ new WARCWriter(SERIAL_NO,
+ new WARCWriterPoolSettingsData(
+ "",
+ "test",
+ DEFAULT_MAX_WARC_FILE_SIZE,
+ compress,
+ Arrays.asList(files),
+ null,
+ new UUIDGenerator()));
+ String content = getContent();
+ writeRecord(writer, SOME_URL, "text/html", content.length(),
+ getBaos(content));
+ writer.close();
+ return writer.getFile();
+ }
+
+// public void testSpeed() throws IOException {
+// ARCWriter writer = createArcWithOneRecord("speed", true);
+// // Add a record with a length that is too long.
+// String content = getContent();
+// final int count = 100000;
+// logger.info("Starting speed write of " + count + " records.");
+// for (int i = 0; i < count; i++) {
+// writeRecord(writer, SOME_URL, "text/html", content.length(),
+// getBaos(content));
+// }
+// writer.close();
+// logger.info("Finished speed write test.");
+// }
+
+ public void testArcRecordOffsetReads() throws Exception {
+     // Get a WARC with one record.
+     WriterPoolMember w = createWithOneRecord("testArcRecordInBufferStream", true);
+     w.close();
+     // Get reader on said WARC.
+     WARCReader r = WARCReaderFactory.get(w.getFile());
+     try { // the reader used to be left open; close it even when a read or assertion fails
+         final Iterator<ArchiveRecord> i = r.iterator(); // typed so next() needs no Object cast
+         // Skip the first record in the file.
+         ArchiveRecord ar = i.next();
+         i.hasNext();
+         // Now we're at first and only record in the WARC.
+         ar = (WARCRecord) i.next();
+         // Now try getting some random set of bytes out of it
+         // at an odd offset (used to fail because we were
+         // doing bad math to find where in buffer to read).
+         final byte[] buffer = new byte[17];
+         final int maxRead = 4;
+         int totalRead = 0;
+         while (totalRead < maxRead) {
+             totalRead += ar.read(buffer, 13 + totalRead, maxRead - totalRead);
+             assertTrue(totalRead > 0);
+         }
+     } finally { r.close(); }
+ }
+}
\ No newline at end of file
diff --git a/src/test/java/org/archive/uid/UUIDGeneratorTest.java b/src/test/java/org/archive/uid/UUIDGeneratorTest.java
new file mode 100644
index 00000000..79e98fb6
--- /dev/null
+++ b/src/test/java/org/archive/uid/UUIDGeneratorTest.java
@@ -0,0 +1,44 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.uid;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.HashMap;
+import java.util.Map;
+
+import junit.framework.TestCase;
+
+/**
+ * @author stack
+ * @version $Revision$ $Date$
+ */
+public class UUIDGeneratorTest extends TestCase {
+ public void testQualifyRecordID() throws URISyntaxException {
+     RecordIDGenerator g = new UUIDGenerator();
+     URI uri = g.getRecordID();
+     Map qualifiers = new HashMap();
+     qualifiers.put("a", "b");
+     URI nuURI = g.qualifyRecordID(uri, qualifiers);
+     assertFalse(uri.equals(nuURI)); // compare values: assertNotSame only compared references
+     qualifiers.put("c", "d");
+     nuURI = g.qualifyRecordID(nuURI, qualifiers); // qualify the already-qualified ID again
+     assertFalse(uri.equals(nuURI)); // still must differ from the unqualified ID
+ }
+}
diff --git a/src/test/java/org/archive/util/FileUtilsTest.java b/src/test/java/org/archive/util/FileUtilsTest.java
new file mode 100644
index 00000000..19271435
--- /dev/null
+++ b/src/test/java/org/archive/util/FileUtilsTest.java
@@ -0,0 +1,271 @@
+/*
+ * This file is part of the Heritrix web crawler (crawler.archive.org).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.util;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.math.LongRange;
+
+
+/**
+ * FileUtils tests.
+ *
+ * @contributor stack
+ * @contributor gojomo
+ * @version $Date$, $Revision$
+ */
+public class FileUtilsTest extends TmpDirTestCase {
+ private String srcDirName = FileUtilsTest.class.getName() + ".srcdir";
+ private File srcDirFile = null;
+ private String tgtDirName = FileUtilsTest.class.getName() + ".tgtdir";
+ private File tgtDirFile = null;
+
+ protected File zeroLengthLinesUnix;
+ protected File zeroLengthLinesWindows;
+
+ protected File smallLinesUnix;
+ protected File smallLinesWindows;
+ protected File largeLinesUnix;
+ protected File largeLinesWindows;
+ protected File nakedLastLineUnix;
+ protected File nakedLastLineWindows;
+
+
+ protected void setUp() throws Exception {
+     super.setUp();
+     this.srcDirFile = new File(getTmpDir(), srcDirName);
+     FileUtils.ensureWriteableDirectory(srcDirFile);
+     this.tgtDirFile = new File(getTmpDir(), tgtDirName);
+     FileUtils.ensureWriteableDirectory(tgtDirFile);
+     addFiles();
+     // Line fixtures; arguments are (name, minLineSize, maxLineSize, lineCount, lineEnding).
+     zeroLengthLinesUnix = setUpLinesFile("zeroLengthLinesUnix",0,0,400,IOUtils.LINE_SEPARATOR_UNIX);
+     zeroLengthLinesWindows = setUpLinesFile("zeroLengthLinesWindows",0,0,400,IOUtils.LINE_SEPARATOR_WINDOWS);
+     // The name matters: it becomes the temp-file prefix that failure messages print via file.getName().
+     smallLinesUnix = setUpLinesFile("smallLinesUnix", 0, 25, 400, IOUtils.LINE_SEPARATOR_UNIX);
+     smallLinesWindows = setUpLinesFile("smallLinesWindows", 0, 25, 400, IOUtils.LINE_SEPARATOR_WINDOWS);
+     largeLinesUnix = setUpLinesFile("largeLinesUnix", 128, 256, 5, IOUtils.LINE_SEPARATOR_UNIX);
+     largeLinesWindows = setUpLinesFile("largeLinesWindows", 128, 256, 4096, IOUtils.LINE_SEPARATOR_WINDOWS);
+     // NOTE(review): writeStringToFile(File, String) presumably replaces the content rather than appending - confirm intent.
+     nakedLastLineUnix = setUpLinesFile("nakedLastLineUnix", 0, 50, 401, IOUtils.LINE_SEPARATOR_UNIX);
+     org.apache.commons.io.FileUtils.writeStringToFile(nakedLastLineUnix,"a");
+     nakedLastLineWindows = setUpLinesFile("nakedLastLineWindows", 0, 50, 401, IOUtils.LINE_SEPARATOR_WINDOWS);
+     org.apache.commons.io.FileUtils.writeStringToFile(nakedLastLineWindows,"a");
+ }
+
+ private void addFiles() throws IOException {
+ addFiles(3, this.getName());
+ }
+
+ private void addFiles(final int howMany, final String baseName)
+ throws IOException {
+ for (int i = 0; i < howMany; i++) {
+ File.createTempFile(baseName, null, this.srcDirFile);
+ }
+ }
+
+ private File setUpLinesFile(String name, int minLineSize, int maxLineSize, int lineCount, String lineEnding) throws IOException {
+ List lines = new LinkedList();
+ StringBuilder sb = new StringBuilder(maxLineSize);
+ for(int i = 0; i< lineSize; j++) {
+ sb.append("-");
+ }
+ lines.add(sb.toString());
+ }
+ File file = File.createTempFile(name, null);
+ org.apache.commons.io.FileUtils.writeLines(file, lines, lineEnding);
+ return file;
+
+ }
+
+ protected void tearDown() throws Exception {
+ super.tearDown();
+ org.apache.commons.io.FileUtils.deleteQuietly(this.srcDirFile);
+ org.apache.commons.io.FileUtils.deleteQuietly(this.tgtDirFile);
+ org.apache.commons.io.FileUtils.deleteQuietly(zeroLengthLinesUnix);
+ org.apache.commons.io.FileUtils.deleteQuietly(zeroLengthLinesWindows);
+ org.apache.commons.io.FileUtils.deleteQuietly(smallLinesUnix);
+ org.apache.commons.io.FileUtils.deleteQuietly(smallLinesWindows);
+ org.apache.commons.io.FileUtils.deleteQuietly(largeLinesUnix);
+ org.apache.commons.io.FileUtils.deleteQuietly(largeLinesWindows);
+ org.apache.commons.io.FileUtils.deleteQuietly(nakedLastLineUnix);
+ org.apache.commons.io.FileUtils.deleteQuietly(nakedLastLineWindows);
+
+ }
+
+ public void testCopyFile() {
+ // Test exception copying nonexistent file.
+ File [] srcFiles = this.srcDirFile.listFiles();
+ srcFiles[0].delete();
+ IOException e = null;
+ try {
+ FileUtils.copyFile(srcFiles[0],
+ new File(this.tgtDirFile, srcFiles[0].getName()));
+ } catch (IOException ioe) {
+ e = ioe;
+ }
+ assertNotNull("Didn't get expected IOE", e);
+ }
+
+ public void testTailLinesZeroLengthUnix() throws IOException {
+ verifyTailLines(zeroLengthLinesUnix);
+ }
+
+ public void testTailLinesZeroLengthWindows() throws IOException {
+ verifyTailLines(zeroLengthLinesWindows);
+ }
+
+ public void testTailLinesSmallUnix() throws IOException {
+ verifyTailLines(smallLinesUnix);
+ }
+
+ public void testTailLinesLargeUnix() throws IOException {
+ verifyTailLines(largeLinesUnix);
+ }
+
+ public void testTailLinesSmallWindows() throws IOException {
+ verifyTailLines(smallLinesWindows);
+ }
+
+ public void testTailLinesLargeWindows() throws IOException {
+ verifyTailLines(largeLinesWindows);
+ }
+
+ public void testTailLinesNakedUnix() throws IOException {
+ verifyTailLines(nakedLastLineUnix);
+ }
+
+ public void testTailLinesNakedWindows() throws IOException {
+ verifyTailLines(nakedLastLineWindows);
+ }
+
+ @SuppressWarnings("unchecked")
+ private void verifyTailLines(File file) throws IOException {
+ List lines = org.apache.commons.io.FileUtils.readLines(file);
+ verifyTailLines(file, lines, 1, 80);
+ verifyTailLines(file, lines, 5, 80);
+ verifyTailLines(file, lines, 10, 80);
+ verifyTailLines(file, lines, 20, 80);
+ verifyTailLines(file, lines, 100, 80);
+ verifyTailLines(file, lines, 1, 1);
+ verifyTailLines(file, lines, 5, 1);
+ verifyTailLines(file, lines, 10, 1);
+ verifyTailLines(file, lines, 20, 1);
+ verifyTailLines(file, lines, 100, 1);
+ }
+
+
+ private void verifyTailLines(File file, List lines, int count, int estimate) throws IOException {
+ List testLines;
+ testLines = getTestTailLines(file,count,estimate);
+ assertEquals("line counts not equal:"+file.getName()+" "+count+" "+estimate,lines.size(),testLines.size());
+ assertEquals("lines not equal: "+file.getName()+" "+count+" "+estimate,lines,testLines);
+ }
+
+ private List getTestTailLines(File file, int count, int estimate) throws IOException {
+ long pos = -1;
+ List testLines = new LinkedList();
+ do {
+ List returnedLines = new LinkedList();
+ LongRange range = FileUtils.pagedLines(file,pos,-count,returnedLines,estimate);
+ Collections.reverse(returnedLines);
+ testLines.addAll(returnedLines);
+ pos = range.getMinimumLong()-1;
+ } while (pos>=0);
+ Collections.reverse(testLines);
+ return testLines;
+ }
+
+ public void testHeadLinesZeroLengthUnix() throws IOException {
+ verifyHeadLines(zeroLengthLinesUnix);
+ }
+
+ public void testHeadLinesZeroLengthWindows() throws IOException {
+ verifyHeadLines(zeroLengthLinesWindows);
+ }
+
+ public void testHeadLinesSmallUnix() throws IOException {
+ verifyHeadLines(smallLinesUnix);
+ }
+
+ public void testHeadLinesLargeUnix() throws IOException {
+ verifyHeadLines(largeLinesUnix);
+ }
+
+ public void testHeadLinesSmallWindows() throws IOException {
+ verifyHeadLines(smallLinesWindows);
+ }
+
+ public void testHeadLinesLargeWindows() throws IOException {
+ verifyHeadLines(largeLinesWindows);
+ }
+
+ public void testHeadLinesNakedUnix() throws IOException {
+ verifyHeadLines(nakedLastLineUnix);
+ }
+
+ public void testHeadLinesNakedWindows() throws IOException {
+ verifyHeadLines(nakedLastLineWindows);
+ }
+
+
+ @SuppressWarnings("unchecked")
+ private void verifyHeadLines(File file) throws IOException {
+ List lines = org.apache.commons.io.FileUtils.readLines(file);
+ verifyHeadLines(file, lines, 1, 80);
+ verifyHeadLines(file, lines, 5, 80);
+ verifyHeadLines(file, lines, 10, 80);
+ verifyHeadLines(file, lines, 20, 80);
+ verifyHeadLines(file, lines, 100, 80);
+ verifyHeadLines(file, lines, 1, 1);
+ verifyHeadLines(file, lines, 5, 1);
+ verifyHeadLines(file, lines, 10, 1);
+ verifyHeadLines(file, lines, 20, 1);
+ verifyHeadLines(file, lines, 100, 1);
+ }
+
+
+ private void verifyHeadLines(File file, List lines, int count, int estimate) throws IOException {
+ List testLines;
+ testLines = getTestHeadLines(file,count,estimate);
+ assertEquals("line counts not equal:"+file.getName()+" "+count+" "+estimate,lines.size(),testLines.size());
+ assertEquals("lines not equal: "+file.getName()+" "+count+" "+estimate,lines,testLines);
+ }
+
+ private List getTestHeadLines(File file, int count, int estimate) throws IOException {
+ long pos = 0;
+ List testLines = new LinkedList();
+ do {
+ LongRange range = FileUtils.pagedLines(file,pos,count,testLines,estimate);
+ pos = range.getMaximumLong();
+ } while (pos m = am.asMap();
+ logger.fine(m.toString());
+ }
+
+ public void testEmptyRecord() throws Exception {
+     byte [] b = ANVLRecord.EMPTY_ANVL_RECORD.getUTF8Bytes();
+     // Expected value goes first so a failure reports "expected 2 but was N" the right way round.
+     assertEquals(2, b.length); assertEquals('\r', b[0]);
+     assertEquals('\n', b[1]); // i.e. the empty record serialises to a bare CRLF
+ }
+
+ public void testFolding() throws Exception {
+ ANVLRecord am = new ANVLRecord();
+ Exception e = null;
+ try {
+ am.addLabel("Label with \n in it");
+ } catch (IllegalArgumentException iae) {
+ e = iae;
+ }
+ assertTrue(e != null && e instanceof IllegalArgumentException);
+ am.addLabelValue("label", "value with \n in it");
+ }
+
+ public void testParse() throws UnsupportedEncodingException, IOException {
+ String record = " a: b\r\n#c#\r\nc:d\r\n \t\t\r\t\n\te" +
+ "\r\nx:\r\n # z\r\n\r\n";
+ ANVLRecord r = ANVLRecord.load(new ByteArrayInputStream(
+ record.getBytes("ISO-8859-1")));
+ logger.fine(r.toString());
+ assertEquals(r.get(0).toString(), "a: b");
+ record = " a: b\r\n\r\nsdfsdsdfds";
+ r = ANVLRecord.load(new ByteArrayInputStream(
+ record.getBytes("ISO-8859-1")));
+ logger.fine(r.toString());
+ record = "x:\r\n # z\r\ny:\r\n\r\n";
+ r = ANVLRecord.load(new ByteArrayInputStream(
+ record.getBytes("ISO-8859-1")));
+ logger.fine(r.toString());
+ assertEquals(r.get(0).toString(), "x:");
+ }
+
+ public void testExampleParse()
+ throws UnsupportedEncodingException, IOException {
+ final String sample = "entry:\t\t\r\n# first ###draft\r\n" +
+ "who:\tGilbert, W.S. | Sullivan, Arthur\r\n" +
+ "what:\tThe Yeoman of\r\n" +
+ "\t\tthe Guard\r\n" +
+ "when/created:\t 1888\r\n\r\n";
+ ANVLRecord r = ANVLRecord.load(new ByteArrayInputStream(
+ sample.getBytes("ISO-8859-1")));
+ logger.fine(r.toString());
+ }
+
+ public void testPoundLabel()
+ throws UnsupportedEncodingException, IOException {
+ final String sample = "ent#ry:\t\t\r\n# first ###draft\r\n" +
+ "who:\tGilbert, W.S. | Sullivan, Arthur\r\n" +
+ "what:\tThe Yeoman of\r\n" +
+ "\t\tthe Guard\r\n" +
+ "when/created:\t 1888\r\n\r\n";
+ ANVLRecord r = ANVLRecord.load(sample);
+ logger.fine(r.toString());
+ }
+
+ public void testNewlineLabel()
+ throws UnsupportedEncodingException, IOException {
+ final String sample = "ent\nry:\t\t\r\n# first ###draft\r\n" +
+ "who:\tGilbert, W.S. | Sullivan, Arthur\r\n" +
+ "what:\tThe Yeoman of\r\n" +
+ "\t\tthe Guard\r\n" +
+ "when/created:\t 1888\r\n\r\n";
+ IllegalArgumentException iae = null;
+ try {
+ ANVLRecord.load(sample);
+ } catch(IllegalArgumentException e) {
+ iae = e;
+ }
+ assertTrue(iae != null);
+ }
+}
From b04f5d82604245461b6a802f1962d86e3d899e98 Mon Sep 17 00:00:00 2001
From: Lauren Ko
Date: Thu, 9 Mar 2017 11:32:03 -0600
Subject: [PATCH 030/216] Updating CHANGES.md
---
CHANGES.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/CHANGES.md b/CHANGES.md
index fee29e16..767881ec 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,6 @@
1.1.8
-----
+* [Move unit tests over from heritrix3 to webarchive-commons](https://github.com/iipc/webarchive-commons/issues/25)
* [Strip empty port via URLParser](https://github.com/iipc/webarchive-commons/pull/69/)
* [Use CharsetDetector to guess encoding of HTML documents](https://github.com/iipc/webarchive-commons/pull/68/)
* [Fix last header was lost if LF LF](https://github.com/iipc/webarchive-commons/pull/65/)
From b655796770eb967c931d656b1c80d4967f91e7fc Mon Sep 17 00:00:00 2001
From: Lauren Ko
Date: Tue, 21 Mar 2017 14:20:54 -0500
Subject: [PATCH 031/216] Updating change log.
---
CHANGES.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/CHANGES.md b/CHANGES.md
index 767881ec..ccdc1ce7 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,6 @@
1.1.8
-----
+* [Improve HTML link extraction](https://github.com/iipc/webarchive-commons/pull/72)
* [Move unit tests over from heritrix3 to webarchive-commons](https://github.com/iipc/webarchive-commons/issues/25)
* [Strip empty port via URLParser](https://github.com/iipc/webarchive-commons/pull/69/)
* [Use CharsetDetector to guess encoding of HTML documents](https://github.com/iipc/webarchive-commons/pull/68/)
From aee6ff55bfcaa5a9e15092f8c3b1e40ec9faaf87 Mon Sep 17 00:00:00 2001
From: John Erik Halse
Date: Tue, 2 May 2017 12:25:28 +0200
Subject: [PATCH 032/216] [maven-release-plugin] prepare release
webarchive-commons-1.1.8
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 24780063..63909b90 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.8-SNAPSHOT
+ 1.1.8jarwebarchive-commons
From dfe1f62e416f6a881fe15a2544449fff44dd1e51 Mon Sep 17 00:00:00 2001
From: John Erik Halse
Date: Tue, 2 May 2017 12:25:35 +0200
Subject: [PATCH 033/216] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 63909b90..23953c06 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.8
+ 1.1.9-SNAPSHOTjarwebarchive-commons
From cf34a3e13c09cfa4a1412492cfcf3503df698931 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Fri, 28 Apr 2017 22:41:56 +0200
Subject: [PATCH 034/216] Do not add value of preceding HTTP header field if
there is no value (or only white space)
---
.../archive/format/http/HttpHeaderParser.java | 4 ++--
.../format/http/HttpResponseParserTest.java | 24 +++++++++++++++++++
2 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/format/http/HttpHeaderParser.java b/src/main/java/org/archive/format/http/HttpHeaderParser.java
index d63ec405..bee3c28b 100755
--- a/src/main/java/org/archive/format/http/HttpHeaderParser.java
+++ b/src/main/java/org/archive/format/http/HttpHeaderParser.java
@@ -301,8 +301,9 @@ public ParseState handleByte(byte b, HttpHeaderParser parser) throws HttpParseEx
if(isLWSP(b)) {
return parser.postColonState;
}
+ // reset previous value also in case the header value is empty
+ parser.setValueStartIdx();
if(b == CR) {
- // TODO: THINK more...
parser.valuePreCRState = parser.postColonState;
return parser.valuePostCRState;
}
@@ -310,7 +311,6 @@ public ParseState handleByte(byte b, HttpHeaderParser parser) throws HttpParseEx
// TODO: this is lax, is LFLF an OK terminator?
return parser.lineStartState;
}
- parser.setValueStartIdx();
parser.addValueByte(b);
return parser.valueState;
}
diff --git a/src/test/java/org/archive/format/http/HttpResponseParserTest.java b/src/test/java/org/archive/format/http/HttpResponseParserTest.java
index c0d13230..ea076a69 100644
--- a/src/test/java/org/archive/format/http/HttpResponseParserTest.java
+++ b/src/test/java/org/archive/format/http/HttpResponseParserTest.java
@@ -57,4 +57,28 @@ public void testParseWithLf() throws IOException {
}
+ public void testParseEmptyHeaderField() throws IOException {
+
+ HttpResponseParser parser = new HttpResponseParser();
+ String message = "200 OK\r\nContent-Type: text/plain\r\nServer: \r\n\r\nHi there";
+ try {
+ HttpResponse response =
+ parser.parse(new ByteArrayInputStream(message.getBytes(IAUtils.UTF8)));
+ assertNotNull(response);
+ HttpHeaders headers = response.getHeaders();
+ assertNotNull(headers);
+ assertEquals(2, headers.size());
+ HttpHeader header = headers.get(1);
+ assertEquals("Server",header.getName());
+ System.err.println(header.getValue());
+ assertFalse("text/plain".equals(header.getValue()));
+ TestUtils.assertStreamEquals(response, "Hi there".getBytes(IAUtils.UTF8));
+
+ } catch (HttpParseException e) {
+ e.printStackTrace();
+ fail();
+ }
+
+ }
+
}
From bd08143577ea35cb48047a08b2bb67e806992cc2 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 29 Sep 2016 11:44:18 +0200
Subject: [PATCH 035/216] Extract also `property` attributes of HTML meta
elements, this fixes #67
---
.../java/org/archive/resource/html/ExtractingParseObserver.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index 826851e0..52989455 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -406,7 +406,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
private static class MetaTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
- ArrayList l = getAttrList(node,"name","rel","content","http-equiv");
+ ArrayList l = getAttrList(node,"name","rel","content","http-equiv","property");
if(l != null) {
data.addMeta(l);
}
From 4077670acca3f0d2958d926692cdb3a6b29428ca Mon Sep 17 00:00:00 2001
From: Lauren Ko
Date: Tue, 2 May 2017 15:15:06 -0500
Subject: [PATCH 036/216] Fix HTTP-Response-Metadata for wget WARCs. Changes
came from
https://github.com/commoncrawl/ia-web-commons/commit/58e85a60d75707da55ed499f836e57d49347484a
---
.../org/archive/extract/ExtractingResourceFactoryMapper.java | 5 ++++-
src/main/java/org/archive/format/warc/WARCConstants.java | 4 +++-
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java
index ad10be40..0afe16fb 100644
--- a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java
+++ b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java
@@ -153,7 +153,10 @@ private boolean isWARCInfoResource(MetaData envelope) {
private boolean isHTTPResponseWARCResource(MetaData envelope) {
return childFieldEquals(envelope,WARC_HEADER_METADATA,
WARCConstants.CONTENT_TYPE,
- WARCConstants.HTTP_RESPONSE_MIMETYPE);
+ WARCConstants.HTTP_RESPONSE_MIMETYPE)
+ || childFieldEquals(envelope,WARC_HEADER_METADATA,
+ WARCConstants.CONTENT_TYPE,
+ WARCConstants.HTTP_RESPONSE_MIMETYPE_NS);
}
private boolean isWARCJSONResource(MetaData envelope) {
return childFieldEquals(envelope,WARC_HEADER_METADATA,
diff --git a/src/main/java/org/archive/format/warc/WARCConstants.java b/src/main/java/org/archive/format/warc/WARCConstants.java
index 93a81f96..504dc380 100644
--- a/src/main/java/org/archive/format/warc/WARCConstants.java
+++ b/src/main/java/org/archive/format/warc/WARCConstants.java
@@ -209,7 +209,9 @@ enum WARCRecordType {
"application/http; msgtype=request";
public static final String HTTP_RESPONSE_MIMETYPE =
"application/http; msgtype=response";
-
+ public static final String HTTP_RESPONSE_MIMETYPE_NS =
+ "application/http;msgtype=response"; // wget does this
+
public static final String FTP_CONTROL_CONVERSATION_MIMETYPE =
"text/x-ftp-control-conversation";
From 3bba7e489b7d946eea83344e2150faebe0b35ed2 Mon Sep 17 00:00:00 2001
From: Lauren Ko
Date: Tue, 2 May 2017 15:41:23 -0500
Subject: [PATCH 037/216] Update with fixes for 1.1.9
---
CHANGES.md | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/CHANGES.md b/CHANGES.md
index ccdc1ce7..1ba5c1de 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,9 @@
+1.1.9
+-----
+* [Extract `property` attributes of HTML meta elements](https://github.com/iipc/webarchive-commons/pull/75)
+* [Do not add value of preceding HTTP header field if there is no value](https://github.com/iipc/webarchive-commons/pull/74)
+* [Fix WAT records corresponding to response records of Wget generated WARCs](https://github.com/iipc/webarchive-commons/pull/74)
+
1.1.8
-----
* [Improve HTML link extraction](https://github.com/iipc/webarchive-commons/pull/72)
From 4101f7e39cbdcc508a936faf8b519e68258b9639 Mon Sep 17 00:00:00 2001
From: Naomi Dushay
Date: Tue, 8 Aug 2017 16:08:43 -0700
Subject: [PATCH 038/216] use commons-collections v3.2.2 to avoid v3.2.1
vulnerability
---
pom.xml | 29 +++++++++++++++++++++--------
1 file changed, 21 insertions(+), 8 deletions(-)
diff --git a/pom.xml b/pom.xml
index 23953c06..8373cdad 100644
--- a/pom.xml
+++ b/pom.xml
@@ -72,7 +72,7 @@
guava17.0
-
+
org.jsonjson
@@ -89,12 +89,12 @@
juniversalchardet1.0.3
-
+
commons-httpclientcommons-httpclient3.1
-
+
org.apache.hadoop
@@ -128,12 +128,12 @@
tomcatjasper-compiler
-
+
hsqldbhsqldb
-
-
+
+
@@ -160,7 +160,7 @@
libidn1.15
-
+ it.unimi.dsidsiutils2.0.12
@@ -170,13 +170,26 @@
ch.qos.logbacklogback-classic
+
+
+ commons-collections
+ commons-collections
+
+
+
+
+ commons-collections
+ commons-collections
+ 3.2.2
+
+
org.apache.httpcomponentshttpcore4.3
-
+ joda-timejoda-time
From 988bec707c27a01333becfc3bd502af4441ea1e1 Mon Sep 17 00:00:00 2001
From: Lauren Ko
Date: Wed, 9 Aug 2017 10:57:28 -0500
Subject: [PATCH 039/216] Update CHANGES.md for PR 77
---
CHANGES.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/CHANGES.md b/CHANGES.md
index 1ba5c1de..dcb598d9 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,6 @@
1.1.9
-----
+* [Use commons-collections v3.2.2 to avoid v3.2.1 vulnerability](https://github.com/iipc/webarchive-commons/pull/77)
* [Extract `property` attributes of HTML meta elements](https://github.com/iipc/webarchive-commons/pull/75)
* [Do not add value of preceding HTTP header field if there is no value](https://github.com/iipc/webarchive-commons/pull/74)
* [Fix WAT records corresponding to response records of Wget generated WARCs](https://github.com/iipc/webarchive-commons/pull/74)
From 2e8cdea3d245c11e1ea3a2a6153c0038479aef12 Mon Sep 17 00:00:00 2001
From: nruest
Date: Tue, 7 May 2019 13:23:28 -0400
Subject: [PATCH 040/216] [maven-release-plugin] prepare release
webarchive-commons-1.1.9
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 8373cdad..833f42c3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.9-SNAPSHOT
+ 1.1.9jarwebarchive-commons
From da029db2ba89205b93a5291ed18b9c69155271bb Mon Sep 17 00:00:00 2001
From: nruest
Date: Tue, 7 May 2019 13:23:34 -0400
Subject: [PATCH 041/216] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 833f42c3..1cbeb99a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.9
+ 1.1.10-SNAPSHOTjarwebarchive-commons
From 723b18a35f8be786cb073282b5ea88b5d8c643ce Mon Sep 17 00:00:00 2001
From: nruest
Date: Tue, 7 May 2019 13:56:18 -0400
Subject: [PATCH 042/216] Update TravisCI config; resolves #82.
- Test Oracle Java 8
- Test OpenJDK Java 8
- Use trusty
- Require sudo for OpenJDK7
- Remove Oracle Java 7 (it's gone!)
- Remove mvn site from the build process since there is no javadoc site
(at least that I can tell)
---
.travis.yml | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index 0dfd3f7f..54daf83b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,13 @@
+dist: trusty
language: java
+# sudo required for OpenJDK7 support per:
+# https://github.com/travis-ci/travis-ci/issues/7884#issuecomment-309689557
+sudo: required
jdk:
- - oraclejdk7
+ - openjdk7
+ - oraclejdk8
+ - openjdk8
before_install:
- "git clone https://github.com/iipc/travis.git target/travis"
@@ -11,8 +17,8 @@ before_script:
- "export MAVEN_OPTS=-Xmx512m"
- "ulimit -u 2048"
-script:
- - "target/travis/deploy-if.sh"
+script:
+ - mvn install -B -V
# whitelist in the master branch only
branches:
@@ -23,4 +29,3 @@ env:
global:
- secure: "qDKjVdoe4Qcz4WfXiQydU7tyl51T62FUJrjqu4FUPBcgeQhFQiggwhpaE6xCOzOpxbsuBi2R1c8gMQf5esE5iDL5jZMu+kz++dYbuzMTd13ttvZWMW5wRPH0H8iHk609FP/RDtVKKBr7WO0JvvIAZEhWNHZrLXBrrKgdTey171g="
- secure: "FXGBKJNP9X7ePJfS4eYTZtoFo4RT1sxor34XxncSJr7uV6ggtZb4B4WNd16IlLcDk6E32sx8YoWdltaOGwQ5Vg/kux5Ko/wKZCoccS018Ln1bRT86dD1KoPY34rGoNJVQxe7J/1MPqpBKwmi2XCKfzpsEh3W7bbIqg8w9MEOOZA="
-
From 79aed910b44510294367a4acf4f3e6376b1c62c0 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 23 Aug 2017 17:04:52 +0200
Subject: [PATCH 043/216] ExtractingParseObserver: get links from onClick
attributes - extract links from JavaScript code snippets in onClick
attributes of INPUT and DIV elements
---
.../html/ExtractingParseObserver.java | 40 +++++++++++++++++-
.../html/ExtractingParseObserverTest.java | 10 +++++
.../resource/html/link-extraction-test.warc | 42 +++++++++++++++++++
3 files changed, 91 insertions(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index 52989455..e4fa83c7 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -39,6 +39,15 @@ public class ExtractingParseObserver implements ParseObserver {
protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString);
+ protected static String jsOnClickUrl1PatString =
+ "(?i)^(?:javascript:)?(?:(?:window|top|document|self|parent)\\.)?location(?:\\.href)?\\s*=\\s*('|')([^'\"]{3,256})\\1$";
+ protected static String jsOnClickUrl2PatString =
+ "(?i)^(?:javascript:)?(?:window|parent)\\.open\\((['\"]|')([^\"']{3,256}?)\\1[,)]";
+ protected static Pattern[] jsOnClickUrlPatterns = {
+ Pattern.compile(jsOnClickUrl1PatString),
+ Pattern.compile(jsOnClickUrl2PatString)
+ };
+
private final static int MAX_TEXT_LEN = 100;
private static final String PATH = "path";
@@ -51,6 +60,7 @@ public class ExtractingParseObserver implements ParseObserver {
extractors.put("APPLET", new AppletTagExtractor());
extractors.put("AREA", new AreaTagExtractor());
extractors.put("BASE", new BaseTagExtractor());
+ extractors.put("DIV", new DivTagExtractor());
extractors.put("EMBED", new EmbedTagExtractor());
extractors.put("FORM", new FormTagExtractor());
extractors.put("FRAME", new FrameTagExtractor());
@@ -268,7 +278,20 @@ private static void addHrefWithAttrs(HTMLMetaData data, TagNode node,
if(l != null) {
data.addHref(l);
}
- }
+ }
+
+ private static void addHrefsOnclick(HTMLMetaData data, TagNode node) {
+ String onclick = node.getAttribute("onclick");
+ if (onclick != null) {
+ String path = makePath(node.getTagName(), "onclick");
+ for (Pattern pattern : jsOnClickUrlPatterns) {
+ String url = patternJSExtract(pattern, onclick);
+ if (url != null) {
+ data.addHref(PATH, path, "url", url);
+ }
+ }
+ }
+ }
private interface TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs);
@@ -330,6 +353,12 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
+ private static class DivTagExtractor implements TagExtractor {
+ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+ addHrefsOnclick(data,node);
+ }
+ }
+
private static class EmbedTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src");
@@ -386,6 +415,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
private static class InputTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src","formaction");
+ addHrefsOnclick(data,node);
}
}
@@ -450,4 +480,12 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten
}
}
}
+
+ private static String patternJSExtract(Pattern pattern, String content) {
+ Matcher m = pattern.matcher(content);
+ if (m.find()) {
+ return m.group(2);
+ }
+ return null;
+ }
}
diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
index 8f690a06..4828ad64 100644
--- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
+++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -263,6 +263,16 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
{"http://www.your-domain.com/your-page.html", "DIV@/data-href"}
};
checkLinks(extractor.getNext(), fbSocialLinks);
+ String[][] onClickLinks = {
+ {"webpage.html", "DIV@/onclick"},
+ {"index.html", "INPUT@/onclick"},
+ {"http://www.x.com/", "INPUT@/onclick"},
+ {"button-child.php", "INPUT@/onclick"},
+ {"http://example.com/", "INPUT@/onclick"},
+ {"http://example.com/location/href/1.html", "INPUT@/onclick"},
+ {"http://example.com/location/href/2.html", "INPUT@/onclick"}
+ };
+ checkLinks(extractor.getNext(), onClickLinks);
}
}
diff --git a/src/test/resources/org/archive/resource/html/link-extraction-test.warc b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
index ab0e54c8..1a30598e 100644
--- a/src/test/resources/org/archive/resource/html/link-extraction-test.warc
+++ b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
@@ -318,3 +318,45 @@ Content-Type: text/html
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2017-08-23T13:54:59Z
+Content-Type: application/http;msgtype=response
+Content-Length: 1279
+
+HTTP/1.1 200 OK
+Date: Wed, 23 Aug 2017 13:54:59 GMT
+Server: Apache/2.4.18 (Ubuntu)
+Last-Modified: Wed, 23 Aug 2017 13:54:03 GMT
+ETag: "3ca-5576c0b718ab3"
+Accept-Ranges: bytes
+Content-Length: 971
+Vary: Accept-Encoding
+Keep-Alive: timeout=5, max=100
+Connection: Keep-Alive
+Content-Type: text/html
+
+
+
+Test Extraction of URLs from INPUT onClick Attributes
+
+
+
+
+
Click to load webpage
+
+
+
+
+
+
+
+
From 26b1e7af27abec102ab36faf6a786dfedf9436fd Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 23 Aug 2017 14:48:05 +0200
Subject: [PATCH 044/216] ExtractingParseObserver: extract rel, hreflang and
type attributes - add "rel" attribute to A and AREA links - add attributes
"hreflang" and "type" (MIME type) to A@/href links
---
.../html/ExtractingParseObserver.java | 19 +++++++++++++++++--
1 file changed, 17 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index 52989455..a487fd34 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -284,7 +284,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
l.add(makePath("A","href"));
l.add("url");
l.add(url);
- for(String a : new String[] {"target","alt","title"}) {
+ for(String a : new String[] {"target","alt","title","rel","hreflang","type"}) {
String v = node.getAttribute(a);
if(v != null) {
l.add(a);
@@ -311,7 +311,22 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
private static class AreaTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
- addBasicHrefs(data,node,"href");
+ String url = node.getAttribute("href");
+ if(url != null) {
+ ArrayList l = new ArrayList();
+ l.add(PATH);
+ l.add(makePath("AREA","href"));
+ l.add("url");
+ l.add(url);
+ for(String a : new String[] {"rel"}) {
+ String v = node.getAttribute(a);
+ if(v != null) {
+ l.add(a);
+ l.add(v);
+ }
+ }
+ data.addHref(l);
+ }
}
}
From a2cc42cac2777d06ab40e09811cdc883773775b9 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 11 Jun 2020 14:24:03 +0200
Subject: [PATCH 045/216] WAT extractor: do not fail on missing WARC-Filename
in warcinfo record, fixes #88 - do not throw IOException if there is no
WARC-Filename in warcinfo record - write metadata record (corresponding to
warcinfo) without WARC-Target-URI
---
src/main/java/org/archive/extract/WATExtractorOutput.java | 2 +-
src/main/java/org/archive/format/warc/WARCRecordWriter.java | 5 ++++-
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java
index 3bcfa924..4b5f72ed 100644
--- a/src/main/java/org/archive/extract/WATExtractorOutput.java
+++ b/src/main/java/org/archive/extract/WATExtractorOutput.java
@@ -151,7 +151,7 @@ private void writeWARC(OutputStream recOut, MetaData md) throws IOException {
String warcType = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Type");
String targetURI;
if(warcType.equals("warcinfo")) {
- targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Filename");
+ targetURI = JSONUtils.extractSingle(md, "Envelope.WARC-Header-Metadata.WARC-Filename");
} else {
targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI");
}
diff --git a/src/main/java/org/archive/format/warc/WARCRecordWriter.java b/src/main/java/org/archive/format/warc/WARCRecordWriter.java
index 0aab83b7..3278b289 100644
--- a/src/main/java/org/archive/format/warc/WARCRecordWriter.java
+++ b/src/main/java/org/archive/format/warc/WARCRecordWriter.java
@@ -88,7 +88,10 @@ public void writeJSONMetadataRecord( OutputStream out,
{
HttpHeaders headers = new HttpHeaders();
headers.add(HEADER_KEY_TYPE, WARCRecordType.metadata.name());
- headers.add(HEADER_KEY_URI, targetURI);
+ if (targetURI != null) {
+ // WARC-Target-URI is optional in metadata records
+ headers.add(HEADER_KEY_URI, targetURI);
+ }
headers.add(HEADER_KEY_DATE, DateUtils.getLog14Date(originalDate));
headers.add(HEADER_KEY_ID, makeRecordId());
headers.add(HEADER_KEY_REFERS_TO, origRecordId);
From 04e10397b9137a36812c17276826bc60d1a37ede Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Mon, 15 Jun 2020 13:29:25 +0200
Subject: [PATCH 046/216] Update change log to include #85, #86 and #89
---
CHANGES.md | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/CHANGES.md b/CHANGES.md
index dcb598d9..bf985ada 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,10 @@
+1.1.10
+------
+* [WAT extractor: do not fail on missing WARC-Filename in warcinfo record](https://github.com/iipc/webarchive-commons/pull/89)
+* [ExtractingParseObserver: extract rel, hreflang and type attributes](https://github.com/iipc/webarchive-commons/pull/86)
+* [ExtractingParseObserver: extract links from onClick attributes](https://github.com/iipc/webarchive-commons/pull/85)
+* [Update TravisCI config](https://github.com/iipc/webarchive-commons/pull/83)
+
1.1.9
-----
* [Use commons-collections v3.2.2 to avoid v3.2.1 vulnerability](https://github.com/iipc/webarchive-commons/pull/77)
From 9041ff4e96f6554658742affe490223dc0241d06 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 13 Oct 2020 01:28:48 +0000
Subject: [PATCH 047/216] Bump junit from 3.8.1 to 4.13.1
Bumps [junit](https://github.com/junit-team/junit4) from 3.8.1 to 4.13.1.
- [Release notes](https://github.com/junit-team/junit4/releases)
- [Changelog](https://github.com/junit-team/junit4/blob/main/doc/ReleaseNotes4.13.1.md)
- [Commits](https://github.com/junit-team/junit4/commits/r4.13.1)
Signed-off-by: dependabot[bot]
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 1cbeb99a..5ca7e1a3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -64,7 +64,7 @@
junitjunit
- 3.8.1
+ 4.13.1
From c2530d77b73838c31f4e83f2be941ec61032ebb2 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 16 Mar 2021 11:58:11 +0100
Subject: [PATCH 048/216] Fix InterruptibleCharSequenceTest
(testInterruptibility) to run on JDK 11 - if thread running the regexp
matching is already finished after the initial/current sleeping time, rerun
the test again with a shorter sleeping time until the expected
RuntimeException is hit
---
.../util/InterruptibleCharSequenceTest.java | 26 +++++++++++++------
1 file changed, 18 insertions(+), 8 deletions(-)
diff --git a/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java b/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java
index a3a5f180..8b5c5d1b 100644
--- a/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java
+++ b/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java
@@ -107,14 +107,24 @@ public void testNoninterruptible() throws InterruptedException {
}
public void testInterruptibility() throws InterruptedException {
- BlockingQueue
-
-
- cloudera
- Cloudera Hadoop
- https://repository.cloudera.com/artifactory/cloudera-repos/
- default
-
-
- true
- daily
- warn
-
-
- true
- daily
- warn
-
-
-
-
)
+ * |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more-->)
* +---+---+---+---+---+---+---+---+---+---+
*/
public class GZIPStaticHeader implements GZIPConstants {
diff --git a/src/main/java/org/archive/io/ReplayCharSequence.java b/src/main/java/org/archive/io/ReplayCharSequence.java
index aa9b9587..e456e293 100644
--- a/src/main/java/org/archive/io/ReplayCharSequence.java
+++ b/src/main/java/org/archive/io/ReplayCharSequence.java
@@ -59,7 +59,7 @@ public interface ReplayCharSequence extends CharSequence, Closeable {
public long getDecodeExceptionCount();
/**
- * Return the first coding-exception encountered, if the count > 0.
+ * Return the first coding-exception encountered, if the count > 0.
* @return CharacterCodingException
*/
public CharacterCodingException getCodingException();
diff --git a/src/main/java/org/archive/io/arc/ARCWriter.java b/src/main/java/org/archive/io/arc/ARCWriter.java
index 0bd0ef9b..c7042943 100644
--- a/src/main/java/org/archive/io/arc/ARCWriter.java
+++ b/src/main/java/org/archive/io/arc/ARCWriter.java
@@ -86,7 +86,7 @@
* write our own GZIP*Streams, ones that resettable and consious of gzip
* members.
*
- *
This class will write until we hit >= maxSize. The check is done at
+ *
This class will write until we hit >= maxSize. The check is done at
* record boundary. Records do not span ARC files. We will then close current
* file and open another and then continue writing.
*
@@ -95,9 +95,9 @@
* alexa
* ARC c-tools:
*
* Examine the produced cdx file to make sure it makes sense. Search
* for 'no-type 0'. If found, then we're opening a gzip record w/o data to
diff --git a/src/main/java/org/archive/util/DateUtils.java b/src/main/java/org/archive/util/DateUtils.java
index 0be20e63..7d6a7c98 100755
--- a/src/main/java/org/archive/util/DateUtils.java
+++ b/src/main/java/org/archive/util/DateUtils.java
@@ -557,7 +557,7 @@ private static String doubleToString(double val, int maxFractionDigits, int minF
* Takes a byte size and formats it for display with 'friendly' units.
*
* This involves converting it to the largest unit
- * (of B, KiB, MiB, GiB, TiB) for which the amount will be > 1.
+ * (of B, KiB, MiB, GiB, TiB) for which the amount will be > 1.
*
* Additionally, at least 2 significant digits are always displayed.
*
From 0d881e967daf2a023006032dd0d015b714821b11 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 15 Oct 2024 17:42:23 +0900
Subject: [PATCH 070/216] [maven-release-plugin] prepare release
webarchive-commons-1.1.10
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index f0c6ac73..2dd9223b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.10-SNAPSHOT
+ 1.1.10jarwebarchive-commons
From 76d95ccd75ddc31c5b8c3e9136f9e422ab528898 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 15 Oct 2024 17:42:28 +0900
Subject: [PATCH 071/216] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 2dd9223b..dc3088f0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.10
+ 1.1.11-SNAPSHOTjarwebarchive-commons
From 835f4e115b2cd288bed3f703136a7325c81fa751 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Sat, 9 Nov 2024 20:27:47 +0100
Subject: [PATCH 072/216] Make MetaData multi-valued to preserve values of
repeating WARC and HTTP headers
- code cleanup: fix indentation, remove unneeded return statements
---
src/main/java/org/archive/resource/MetaData.java | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/src/main/java/org/archive/resource/MetaData.java b/src/main/java/org/archive/resource/MetaData.java
index 30ce849b..fb3b24a4 100755
--- a/src/main/java/org/archive/resource/MetaData.java
+++ b/src/main/java/org/archive/resource/MetaData.java
@@ -83,7 +83,6 @@ public int optInt(String key, int defaultValue) {
return super.getInt(key);
} catch(JSONException e) {
LOG.severe(e.getMessage());
- return defaultValue;
}
}
return defaultValue;
@@ -106,7 +105,6 @@ public long optLong(String key, long defaultValue) {
return super.getLong(key);
} catch(JSONException e) {
LOG.severe(e.getMessage());
- return defaultValue;
}
}
return defaultValue;
@@ -167,10 +165,10 @@ public JSONObject put(String key, Object value) {
((JSONArray) super.get(key)).put(value);
return this;
} else {
- JSONArray array = new JSONArray();
- array.put(super.get(key));
- array.put(value);
- super.put(key, array);
+ JSONArray array = new JSONArray();
+ array.put(super.get(key));
+ array.put(value);
+ super.put(key, array);
}
return super.accumulate(key, value);
}
From a4748d9e79abb972a6571f5f4d46951be6049b1a Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 27 Nov 2024 13:24:17 +0100
Subject: [PATCH 073/216] URLParser and WaybackURLKeyMaker fail on URLs with
IPv6 address hostname
---
src/main/java/org/archive/url/URLParser.java | 11 ++++++++++-
.../java/org/archive/url/URLRegexTransformer.java | 4 ++++
src/test/java/org/archive/url/URLParserTest.java | 3 +++
.../java/org/archive/url/WaybackURLKeyMakerTest.java | 3 +++
4 files changed, 20 insertions(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/url/URLParser.java b/src/main/java/org/archive/url/URLParser.java
index a7860b02..bcd0b7fb 100644
--- a/src/main/java/org/archive/url/URLParser.java
+++ b/src/main/java/org/archive/url/URLParser.java
@@ -226,7 +226,16 @@ public static HandyURL parse(String urlString) throws URISyntaxException {
String colonPort = null;
int atIndex = uriAuthority.indexOf(COMMERCIAL_AT);
- int portColonIndex = uriAuthority.indexOf(COLON,(atIndex<0)?0:atIndex);
+ int portColonIndex = -1;
+ int startColonIndex = 0;
+ if (atIndex > -1) {
+ startColonIndex = atIndex;
+ }
+ if (uriAuthority.charAt(startColonIndex) == '[') {
+ // IPv6 address
+ startColonIndex = uriAuthority.indexOf(']', (startColonIndex + 1));
+ }
+ portColonIndex = uriAuthority.indexOf(COLON, startColonIndex);
if(atIndex<0 && portColonIndex<0) {
// most common case: neither userinfo nor port
diff --git a/src/main/java/org/archive/url/URLRegexTransformer.java b/src/main/java/org/archive/url/URLRegexTransformer.java
index 617e0225..5f31c81c 100644
--- a/src/main/java/org/archive/url/URLRegexTransformer.java
+++ b/src/main/java/org/archive/url/URLRegexTransformer.java
@@ -121,6 +121,10 @@ public static String hostToSURT(String host) {
// TODO: ensure we DONT reverse IP addresses!
String parts[] = host.split("\\.",-1);
if(parts.length == 1) {
+ // strip enclosing "[" and "]" from IPv6 hosts
+ if (host.charAt(0) == '[' && host.charAt(host.length() - 1) == ']') {
+ return host.substring(1, host.length() - 1);
+ }
return host;
}
StringBuilder sb = new StringBuilder(host.length());
diff --git a/src/test/java/org/archive/url/URLParserTest.java b/src/test/java/org/archive/url/URLParserTest.java
index b060ffa7..68dfcd23 100644
--- a/src/test/java/org/archive/url/URLParserTest.java
+++ b/src/test/java/org/archive/url/URLParserTest.java
@@ -86,6 +86,9 @@ public void testParse() throws UnsupportedEncodingException, URISyntaxException
checkParse(" \n http://:****@www.archive.org:8080/inde\rx.html?query#foo \r\n \t ",
null, "http", "", "****", "www.archive.org", 8080, "/index.html", "query", "foo",
"http://:****@www.archive.org:8080/index.html?query#foo", "/index.html?query");
+ checkParse("https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/robots.txt", null, "https", null, null,
+ "[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]", -1, "/robots.txt", null, null,
+ "https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/robots.txt", "/robots.txt");
}
private void checkParse(String s, String opaque, String scheme, String authUser,
diff --git a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
index 26161456..1a1403ee 100644
--- a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
+++ b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
@@ -23,6 +23,9 @@ public void testMakeKey() throws URISyntaxException {
assertEquals("org,archive)/goo?a&b", km.makeKey("http://archive.org/goo/?b&a"));
assertEquals("org,archive)/goo?a=1&a=2&b", km.makeKey("http://archive.org/goo/?a=2&b&a=1"));
assertEquals("org,archive)/", km.makeKey("http://archive.org:/"));
+ assertEquals("192,211,203,34)/robots.txt", km.makeKey("https://34.203.211.192/robots.txt"));
+ assertEquals("2600:1f18:200d:fb00:2b74:867c:ab0c:150a)/robots.txt",
+ km.makeKey("https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/robots.txt"));
}
}
From 8e89847d79ea2882bc55e2d00939fd8a2ca21865 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 21:51:58 +0900
Subject: [PATCH 074/216] Update release plugins
---
pom.xml | 110 ++++++++++++++++++++++++++++++++++----------------------
1 file changed, 67 insertions(+), 43 deletions(-)
diff --git a/pom.xml b/pom.xml
index dc3088f0..048787a5 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,12 +1,6 @@
4.0.0
-
- org.sonatype.oss
- oss-parent
- 7
-
-
org.netpreserve.commonswebarchive-commons1.1.11-SNAPSHOT
@@ -45,19 +39,13 @@
scm:git:git@github.com:iipc/webarchive-commons.gitscm:git:git@github.com:iipc/webarchive-commons.git
- git@github.com:iipc/webarchive-commons.git
+ https://github.com/iipc/webarchive-commonsUTF-8${maven.build.timestamp}yyyyMMddhhmmss
-
-
- sonatype-nexus-staging
- https://oss.sonatype.org/service/local/staging/deploy/maven2/
- sonatype-nexus-snapshots
- https://oss.sonatype.org/content/repositories/snapshots/
@@ -201,24 +189,6 @@
8
-
- maven-assembly-plugin
- 2.4
-
-
- jar-with-dependencies
-
- webarchive-commons
-
-
-
- package
-
- single
-
-
-
- org.apache.maven.pluginsmaven-enforcer-plugin
@@ -251,17 +221,71 @@
-
+
+
+ release
+
+
+ ossrh
+ https://oss.sonatype.org/content/repositories/snapshots
+
+
+
+
+
+ org.sonatype.plugins
+ nexus-staging-maven-plugin
+ 1.6.7
+ true
+
+ ossrh
+ https://oss.sonatype.org/
+ true
+
+
+
+ org.apache.maven.plugins
+ maven-source-plugin
+ 2.2.1
+
+
+ attach-sources
+
+ jar-no-fork
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 2.9.1
+
+
+ attach-javadocs
+
+ jar
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-gpg-plugin
+ 1.5
+
+
+ sign-artifacts
+ verify
+
+ sign
+
+
+
+
+
+
+
+
From 829566b1385a8dae6bc9774cd1299469f37e78c3 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 21:53:22 +0900
Subject: [PATCH 075/216] [maven-release-plugin] prepare release
webarchive-commons-1.1.11
---
pom.xml | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 048787a5..28bd9145 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.11-SNAPSHOT
+ 1.1.11jarwebarchive-commons
@@ -40,7 +40,8 @@
scm:git:git@github.com:iipc/webarchive-commons.gitscm:git:git@github.com:iipc/webarchive-commons.githttps://github.com/iipc/webarchive-commons
-
+ webarchive-commons-1.1.11
+
UTF-8
From 9b0bbcfdeea7a9c2ac9a28b245bce2f8e9df5dce Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 21:53:27 +0900
Subject: [PATCH 076/216] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 28bd9145..c86add9f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.11
+ 1.1.12-SNAPSHOTjarwebarchive-commons
@@ -40,7 +40,7 @@
scm:git:git@github.com:iipc/webarchive-commons.gitscm:git:git@github.com:iipc/webarchive-commons.githttps://github.com/iipc/webarchive-commons
- webarchive-commons-1.1.11
+ HEAD
From 9e4723b313a542320a4f09f4b4e2dbccdc0f58ac Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 21:58:55 +0900
Subject: [PATCH 077/216] Update CHANGES.md
---
CHANGES.md | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/CHANGES.md b/CHANGES.md
index 6fe7c4bd..579b659f 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,7 +1,14 @@
+1.1.11
+------
+
+#### Bug fixes
+
+* Fixed URLParser and WaybackURLKeyMaker failing on URLs with IPv6 address hostnames [#100](https://github.com/iipc/webarchive-commons/pull/100)
+
1.1.10
------
-#### Fixes
+#### Bug fixes
* [WAT extractor: do not fail on missing WARC-Filename in warcinfo record](https://github.com/iipc/webarchive-commons/pull/89)
* [ExtractingParseObserver: extract rel, hreflang and type attributes](https://github.com/iipc/webarchive-commons/pull/86)
From cd2da63f1f56d41705e014e2c3290635fcc99099 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 22:00:18 +0900
Subject: [PATCH 078/216] Add description to pom.xml (now mandatory for
central)
---
pom.xml | 1 +
1 file changed, 1 insertion(+)
diff --git a/pom.xml b/pom.xml
index c86add9f..18aca329 100644
--- a/pom.xml
+++ b/pom.xml
@@ -8,6 +8,7 @@
webarchive-commonshttps://github.com/iipc/webarchive-commons
+ Common web archive utility codeThe International Internet Preservation Consortium
From 7b6df0c619899ae70e350fb0d955c00b59ba68e5 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 22:02:31 +0900
Subject: [PATCH 079/216] [maven-release-plugin] prepare release
webarchive-commons-1.1.11
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 18aca329..a57230d9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.12-SNAPSHOT
+ 1.1.11jarwebarchive-commons
@@ -41,7 +41,7 @@
scm:git:git@github.com:iipc/webarchive-commons.gitscm:git:git@github.com:iipc/webarchive-commons.githttps://github.com/iipc/webarchive-commons
- HEAD
+ webarchive-commons-1.1.11
From a70f23e8b654d3a661877641f2fa7e51d696ceeb Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 22:02:36 +0900
Subject: [PATCH 080/216] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index a57230d9..18aca329 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.11
+ 1.1.12-SNAPSHOTjarwebarchive-commons
@@ -41,7 +41,7 @@
scm:git:git@github.com:iipc/webarchive-commons.gitscm:git:git@github.com:iipc/webarchive-commons.githttps://github.com/iipc/webarchive-commons
- webarchive-commons-1.1.11
+ HEAD
From 0514b2387decaf5e40e24bcda0f7c70b438d0997 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 22:08:04 +0900
Subject: [PATCH 081/216] Add Maven Central and Javadoc shields to README
---
README.md | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 72858a52..55be6e68 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,6 @@
IIPC Web Archive Commons
========================
-
-[](https://travis-ci.org/iipc/webarchive-commons/)
+[](https://maven-badges.herokuapp.com/maven-central/org.netpreserve.commons/webarchive-commons) [](https://www.javadoc.io/doc/org.netpreserve.commons/webarchive-commons)
This repository contains common utility code for [OpenWayback][1] and other projects.
From c6095082fdecadd6882456a51c5f91b8a3d4faa5 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 15:42:05 +0900
Subject: [PATCH 082/216] Bump guava from 33.3.0-jre to 33.3.1-jre
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 18aca329..0ac11df9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -60,7 +60,7 @@
com.google.guavaguava
- 33.3.0-jre
+ 33.3.1-jre
From 23c8887c2a3eb4d4d5b0bac0cf805c71fcaeabaf Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 15:42:41 +0900
Subject: [PATCH 083/216] Bump commons-io from 2.14.0 to 2.18.0
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 0ac11df9..84822a4f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -140,7 +140,7 @@
commons-iocommons-io
- 2.14.0
+ 2.18.0
From f13c7b2a3b254a83827ad5a1c27131c6980c79eb Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 15:47:28 +0900
Subject: [PATCH 084/216] Bump commons-lang from 2.5 to 2.6
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 84822a4f..3d5f995f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -134,7 +134,7 @@
commons-langcommons-lang
- 2.5
+ 2.6
From 5528afc05f77189b7ef59dbb9cdcce2bd35656e7 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:24:04 +0900
Subject: [PATCH 085/216] Bump junit from 4.13.1 to 4.13.2
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 3d5f995f..46f26766 100644
--- a/pom.xml
+++ b/pom.xml
@@ -54,7 +54,7 @@
junitjunit
- 4.13.1
+ 4.13.2
From 7426c563310f73a0820a9af729b5f3621cea57f4 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:24:52 +0900
Subject: [PATCH 086/216] Bump hadoop from 3.4.0 to 3.4.1
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 46f26766..c1bc7798 100644
--- a/pom.xml
+++ b/pom.xml
@@ -95,7 +95,7 @@
org.apache.hadoophadoop-common
- 3.4.0
+ 3.4.1true
@@ -108,7 +108,7 @@
org.apache.hadoophadoop-mapreduce-client-core
- 3.4.0
+ 3.4.1true
From 88607b2ed67c8c73e8b199adf85ac1ddf2fcdddb Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:26:16 +0900
Subject: [PATCH 087/216] Bump httpcore from 4.3 to 4.4.16
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index c1bc7798..a993945e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -176,7 +176,7 @@
org.apache.httpcomponentshttpcore
- 4.3
+ 4.4.16
From 0256ae6131e80c49e1ed4a16e5631ccff0d74e36 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:27:04 +0900
Subject: [PATCH 088/216] Bump htmlparser from 1.6 to 2.1
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index a993945e..ce0a2aec 100644
--- a/pom.xml
+++ b/pom.xml
@@ -71,7 +71,7 @@
org.htmlparserhtmlparser
- 1.6
+ 2.1
From e1d458a86a2203ca1cd5cab967fb17f268994082 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:27:31 +0900
Subject: [PATCH 089/216] Bump json from 20231013 to 20240303
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index ce0a2aec..1023560c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -66,7 +66,7 @@
org.jsonjson
- 20231013
+ 20240303org.htmlparser
From c839700d472bac5b4625ea4fe10ef47ee02a5a31 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:35:12 +0900
Subject: [PATCH 090/216] Update CHANGES.md
---
CHANGES.md | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/CHANGES.md b/CHANGES.md
index 579b659f..e3afd137 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,21 @@
+1.2.0
+-----
+
+#### New features
+
+* MetaData is now multivalued to support repeated WARC and HTTP headers. [#98](https://github.com/iipc/webarchive-commons/pull/98/files)
+
+#### Dependency upgrades
+
+* commons-io 2.18.0
+* commons-lang 2.6
+* guava 33.3.1-jre
+* hadoop 3.4.1
+* htmlparser 2.1
+* httpcore 4.4.16
+* json 20240303
+* junit 4.13.2
+
1.1.11
------
From 91c01ddb0561798d204c957fefafa782c0b53921 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:37:15 +0900
Subject: [PATCH 091/216] [maven-release-plugin] prepare release
webarchive-commons-1.2.0
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 1023560c..12dfae9f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commonswebarchive-commons
- 1.1.12-SNAPSHOT
+ 1.2.0jarwebarchive-commons
@@ -41,7 +41,7 @@
scm:git:git@github.com:iipc/webarchive-commons.gitscm:git:git@github.com:iipc/webarchive-commons.githttps://github.com/iipc/webarchive-commons
- HEAD
+ webarchive-commons-1.2.0
From f37418d08d8fa7fd4ccad4fbb919cc0fc371f2f2 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:37:20 +0900
Subject: [PATCH 092/216] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 12dfae9f..0d84b0d2 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commonswebarchive-commons
- 1.2.0
+ 1.2.1-SNAPSHOTjarwebarchive-commons
@@ -41,7 +41,7 @@
scm:git:git@github.com:iipc/webarchive-commons.gitscm:git:git@github.com:iipc/webarchive-commons.githttps://github.com/iipc/webarchive-commons
- webarchive-commons-1.2.0
+ HEAD
From 3ae5720ad43e2e80b5ab853078e891ee53641a3c Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 3 Dec 2024 20:22:10 +0900
Subject: [PATCH 093/216] Remove dependency on dsiutils
---
pom.xml | 16 ++--------------
.../java/org/archive/url/UsableURIFactory.java | 5 ++---
2 files changed, 4 insertions(+), 17 deletions(-)
diff --git a/pom.xml b/pom.xml
index 0d84b0d2..da2e14da 100644
--- a/pom.xml
+++ b/pom.xml
@@ -150,20 +150,8 @@
it.unimi.dsi
- dsiutils
- 2.2.8
- compile
-
-
- ch.qos.logback
- logback-classic
-
-
-
- commons-collections
- commons-collections
-
-
+ fastutil
+ 7.0.10
diff --git a/src/main/java/org/archive/url/UsableURIFactory.java b/src/main/java/org/archive/url/UsableURIFactory.java
index d44b5c84..3dfc33a7 100644
--- a/src/main/java/org/archive/url/UsableURIFactory.java
+++ b/src/main/java/org/archive/url/UsableURIFactory.java
@@ -20,7 +20,6 @@
import gnu.inet.encoding.IDNA;
import gnu.inet.encoding.IDNAException;
-import it.unimi.dsi.lang.MutableString;
import java.io.UnsupportedEncodingException;
import java.util.BitSet;
@@ -485,7 +484,7 @@ private String fixup(String uri, final URI base, final String charset)
// Preallocate. The '1's and '2's in below are space for ':',
// '//', etc. URI characters.
- MutableString s = new MutableString(
+ StringBuilder s = new StringBuilder(
((uriScheme != null)? uriScheme.length(): 0)
+ 1 // ';'
+ ((uriAuthority != null)? uriAuthority.length(): 0)
@@ -707,7 +706,7 @@ private String checkPort(String uriAuthority)
* @param substr Suffix or prefix to use if str is not null.
* @param suffix True if substr is a suffix.
*/
- private void appendNonNull(MutableString b, String str, String substr,
+ private void appendNonNull(StringBuilder b, String str, String substr,
boolean suffix) {
if (str != null && str.length() > 0) {
if (!suffix) {
From 33556bf741eaa10421b9214bbbd69f40618d27d1 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 3 Dec 2024 20:38:46 +0900
Subject: [PATCH 094/216] Remove pom-cdh4.xml
---
pom-cdh4.xml | 229 ---------------------------------------------------
1 file changed, 229 deletions(-)
delete mode 100644 pom-cdh4.xml
diff --git a/pom-cdh4.xml b/pom-cdh4.xml
deleted file mode 100644
index de19d8d0..00000000
--- a/pom-cdh4.xml
+++ /dev/null
@@ -1,229 +0,0 @@
-
- 4.0.0
-
- org.archive
- ia-web-commons
- 1.0-SNAPSHOT
- jar
-
- ia-web-commons
- http://maven.apache.org
-
-
- UTF-8
- ${maven.build.timestamp}
- yyyyMMddhhmmss
-
-
-
-
- junit
- junit
- 3.8.1
- test
-
-
-
- com.google.guava
- guava
- 14.0.1
-
-
-
- org.json
- json
- 20090211
-
-
- org.htmlparser
- htmlparser
- 1.6
-
-
-
- org.mozilla
- juniversalchardet
- 1.0.3
-
-
-
- commons-httpclient
- commons-httpclient
- 3.1
-
-
-
- org.apache.hadoop
- hadoop-core
- 2.0.0-mr1-cdh4.2.0
-
-
- commons-httpclient
- commons-httpclient
-
-
- javax.servlet
- servlet-api
-
-
- javax.servlet.jsp
- jsp-api
-
-
- org.mortbay.jetty
- jetty
-
-
- org.mortbay.jetty
- jetty-util
-
-
- tomcat
- jasper-runtime
-
-
- tomcat
- jasper-compiler
-
-
-
-
- org.apache.hadoop
- hadoop-common
- 2.0.0-cdh4.2.0
-
-
- org.apache.hadoop
- hadoop-mapreduce-client-common
- 2.0.0-cdh4.2.0
-
-
- org.apache.hadoop
- hadoop-mapreduce-client-core
- 2.0.0-cdh4.2.0
-
-
-
- org.apache.pig
- pig
- 0.11.1
- provided
-
-
-
- commons-lang
- commons-lang
- 2.5
-
-
-
- commons-io
- commons-io
- 2.4
-
-
-
- org.gnu.inet
- libidn
- 1.15
-
-
- it.unimi.dsi
- mg4j
- 1.0.1
- compile
-
-
- org.apache.httpcomponents
- httpcore
- 4.3
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-compiler-plugin
- 2.3.2
-
- 1.6
- 1.6
-
-
-
- maven-assembly-plugin
- 2.4
-
-
- jar-with-dependencies
-
- ia-web-commons
-
-
-
- package
-
- single
-
-
-
-
-
-
-
- src/main/resources
- true
-
-
-
-
-
-
- internetarchive
- Internet Archive Maven Repository
- http://builds.archive.org:8080/maven2
- default
-
-
- true
- daily
- warn
-
-
- true
- daily
- warn
-
-
-
-
- cloudera
- Cloudera Hadoop
- https://repository.cloudera.com/artifactory/cloudera-repos/
- default
-
-
- true
- daily
- warn
-
-
- true
- daily
- warn
-
-
-
-
-
-
-
- repository
-
- ${repository.url}
-
-
-
-
From 4bb03baec41d90795e312e4a2865abb0395670f3 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 3 Dec 2024 20:42:29 +0900
Subject: [PATCH 095/216] Use Files.createLink instead of shelling out to ln
---
.../io/ObjectPlusFilesOutputStream.java | 19 ++++---------------
1 file changed, 4 insertions(+), 15 deletions(-)
diff --git a/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java b/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java
index 224f24e7..bd5c1eea 100644
--- a/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java
+++ b/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java
@@ -18,10 +18,8 @@
*/
package org.archive.io;
-import java.io.File;
-import java.io.IOException;
-import java.io.ObjectOutputStream;
-import java.io.OutputStream;
+import java.io.*;
+import java.nio.file.Files;
import java.util.LinkedList;
import org.archive.util.FileUtils;
@@ -116,19 +114,10 @@ public void snapshotAppendOnlyFile(File file) throws IOException {
* @throws IOException
*/
private void hardlinkOrCopy(File file, File destination) throws IOException {
- // For Linux/UNIX, try a hard link first.
- Process link = Runtime.getRuntime().exec("ln "+file.getAbsolutePath()+" "+destination.getAbsolutePath());
- // TODO NTFS also supports hard links; add appropriate try
try {
- link.waitFor();
- } catch (InterruptedException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- if(link.exitValue()!=0) {
- // hard link failed
+ Files.createLink(destination.toPath(), file.toPath());
+ } catch (UnsupportedOperationException e) { // file system cannot create hard links: fall back to a plain copy
FileUtils.copyFile(file,destination);
}
}
-
}
From 328aef2788313a2abc6123c385f9c31b863d6f1b Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 4 Dec 2024 15:07:23 +0900
Subject: [PATCH 096/216] Remove dependency on fastutil
Fastutil is our largest dependency and consumes a third of the overall Heritrix distribution size. If we update to the latest version it will be even larger. But we're only using two tiny classes from it: the trivial RepositionableStream interface and the unsynchronized FastBufferedOutputStream.
Some downstream users (e.g. lockss-core) actually implement RepositionableStream, so to preserve API compatibility this change includes a copy of just that interface while keeping the same package name.
Regarding FastBufferedOutputStream, for WARC writing the outer GZIPOutputStream is synchronized anyway. And RecordingOutputStream will typically be doing moderately large writes copying from the network. So in both usages it seems unlikely that there's much practical benefit in using it here over the standard BufferedOutputStream. The JVM JIT has a lot of optimizations for synchronized these days too.
---
pom.xml | 5 ---
.../dsi/fastutil/io/RepositionableStream.java | 42 +++++++++++++++++++
.../org/archive/io/RecordingOutputStream.java | 5 +--
.../java/org/archive/io/WriterPoolMember.java | 5 +--
4 files changed, 46 insertions(+), 11 deletions(-)
create mode 100644 src/main/java/it/unimi/dsi/fastutil/io/RepositionableStream.java
diff --git a/pom.xml b/pom.xml
index da2e14da..5e5fa419 100644
--- a/pom.xml
+++ b/pom.xml
@@ -148,11 +148,6 @@
libidn1.15
-
- it.unimi.dsi
- fastutil
- 7.0.10
-
diff --git a/src/main/java/it/unimi/dsi/fastutil/io/RepositionableStream.java b/src/main/java/it/unimi/dsi/fastutil/io/RepositionableStream.java
new file mode 100644
index 00000000..a81645f0
--- /dev/null
+++ b/src/main/java/it/unimi/dsi/fastutil/io/RepositionableStream.java
@@ -0,0 +1,42 @@
+// copied from fastutil, keeping the original package name to avoid breaking
+// compatibility with existing user code that implements this interface
+package it.unimi.dsi.fastutil.io;
+
+/*
+ * Copyright (C) 2005-2015 Sebastiano Vigna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/** A basic interface specifying positioning methods for a byte stream.
+ *
+ * @author Sebastiano Vigna
+ * @since 4.4
+ */
+
+public interface RepositionableStream {
+
+ /** Sets the current stream position.
+ *
+ * @param newPosition the new stream position.
+ */
+ void position( long newPosition ) throws java.io.IOException;
+
+ /** Returns the current stream position.
+ *
+ * @return the current stream position.
+ */
+ long position() throws java.io.IOException;
+
+}
diff --git a/src/main/java/org/archive/io/RecordingOutputStream.java b/src/main/java/org/archive/io/RecordingOutputStream.java
index 7d2ff212..6c77997b 100644
--- a/src/main/java/org/archive/io/RecordingOutputStream.java
+++ b/src/main/java/org/archive/io/RecordingOutputStream.java
@@ -19,8 +19,7 @@
package org.archive.io;
-import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
-
+import java.io.BufferedOutputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
@@ -207,7 +206,7 @@ public void open(OutputStream wrappedStream) throws IOException {
protected OutputStream ensureDiskStream() throws FileNotFoundException {
if (this.diskStream == null) {
FileOutputStream fis = new FileOutputStream(this.backingFilename);
- this.diskStream = new FastBufferedOutputStream(fis);
+ this.diskStream = new BufferedOutputStream(fis);
}
return this.diskStream;
}
diff --git a/src/main/java/org/archive/io/WriterPoolMember.java b/src/main/java/org/archive/io/WriterPoolMember.java
index 893007ec..e10d443b 100644
--- a/src/main/java/org/archive/io/WriterPoolMember.java
+++ b/src/main/java/org/archive/io/WriterPoolMember.java
@@ -19,8 +19,7 @@
package org.archive.io;
-import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
-
+import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
@@ -200,7 +199,7 @@ protected String createFile(final File file) throws IOException {
close();
this.f = file;
FileOutputStream fos = new FileOutputStream(this.f);
- this.countOut = new MiserOutputStream(new FastBufferedOutputStream(fos),settings.getFrequentFlushes());
+ this.countOut = new MiserOutputStream(new BufferedOutputStream(fos),settings.getFrequentFlushes());
this.out = this.countOut;
logger.fine("Opened " + this.f.getAbsolutePath());
return this.f.getName();
From 8988fbbc3528afcc7f792bcc967189311e8a1286 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 4 Dec 2024 16:54:03 +0900
Subject: [PATCH 097/216] Deprecate some classes specific to HttpClient 3
These are intended to be removed in webarchive-commons 2. #78
---
.../java/org/archive/httpclient/HttpRecorderGetMethod.java | 2 ++
src/main/java/org/archive/httpclient/HttpRecorderMethod.java | 2 ++
.../java/org/archive/httpclient/HttpRecorderPostMethod.java | 2 ++
.../org/archive/httpclient/SingleHttpConnectionManager.java | 2 ++
.../archive/httpclient/ThreadLocalHttpConnectionManager.java | 4 +++-
.../util/binsearch/impl/HTTPSeekableLineReaderFactory.java | 1 +
.../archive/util/binsearch/impl/http/ApacheHttp31SLR.java | 4 ++++
.../util/binsearch/impl/http/ApacheHttp31SLRFactory.java | 5 +++++
8 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java
index ef241b48..1a94af1f 100644
--- a/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java
+++ b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java
@@ -70,7 +70,9 @@
*
* @author stack
* @version $Revision$, $Date$
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
*/
+@Deprecated
public class HttpRecorderGetMethod extends GetMethod {
protected static Logger logger =
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java
index 932e7e98..b08bc0bd 100644
--- a/src/main/java/org/archive/httpclient/HttpRecorderMethod.java
+++ b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java
@@ -34,7 +34,9 @@
*
* @author stack
* @version $Revision$, $Date$
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
*/
+@Deprecated
public class HttpRecorderMethod {
protected static Logger logger =
Logger.getLogger(HttpRecorderMethod.class.getName());
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java
index 20f1bfd1..d55d816a 100644
--- a/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java
+++ b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java
@@ -36,7 +36,9 @@
*
* @author stack
* @version $Date$ $Revision$
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
*/
+@Deprecated
public class HttpRecorderPostMethod extends PostMethod {
/**
* Instance of http recorder method.
diff --git a/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java
index 4ba6a837..d6cf27ab 100644
--- a/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java
+++ b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java
@@ -32,7 +32,9 @@
* with external mechanisms.
*
* @author gojomo
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
*/
+@Deprecated
public class SingleHttpConnectionManager extends SimpleHttpConnectionManager {
public SingleHttpConnectionManager() {
diff --git a/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java
index 91e850ea..16821b36 100644
--- a/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java
+++ b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java
@@ -36,8 +36,10 @@
*
* Java >= 1.4 is recommended.
*
- * @author Christian Kohlschuetter
+ * @author Christian Kohlschuetter
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
*/
+@Deprecated
public final class ThreadLocalHttpConnectionManager implements
HttpConnectionManager {
diff --git a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java
index b4a23db0..68ee6551 100644
--- a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java
+++ b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java
@@ -20,6 +20,7 @@ protected HTTPSeekableLineReaderFactory()
public enum HttpLibs
{
+ @Deprecated
APACHE_31,
APACHE_43,
URLCONN,
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
index c4fdbba8..124d3d03 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
@@ -14,6 +14,10 @@
import org.apache.commons.io.input.CountingInputStream;
import org.archive.util.binsearch.impl.HTTPSeekableLineReader;
+/**
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
+ */
+@Deprecated
public class ApacheHttp31SLR extends HTTPSeekableLineReader {
private HttpClient http;
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
index bc5b83f4..2af03dab 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
@@ -15,6 +15,11 @@
import org.archive.util.binsearch.impl.HTTPSeekableLineReader;
import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory;
+/**
+ *
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
+ */
+@Deprecated
public class ApacheHttp31SLRFactory extends HTTPSeekableLineReaderFactory {
private final static Logger LOGGER = Logger.getLogger(ApacheHttp31SLRFactory.class.getName());
From b8a91bb3b7e8a36b2162251314ff52b42a379221 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Thu, 5 Dec 2024 07:49:10 +0900
Subject: [PATCH 098/216] Remove unused dependency on commons-collections
---
pom.xml | 7 -------
1 file changed, 7 deletions(-)
diff --git a/pom.xml b/pom.xml
index 5e5fa419..6dec154c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -149,13 +149,6 @@
1.15
-
-
- commons-collections
- commons-collections
- 3.2.2
-
-
org.apache.httpcomponentshttpcore
From a80b98dfe4b1c2a7556e7df2574c16426849f6d9 Mon Sep 17 00:00:00 2001
From: Tom Morris
Date: Sat, 26 Aug 2023 20:05:34 -0400
Subject: [PATCH 099/216] Add failing test from Sebastian's issue
---
src/test/java/org/archive/url/BasicURLCanonicalizerTest.java | 3 +++
src/test/java/org/archive/url/WaybackURLKeyMakerTest.java | 4 ++++
2 files changed, 7 insertions(+)
diff --git a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java
index c21bcbe8..cc100e4c 100644
--- a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java
+++ b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java
@@ -143,6 +143,9 @@ public void testUnescapeRepeatedly() {
assertEquals("%",guc.unescapeRepeatedly("%25%32%35"));
assertEquals("168.188.99.26",guc.unescapeRepeatedly("%31%36%38%2e%31%38%38%2e%39%39%2e%32%36"));
+
+ assertEquals("tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5",
+ guc.unescapeRepeatedly("tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5"));
}
public void testAttemptIPFormats() throws URIException {
diff --git a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
index 1a1403ee..86250972 100644
--- a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
+++ b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
@@ -26,6 +26,10 @@ public void testMakeKey() throws URISyntaxException {
assertEquals("192,211,203,34)/robots.txt", km.makeKey("https://34.203.211.192/robots.txt"));
assertEquals("2600:1f18:200d:fb00:2b74:867c:ab0c:150a)/robots.txt",
km.makeKey("https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/robots.txt"));
+ assertEquals("ua,1kr)/newslist.html?tag=%e4%ee%f8%ea%ee%eb%fc%ed%ee%e5",
+ km.makeKey("http://1kr.ua/newslist.html?tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5"));
+ assertEquals("com,aluroba)/tags/%c3%ce%ca%c7%d1%e5%c7.htm",
+ km.makeKey("http://www.aluroba.com/tags/%C3%CE%CA%C7%D1%E5%C7.htm"));
}
}
From 5161306d9ec993d1986f0d092c056f33ba3abdfe Mon Sep 17 00:00:00 2001
From: Tom Morris
Date: Sun, 27 Aug 2023 13:01:19 -0400
Subject: [PATCH 100/216] Add non-UTF-8 encoded test from mailing list
---
src/test/java/org/archive/url/WaybackURLKeyMakerTest.java | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
index 86250972..26371ba8 100644
--- a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
+++ b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
@@ -30,6 +30,8 @@ public void testMakeKey() throws URISyntaxException {
km.makeKey("http://1kr.ua/newslist.html?tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5"));
assertEquals("com,aluroba)/tags/%c3%ce%ca%c7%d1%e5%c7.htm",
km.makeKey("http://www.aluroba.com/tags/%C3%CE%CA%C7%D1%E5%C7.htm"));
+ assertEquals("ac,insbase)/xoops2/modules/xpwiki?%a4%d5%a4%af%a4%aa%a4%ab%b8%a9%a4%aa%a4%aa%a4%ce%a4%b8%a4%e7%a4%a6%bb%d4",
+ km.makeKey("https://www.insbase.ac/xoops2/modules/xpwiki/?%A4%D5%A4%AF%A4%AA%A4%AB%B8%A9%A4%AA%A4%AA%A4%CE%A4%B8%A4%E7%A4%A6%BB%D4"));
}
}
From f7be47bc523c4d06cc7960dc2d3b1b58f9580906 Mon Sep 17 00:00:00 2001
From: Tom Morris
Date: Sun, 27 Aug 2023 13:11:30 -0400
Subject: [PATCH 101/216] Handle non-UTF-8 encoded characters. Fixes #6
---
.../archive/url/BasicURLCanonicalizer.java | 27 +++++++++++++++----
1 file changed, 22 insertions(+), 5 deletions(-)
diff --git a/src/main/java/org/archive/url/BasicURLCanonicalizer.java b/src/main/java/org/archive/url/BasicURLCanonicalizer.java
index c09ad6e6..37b448c1 100644
--- a/src/main/java/org/archive/url/BasicURLCanonicalizer.java
+++ b/src/main/java/org/archive/url/BasicURLCanonicalizer.java
@@ -15,18 +15,18 @@
/**
* Canonicalizer that does more or less basic fixup. Based initially on rules
* specified at https://developers.google.com/safe-browsing/developers_guide_v2#
- * Canonicalization. These rules are designed for clients of google's
+ * Canonicalization. These rules are designed for clients of Google's
* "experimental" Safe Browsing API to "check URLs against Google's
* constantly-updated blacklists of suspected phishing and malware pages".
*
*
- * This class differs from google in treatment of non-ascii input. Google's
+ * This class differs from Google in treatment of non-ascii input. Google's
* rules don't really address this except with one example test case, which
* seems to suggest taking raw input bytes and pct-encoding them byte for byte.
* Since the input to this class consists of java strings, not raw bytes, that
- * wouldn't be possible, even if deemed preferable. Instead
+ * wouldn't be possible, even if deemed preferable. Instead,
* BasicURLCanonicalizer expresses non-ascii characters pct-encoded UTF-8.
*/
public class BasicURLCanonicalizer implements URLCanonicalizer {
@@ -212,6 +212,10 @@ protected static Charset UTF8() {
return _UTF8;
}
+ /**
+ * @param input String to be percent-encoded. Assumed to be fully unescaped.
+ * @return percent-encoded string
+ */
public String escapeOnce(String input) {
if (input == null) {
return null;
@@ -243,6 +247,19 @@ public String escapeOnce(String input) {
*/
sb = new StringBuilder(input.substring(0, i));
}
+ if (b == '%' && i < utf8bytes.length - 2) {
+ // Any hex escapes left at this point represent non-UTF-8 encoded characters
+ // Unescape them, so they don't get double escaped
+ int hex1 = getHex(utf8bytes[i + 1]);
+ if (hex1 >= 0) {
+ int hex2 = getHex(utf8bytes[i + 2]);
+ if (hex2 >= 0) {
+ i = i+2;
+ b = hex1 * 16 + hex2;
+ }
+ }
+
+ }
sb.append("%");
String hex = Integer.toHexString(b).toUpperCase();
if (hex.length() == 1) {
@@ -337,7 +354,7 @@ public String decode(String input) {
* Decodes bytes in bbuf as utf-8 and appends decoded characters to sb. If
* decoding of any portion fails, appends the un-decodable %xx%xx sequence
* extracted from inputStr instead of decoded characters. See "bad unicode"
- * tests in GoogleCanonicalizerTest#testDecode(). Variables only make sense
+ * tests in BasicURLCanonicalizerTest#testDecode(). Variables only make sense
* within context of {@link #decode(String)}.
*
* @param sb
From 6a3cf1b317c87305d05faee73d2c3ee3f5ec08b0 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 11 Dec 2024 21:14:06 +0100
Subject: [PATCH 102/216] WAT: Duplicated payload metadata values for
"Actual-Content-Length" and "Trailing-Slop-Length"
---
.../org/archive/resource/arc/ARCResource.java | 2 +
.../http/HTTPHeadersResourceFactory.java | 11 +++--
.../archive/resource/warc/WARCResource.java | 14 ++++--
.../record/WARCMetaDataResourceFactory.java | 10 +++-
.../archive/resource/arc/ARCResourceTest.java | 48 +++++++++++++++++++
.../resource/warc/WARCResourceTest.java | 46 ++++++++++++++++++
6 files changed, 123 insertions(+), 8 deletions(-)
create mode 100644 src/test/java/org/archive/resource/arc/ARCResourceTest.java
create mode 100644 src/test/java/org/archive/resource/warc/WARCResourceTest.java
diff --git a/src/main/java/org/archive/resource/arc/ARCResource.java b/src/main/java/org/archive/resource/arc/ARCResource.java
index b6e0a1c1..b0195f08 100644
--- a/src/main/java/org/archive/resource/arc/ARCResource.java
+++ b/src/main/java/org/archive/resource/arc/ARCResource.java
@@ -64,10 +64,12 @@ public ARCResource(MetaData metaData, ResourceContainer container,
}
}
+ @Override
public InputStream getInputStream() {
return new EOFNotifyingInputStream(digIS, this);
}
+ @Override
public void notifyEOF() throws IOException {
metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
String digString = Base32.encode(digIS.getMessageDigest().digest());
diff --git a/src/main/java/org/archive/resource/http/HTTPHeadersResourceFactory.java b/src/main/java/org/archive/resource/http/HTTPHeadersResourceFactory.java
index 79805090..eb25d821 100644
--- a/src/main/java/org/archive/resource/http/HTTPHeadersResourceFactory.java
+++ b/src/main/java/org/archive/resource/http/HTTPHeadersResourceFactory.java
@@ -31,6 +31,7 @@ public HTTPHeadersResourceFactory(String name, String type) {
parser = new HttpHeaderParser();
}
+ @Override
public Resource getResource(InputStream is, MetaData parentMetaData,
ResourceContainer container) throws ResourceParseException,
IOException {
@@ -40,9 +41,13 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
if(headers.isCorrupt()) {
parentMetaData.putBoolean(HTTP_HEADERS_CORRUPT, true);
}
- parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
-
- parentMetaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is));
+ if (!parentMetaData.has(PAYLOAD_LENGTH) || bytes != parentMetaData.getLong(PAYLOAD_LENGTH)) {
+ parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
+ }
+ long trailingSlopBytes = StreamCopy.readToEOF(is);
+ if (!parentMetaData.has(PAYLOAD_SLOP_BYTES) || trailingSlopBytes > 0) {
+ parentMetaData.putLong(PAYLOAD_SLOP_BYTES, trailingSlopBytes);
+ }
if(type != null) {
parentMetaData.putString(PAYLOAD_CONTENT_TYPE, type);
}
diff --git a/src/main/java/org/archive/resource/warc/WARCResource.java b/src/main/java/org/archive/resource/warc/WARCResource.java
index d538a25d..a9c3fcc3 100644
--- a/src/main/java/org/archive/resource/warc/WARCResource.java
+++ b/src/main/java/org/archive/resource/warc/WARCResource.java
@@ -53,7 +53,7 @@ public WARCResource(MetaData metaData, ResourceContainer container,
countingIS = new CountingInputStream(
ByteStreams.limit(response, length));
} else {
- throw new ResourceParseException(null);
+ throw new ResourceParseException(new Exception("Zero or negative length: " + length));
}
try {
digIS = new DigestInputStream(countingIS,
@@ -63,14 +63,18 @@ public WARCResource(MetaData metaData, ResourceContainer container,
}
}
+ @Override
public InputStream getInputStream() {
return new EOFNotifyingInputStream(digIS, this);
}
+ @Override
public void notifyEOF() throws IOException {
String digString = Base32.encode(digIS.getMessageDigest().digest());
if(container.isCompressed()) {
- metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
+ if (!metaData.has(PAYLOAD_LENGTH) || countingIS.getCount() != metaData.getLong(PAYLOAD_LENGTH)) {
+ metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
+ }
metaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(response));
metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString);
} else {
@@ -81,13 +85,17 @@ public void notifyEOF() throws IOException {
(PushBackOneByteInputStream) raw;
long numNewlines = StreamCopy.skipChars(pb1bis, CR_NL_CHARS);
if(numNewlines > 0) {
- metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
+ long payloadLength = countingIS.getCount();
+ if (!metaData.has(PAYLOAD_LENGTH) || payloadLength != metaData.getLong(PAYLOAD_LENGTH)) {
+ metaData.putLong(PAYLOAD_LENGTH, payloadLength);
+ }
metaData.putLong(PAYLOAD_SLOP_BYTES, numNewlines);
metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString);
}
}
}
}
+
public MetaData getEnvelopeMetaData() {
return envelope;
}
diff --git a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java
index 0dfb2834..ba8a35da 100644
--- a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java
+++ b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java
@@ -21,6 +21,7 @@ public WARCMetaDataResourceFactory() {
parser = new HttpHeaderParser();
}
+ @Override
public Resource getResource(InputStream is, MetaData parentMetaData,
ResourceContainer container) throws ResourceParseException,
IOException {
@@ -33,8 +34,13 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
if(headers.isCorrupt()) {
md.putBoolean(WARC_META_FIELDS_CORRUPT, true);
}
- parentMetaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is));
- parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
+ long trailingSlopBytes = StreamCopy.readToEOF(is);
+ if (!parentMetaData.has(PAYLOAD_SLOP_BYTES) || trailingSlopBytes > 0) {
+ parentMetaData.putLong(PAYLOAD_SLOP_BYTES, trailingSlopBytes);
+ }
+ if (!parentMetaData.has(PAYLOAD_LENGTH) || bytes != parentMetaData.getLong(PAYLOAD_LENGTH)) {
+ parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
+ }
return new WARCMetaDataResource(md,container, headers);
} catch (HttpParseException e) {
diff --git a/src/test/java/org/archive/resource/arc/ARCResourceTest.java b/src/test/java/org/archive/resource/arc/ARCResourceTest.java
new file mode 100644
index 00000000..43116af7
--- /dev/null
+++ b/src/test/java/org/archive/resource/arc/ARCResourceTest.java
@@ -0,0 +1,48 @@
+package org.archive.resource.arc;
+
+
+import static org.archive.resource.ResourceConstants.PAYLOAD_LENGTH;
+import static org.archive.resource.ResourceConstants.PAYLOAD_SLOP_BYTES;
+
+import java.io.IOException;
+
+import org.archive.extract.ExtractingResourceFactoryMapper;
+import org.archive.extract.ExtractingResourceProducer;
+import org.archive.extract.ProducerUtils;
+import org.archive.extract.ResourceFactoryMapper;
+import org.archive.resource.Resource;
+import org.archive.resource.ResourceParseException;
+import org.archive.resource.ResourceProducer;
+import org.archive.util.StreamCopy;
+
+import org.json.JSONObject;
+
+import junit.framework.TestCase;
+
+public class ARCResourceTest extends TestCase {
+
+ public void testARCResource() throws ResourceParseException, IOException {
+ String testFileName = "../../format/arc/IAH-20080430204825-00000-blackbook-truncated.arc";
+ ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
+ ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
+ ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
+
+ Resource resource = extractor.getNext();
+
+ while (resource != null) {
+ JSONObject payloadMD = resource.getMetaData().getTopMetaData().getJSONObject("Envelope")
+ .getJSONObject("Payload-Metadata");
+ System.err.println(payloadMD);
+
+ if (payloadMD.has(PAYLOAD_LENGTH)) {
+ assertTrue(payloadMD.getLong(PAYLOAD_LENGTH) != -1);
+ }
+ if (payloadMD.has(PAYLOAD_SLOP_BYTES)) {
+ // does not occur with the tested ARC file
+ }
+
+ StreamCopy.readToEOF(resource.getInputStream());
+ resource = extractor.getNext();
+ }
+ }
+}
diff --git a/src/test/java/org/archive/resource/warc/WARCResourceTest.java b/src/test/java/org/archive/resource/warc/WARCResourceTest.java
new file mode 100644
index 00000000..1b935405
--- /dev/null
+++ b/src/test/java/org/archive/resource/warc/WARCResourceTest.java
@@ -0,0 +1,46 @@
+package org.archive.resource.warc;
+
+import static org.archive.resource.ResourceConstants.PAYLOAD_LENGTH;
+import static org.archive.resource.ResourceConstants.PAYLOAD_SLOP_BYTES;
+
+import java.io.IOException;
+
+import org.archive.extract.ExtractingResourceFactoryMapper;
+import org.archive.extract.ExtractingResourceProducer;
+import org.archive.extract.ProducerUtils;
+import org.archive.extract.ResourceFactoryMapper;
+import org.archive.resource.Resource;
+import org.archive.resource.ResourceParseException;
+import org.archive.resource.ResourceProducer;
+import org.archive.util.StreamCopy;
+
+import org.json.JSONObject;
+
+import junit.framework.TestCase;
+
+public class WARCResourceTest extends TestCase {
+
+ public void testWARCResource() throws ResourceParseException, IOException {
+ String testFileName = "../../format/warc/IAH-urls-wget.warc";
+ ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
+ ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
+ ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
+
+ Resource resource = extractor.getNext();
+
+ while (resource != null) {
+ JSONObject payloadMD = resource.getMetaData().getTopMetaData().getJSONObject("Envelope")
+ .getJSONObject("Payload-Metadata");
+
+ if (payloadMD.has(PAYLOAD_LENGTH)) {
+ assertTrue(payloadMD.getLong(PAYLOAD_LENGTH) != -1);
+ }
+ if (payloadMD.has(PAYLOAD_SLOP_BYTES)) {
+ assertEquals(4, payloadMD.getLong(PAYLOAD_SLOP_BYTES));
+ }
+
+ StreamCopy.readToEOF(resource.getInputStream());
+ resource = extractor.getNext();
+ }
+ }
+}
From c5b779128edd1f0fad2709d4ab1b797326c2cb6c Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 20 Dec 2024 14:10:44 +0900
Subject: [PATCH 103/216] Update CHANGES.md for 1.3.0
---
CHANGES.md | 37 +++++++++++++++++++++++++++++++++++++
1 file changed, 37 insertions(+)
diff --git a/CHANGES.md b/CHANGES.md
index e3afd137..8a0a7d20 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,40 @@
+1.3.0
+-----
+
+#### URL Canonicalization Changed
+
+The output of WaybackURLKeyMaker and other canonicalizers based on BasicURLCanonicalizer has changed for URLs that
+contain non-UTF-8 percent-encoded sequences. For example, when a URL contains "%C3%23" it will now be normalised to
+"%c3%23" whereas previous releases produced "%25c3%23". This change brings webarchive-commons more in line with pywb,
+surt (Python), warcio.js and RFC 3986. While CDX file compatibility with these newer tools should improve, note that CDX
+files generated by the new release which contain such URLs may not work correctly with existing versions of
+OpenWayback that use the older webarchive-commons. [#102](https://github.com/iipc/webarchive-commons/pull/102)
+
+#### Bug fixes
+
+* WAT: Duplicated payload metadata values for "Actual-Content-Length" and "Trailing-Slop-Length" [#103](https://github.com/iipc/webarchive-commons/pull/103)
+* ObjectPlusFilesOutputStream.hardlinkOrCopy now uses `Files.createLink()` instead of executing `ln`. This
+ prevents the potential for security vulnerabilities from command line option injection and improves portability.
+
+#### Dependency upgrades
+
+* fastutil removed
+* dsiutils removed
+
+#### Deprecations
+
+The following classes and enum members have been marked deprecated as a step towards removal of the dependency on
+Apache Commons HttpClient 3.1.
+
+* org.archive.httpclient.HttpRecorderGetMethod
+* org.archive.httpclient.HttpRecorderMethod
+* org.archive.httpclient.HttpRecorderPostMethod
+* org.archive.httpclient.SingleHttpConnectionManager
+* org.archive.httpclient.ThreadLocalHttpConnectionManager
+* org.archive.util.binsearch.impl.http.ApacheHttp31SLR
+* org.archive.util.binsearch.impl.http.ApacheHttp31SLRFactory
+* org.archive.util.binsearch.impl.http.HTTPSeekableLineReaderFactory.HttpLibs.APACHE_31
+
1.2.0
-----
From eee48cc18017dde59b1d12f11654a2c752c63d45 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 20 Dec 2024 14:12:09 +0900
Subject: [PATCH 104/216] [maven-release-plugin] prepare release
webarchive-commons-1.3.0
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 6dec154c..f489826c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commonswebarchive-commons
- 1.2.1-SNAPSHOT
+ 1.3.0jarwebarchive-commons
@@ -41,7 +41,7 @@
scm:git:git@github.com:iipc/webarchive-commons.gitscm:git:git@github.com:iipc/webarchive-commons.githttps://github.com/iipc/webarchive-commons
- HEAD
+ webarchive-commons-1.3.0
From a8fd8a74b83d3327bc074cf783f6315659fbc715 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 20 Dec 2024 14:12:13 +0900
Subject: [PATCH 105/216] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index f489826c..74a4bbe6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commonswebarchive-commons
- 1.3.0
+ 1.3.1-SNAPSHOTjarwebarchive-commons
@@ -41,7 +41,7 @@
scm:git:git@github.com:iipc/webarchive-commons.gitscm:git:git@github.com:iipc/webarchive-commons.githttps://github.com/iipc/webarchive-commons
- webarchive-commons-1.3.0
+ HEAD
From a3a39598fc7b6947e38161e9f27f6842eed95456 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 11 Mar 2025 10:20:00 +0100
Subject: [PATCH 106/216] Upgrade GitHub workflow actions cache
---
.github/workflows/maven.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
index 8bb55c4e..60fac096 100644
--- a/.github/workflows/maven.yml
+++ b/.github/workflows/maven.yml
@@ -24,7 +24,7 @@ jobs:
distribution: 'temurin'
cache: maven
- name: Cache local Maven repository
- uses: actions/cache@v2
+ uses: actions/cache@v4
with:
path: ~/.m2/repository
key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
From c427a12e82f3cebd6ba57152209d0bb5b9de2619 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Sun, 18 May 2025 09:39:48 +0900
Subject: [PATCH 107/216] Upgrade to JUnit 5
---
CHANGES.md | 7 +
pom.xml | 7 +-
.../java/org/archive/util/TmpDirTestCase.java | 119 ----
.../extract/RealCDXExtractorOutputTest.java | 31 +-
.../format/dns/DNSResponseParserTest.java | 10 +-
.../format/gzip/GZIPMemberSeriesTest.java | 38 +-
.../format/gzip/GZIPMemberWriterTest.java | 5 +-
.../format/gzip/zipnum/ZipNumWriterTest.java | 16 +-
.../http/HttpRequestMessageParserTest.java | 12 +-
.../format/http/HttpResponseParserTest.java | 14 +-
.../json/CompoundORJSONPathSpecTest.java | 5 +-
.../format/json/JSONPathSpecFactoryTest.java | 5 +-
.../org/archive/format/json/JSONViewTest.java | 9 +-
.../format/json/SimpleJSONPathSpecTest.java | 5 +-
.../format/text/html/CDATALexerTest.java | 14 +-
.../archive/io/ArchiveReaderFactoryTest.java | 27 +-
.../io/BufferedSeekInputStreamTest.java | 9 +-
.../archive/io/HeaderedArchiveRecordTest.java | 22 +-
.../archive/io/RecordingInputStreamTest.java | 39 +-
.../archive/io/RecordingOutputStreamTest.java | 74 ++-
.../archive/io/ReplayCharSequenceTest.java | 110 ++--
.../io/RepositionableInputStreamTest.java | 20 +-
.../archive/io/arc/ARCReaderFactoryTest.java | 13 +-
.../org/archive/io/arc/ARCWriterPoolTest.java | 41 +-
.../org/archive/io/arc/ARCWriterTest.java | 121 ++--
.../io/warc/WARCReaderFactoryTest.java | 7 +-
.../org/archive/io/warc/WARCWriterTest.java | 67 ++-
.../org/archive/net/PublicSuffixesTest.java | 55 +-
.../org/archive/resource/MetaDataTest.java | 21 +-
.../archive/resource/arc/ARCResourceTest.java | 6 +-
.../html/ExtractingParseObserverTest.java | 24 +-
.../resource/html/HTMLMetaDataTest.java | 12 +-
.../resource/warc/WARCResourceTest.java | 7 +-
.../org/archive/uid/UUIDGeneratorTest.java | 7 +-
.../url/AggressiveIAURLCanonicalizerTest.java | 9 +-
.../url/BasicURLCanonicalizerTest.java | 39 +-
.../java/org/archive/url/HandyURLTest.java | 13 +-
.../archive/url/IAURLCanonicalizerTest.java | 13 +-
.../url/OrdinaryIAURLCanonicalizerTest.java | 10 +-
.../java/org/archive/url/URLParserTest.java | 11 +-
.../archive/url/URLRegexTransformerTest.java | 45 +-
.../org/archive/url/UsableURIFactoryTest.java | 564 +++++++++---------
.../java/org/archive/url/UsableURITest.java | 16 +-
.../archive/url/WaybackURLKeyMakerTest.java | 7 +-
.../org/archive/util/ArchiveUtilsTest.java | 231 ++++---
.../java/org/archive/util/ByteOpTest.java | 14 +-
.../org/archive/util/CrossProductTest.java | 8 +-
.../java/org/archive/util/FileUtilsTest.java | 69 ++-
.../util/InterruptibleCharSequenceTest.java | 21 +-
.../org/archive/util/MimetypeUtilsTest.java | 63 +-
.../org/archive/util/PropertyUtilsTest.java | 11 +-
.../util/StringFieldExtractorTest.java | 10 +-
src/test/java/org/archive/util/TestUtils.java | 17 +-
.../org/archive/util/anvl/ANVLRecordTest.java | 56 +-
.../util/binsearch/SortedTextFileTest.java | 8 +-
.../iterator/CachingStringFilterTest.java | 5 +-
.../iterator/FilterStringIteratorTest.java | 25 +-
.../iterator/SortedCompositeIteratorTest.java | 8 +-
.../util/zip/GZIPMembersInputStreamTest.java | 157 ++---
59 files changed, 1236 insertions(+), 1173 deletions(-)
delete mode 100644 src/main/java/org/archive/util/TmpDirTestCase.java
diff --git a/CHANGES.md b/CHANGES.md
index 8a0a7d20..478238bf 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,10 @@
+Unreleased
+----------
+
+#### Dependency upgrades
+
+- **junit**: 4.13.2 → 5.12.2
+
1.3.0
-----
diff --git a/pom.xml b/pom.xml
index 74a4bbe6..c70a2cd7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -52,9 +52,10 @@
- junit
- junit
- 4.13.2
+ org.junit.jupiter
+ junit-jupiter
+ 5.12.2
+ test
diff --git a/src/main/java/org/archive/util/TmpDirTestCase.java b/src/main/java/org/archive/util/TmpDirTestCase.java
deleted file mode 100644
index 09ec345b..00000000
--- a/src/main/java/org/archive/util/TmpDirTestCase.java
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.archive.util;
-
-import java.io.File;
-import java.io.IOException;
-
-import junit.framework.TestCase;
-
-
-/**
- * Base class for TestCases that want access to a tmp dir for the writing
- * of files.
- *
- * @author stack
- */
-public abstract class TmpDirTestCase extends TestCase
-{
- /**
- * Name of the system property that holds pointer to tmp directory into
- * which we can safely write files.
- */
- public static final String TEST_TMP_SYSTEM_PROPERTY_NAME = "testtmpdir";
-
- /**
- * Default test tmp.
- */
- public static final String DEFAULT_TEST_TMP_DIR = File.separator + "tmp" +
- File.separator + "heritrix-junit-tests";
-
- /**
- * Directory to write temporary files to.
- */
- private File tmpDir = null;
-
-
- public TmpDirTestCase()
- {
- super();
- }
-
- public TmpDirTestCase(String testName)
- {
- super(testName);
- }
-
- /*
- * @see TestCase#setUp()
- */
- protected void setUp() throws Exception {
- super.setUp();
- this.tmpDir = tmpDir();
- }
-
- /**
- * @return Returns the tmpDir.
- */
- public File getTmpDir()
- {
- return this.tmpDir;
- }
-
- /**
- * Delete any files left over from previous run.
- *
- * @param basename Base name of files we're to clean up.
- */
- public void cleanUpOldFiles(String basename) {
- cleanUpOldFiles(getTmpDir(), basename);
- }
-
- /**
- * Delete any files left over from previous run.
- *
- * @param prefix Base name of files we're to clean up.
- * @param basedir Directory to start cleaning in.
- */
- public void cleanUpOldFiles(File basedir, String prefix) {
- File [] files = FileUtils.getFilesWithPrefix(basedir, prefix);
- if (files != null) {
- for (int i = 0; i < files.length; i++) {
- org.apache.commons.io.FileUtils.deleteQuietly(files[i]);
- }
- }
- }
-
-
- public static File tmpDir() throws IOException {
- String tmpDirStr = System.getProperty(TEST_TMP_SYSTEM_PROPERTY_NAME);
- tmpDirStr = (tmpDirStr == null)? DEFAULT_TEST_TMP_DIR: tmpDirStr;
- File tmpDir = new File(tmpDirStr);
- FileUtils.ensureWriteableDirectory(tmpDir);
-
- if (!tmpDir.canWrite())
- {
- throw new IOException(tmpDir.getAbsolutePath() +
- " is unwriteable.");
- }
-
- return tmpDir;
- }
-}
diff --git a/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java b/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java
index 14f8489d..a716df82 100644
--- a/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java
+++ b/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java
@@ -1,28 +1,29 @@
package org.archive.extract;
-import java.net.MalformedURLException;
import java.net.URI;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.net.URLEncoder;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
-public class RealCDXExtractorOutputTest extends TestCase {
+public class RealCDXExtractorOutputTest {
+
+ @Test
public void testEscapeResolvedUrl() throws Exception {
- String context ="http://www.uni-giessen.de/cms/studium/dateien/informationberatung/merkblattpdf";
- String spec = "http://fss.plone.uni-giessen.de/fß/studium/dateien/informationberatung/merkblattpdf/file/Mérkblatt zur Gestaltung von Nachteilsausgleichen.pdf?föo=bar#änchor";
- String escaped = RealCDXExtractorOutput.resolve(context, spec);
- assertTrue(escaped.indexOf(" ") < 0);
- URI parsed = new URI(escaped);
- assertEquals("änchor", parsed.getFragment());
+ String context = "http://www.uni-giessen.de/cms/studium/dateien/informationberatung/merkblattpdf";
+ String spec = "http://fss.plone.uni-giessen.de/fß/studium/dateien/informationberatung/merkblattpdf/file/Mérkblatt zur Gestaltung von Nachteilsausgleichen.pdf?föo=bar#änchor";
+ String escaped = RealCDXExtractorOutput.resolve(context, spec);
+ assertTrue(escaped.indexOf(" ") < 0);
+ URI parsed = new URI(escaped);
+ assertEquals("änchor", parsed.getFragment());
}
+ @Test
public void testNoDoubleEscaping() throws Exception {
- String spec = "https://www.google.com/search?q=java+escape+url+spaces&ie=utf-8&oe=utf-8";
- String resolved = RealCDXExtractorOutput.resolve(spec, spec);
- assertTrue(spec.equals(resolved));
+ String spec = "https://www.google.com/search?q=java+escape+url+spaces&ie=utf-8&oe=utf-8";
+ String resolved = RealCDXExtractorOutput.resolve(spec, spec);
+ assertTrue(spec.equals(resolved));
}
}
diff --git a/src/test/java/org/archive/format/dns/DNSResponseParserTest.java b/src/test/java/org/archive/format/dns/DNSResponseParserTest.java
index 27d0fdad..7ade0ad5 100644
--- a/src/test/java/org/archive/format/dns/DNSResponseParserTest.java
+++ b/src/test/java/org/archive/format/dns/DNSResponseParserTest.java
@@ -3,15 +3,13 @@
import java.io.ByteArrayInputStream;
import java.io.IOException;
-import org.archive.format.dns.DNSParseException;
-import org.archive.format.dns.DNSRecord;
-import org.archive.format.dns.DNSResponse;
-import org.archive.format.dns.DNSResponseParser;
+import org.junit.jupiter.api.Test;
-import junit.framework.TestCase;
+import static org.junit.jupiter.api.Assertions.assertEquals;
-public class DNSResponseParserTest extends TestCase {
+public class DNSResponseParserTest {
DNSResponseParser parser = new DNSResponseParser();
+ @Test
public void testParse() throws DNSParseException, IOException {
verifyResults("20110328212258\nfarm6.static.flickr.a06.yahoodns.net.\t300\tIN\tA\t98.136.170.121\n",
"20110328212258",new String[][] {{"farm6.static.flickr.a06.yahoodns.net.","300","IN","A","98.136.170.121"}});
diff --git a/src/test/java/org/archive/format/gzip/GZIPMemberSeriesTest.java b/src/test/java/org/archive/format/gzip/GZIPMemberSeriesTest.java
index 2eec46ec..6f218ebb 100644
--- a/src/test/java/org/archive/format/gzip/GZIPMemberSeriesTest.java
+++ b/src/test/java/org/archive/format/gzip/GZIPMemberSeriesTest.java
@@ -9,9 +9,6 @@
import org.archive.util.ByteOp;
import org.archive.util.IAUtils;
import org.archive.util.TestUtils;
-import org.archive.format.gzip.GZIPFormatException;
-import org.archive.format.gzip.GZIPMemberSeries;
-import org.archive.format.gzip.GZIPSeriesMember;
import org.archive.streamcontext.ByteArrayWrappedStream;
import org.archive.streamcontext.SimpleStream;
import org.archive.streamcontext.Stream;
@@ -19,10 +16,13 @@
import com.google.common.io.ByteStreams;
import com.google.common.primitives.Bytes;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class GZIPMemberSeriesTest extends TestCase {
+import static org.junit.jupiter.api.Assertions.*;
+public class GZIPMemberSeriesTest {
+
+ @Test
public void testSingle() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
@@ -38,6 +38,7 @@ public void testSingle() throws IndexOutOfBoundsException, FileNotFoundException
assertNull(s.getNextMember());
}
+ @Test
public void testSingleEmpty() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("empty.gz");
@@ -59,6 +60,7 @@ public void testSingleEmpty() throws IndexOutOfBoundsException, FileNotFoundExce
assertTrue(s.gotEOF());
}
+ @Test
public void testDouble() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
@@ -81,14 +83,14 @@ public void testDouble() throws IndexOutOfBoundsException, FileNotFoundException
assertNull(s.getNextMember());
}
-
+ @Test
public void testSingleCRCStrict() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
byte abcd[] = ByteStreams.toByteArray(is);
byte oldb = abcd[abcd.length-1];
abcd[abcd.length-1] = (byte) (abcd[abcd.length-1] + 1);
- assertFalse(oldb == abcd[abcd.length-1]);
+ assertNotEquals(oldb, abcd[abcd.length - 1]);
ByteArrayInputStream bais = new ByteArrayInputStream(abcd);
Stream stream = new SimpleStream(bais);
@@ -117,14 +119,15 @@ public void testSingleCRCStrict() throws IndexOutOfBoundsException, FileNotFound
}
assertNotNull(e);
}
-
+
+ @Test
public void testSingleCRCLAX() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
byte abcd[] = ByteStreams.toByteArray(is);
byte oldb = abcd[abcd.length-1];
abcd[abcd.length-1] = (byte) (abcd[abcd.length-1] + 1);
- assertFalse(oldb == abcd[abcd.length-1]);
+ assertNotEquals(oldb, abcd[abcd.length - 1]);
ByteArrayInputStream bais = new ByteArrayInputStream(abcd);
Stream stream = new SimpleStream(bais);
@@ -154,7 +157,8 @@ public void testSingleCRCLAX() throws IndexOutOfBoundsException, FileNotFoundExc
assertNull(e);
assertNull(s.getNextMember());
}
-
+
+ @Test
public void testDoubleCRC1LAX() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
@@ -162,7 +166,7 @@ public void testDoubleCRC1LAX() throws IndexOutOfBoundsException, FileNotFoundEx
byte abcdorig[] = ByteOp.copy(abcd);
byte oldb = abcd[abcd.length-1];
abcd[abcd.length-1] = (byte) (abcd[abcd.length-1] + 1);
- assertFalse(oldb == abcd[abcd.length-1]);
+ assertNotEquals(oldb, abcd[abcd.length - 1]);
byte both[] = Bytes.concat(abcd,abcdorig);
@@ -195,7 +199,8 @@ public void testDoubleCRC1LAX() throws IndexOutOfBoundsException, FileNotFoundEx
assertNotNull(m);
TestUtils.assertStreamEquals(m,"abcd".getBytes(IAUtils.UTF8));
}
-
+
+ @Test
public void testSingleDeflateError() throws IndexOutOfBoundsException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
@@ -240,7 +245,7 @@ public void testSingleDeflateError() throws IndexOutOfBoundsException, IOExcepti
assertNull(m);
}
-
+ @Test
public void testDoubleDeflateError() throws IndexOutOfBoundsException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
@@ -290,7 +295,8 @@ public void testDoubleDeflateError() throws IndexOutOfBoundsException, IOExcepti
assertFalse(s.gotIOError());
}
-
+
+ @Test
public void testDoubleBiggerDeflateErrOnFirst() throws IOException {
String resource = "double-single-inflate-error.gz";
InputStream is = getClass().getResourceAsStream(resource);
@@ -333,7 +339,8 @@ public void testDoubleBiggerDeflateErrOnFirst() throws IOException {
}
-
+
+ @Test
public void testAutoSkip() throws IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
byte abcd[] = ByteStreams.toByteArray(is);
@@ -375,6 +382,7 @@ public void testAutoSkip() throws IOException {
assertTrue(s.gotEOF());
}
+ @Test
public void testWgetProblem() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("IAH-urls-wget.warc.gz");
new GZIPDecoder().parseHeader(is);
diff --git a/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java b/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java
index 483d2baf..45bc18e4 100644
--- a/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java
+++ b/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java
@@ -7,10 +7,11 @@
import org.archive.util.IAUtils;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class GZIPMemberWriterTest extends TestCase {
+public class GZIPMemberWriterTest {
+ @Test
public void testWrite() throws IOException {
File outFile = File.createTempFile("tmp", ".gz");
GZIPMemberWriter gzw = new GZIPMemberWriter(new FileOutputStream(outFile));
diff --git a/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java b/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java
index cfadbd79..25a5eaa7 100644
--- a/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java
+++ b/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java
@@ -10,19 +10,21 @@
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
-import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import org.archive.format.gzip.GZIPMemberSeries;
import org.archive.format.gzip.GZIPSeriesMember;
import org.archive.streamcontext.SimpleStream;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class ZipNumWriterTest extends TestCase {
+import static org.junit.jupiter.api.Assertions.assertEquals;
+public class ZipNumWriterTest {
+
+ @Test
public void testAddRecord() throws IOException {
- Charset UTF8 = Charset.forName("UTF-8");
- File main = File.createTempFile("test-znw",".main");
+ File main = File.createTempFile("test-znw",".main");
File summ = File.createTempFile("test-znw",".summ");
main.deleteOnExit();
summ.deleteOnExit();
@@ -31,11 +33,11 @@ public void testAddRecord() throws IOException {
ZipNumWriter znw = new ZipNumWriter(new FileOutputStream(main,false),
new FileOutputStream(summ,false), limit);
for(int i = 0; i < 1000; i++) {
- znw.addRecord(String.format("%06d\n",i).getBytes(UTF8));
+ znw.addRecord(String.format("%06d\n",i).getBytes(StandardCharsets.UTF_8));
}
znw.close();
InputStreamReader isr =
- new InputStreamReader(new FileInputStream(summ),UTF8);
+ new InputStreamReader(new FileInputStream(summ), StandardCharsets.UTF_8);
BufferedReader br = new BufferedReader(isr);
String line = null;
int count = 0;
diff --git a/src/test/java/org/archive/format/http/HttpRequestMessageParserTest.java b/src/test/java/org/archive/format/http/HttpRequestMessageParserTest.java
index 50df9dde..9a5d69af 100644
--- a/src/test/java/org/archive/format/http/HttpRequestMessageParserTest.java
+++ b/src/test/java/org/archive/format/http/HttpRequestMessageParserTest.java
@@ -3,16 +3,16 @@
import java.io.ByteArrayInputStream;
import java.io.IOException;
-import org.archive.format.http.HttpConstants;
-import org.archive.format.http.HttpParseException;
-import org.archive.format.http.HttpRequestMessage;
-import org.archive.format.http.HttpRequestMessageParser;
import org.archive.util.IAUtils;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class HttpRequestMessageParserTest extends TestCase implements HttpConstants {
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class HttpRequestMessageParserTest implements HttpConstants {
HttpRequestMessageParser parser = new HttpRequestMessageParser();
+
+ @Test
public void testParse() throws IOException {
assertParse("GET / HTTP/1.0\r\n", METHOD_GET, "/", VERSION_0);
assertParse("GET / HTTP/1.1\r\n", METHOD_GET, "/", VERSION_1);
diff --git a/src/test/java/org/archive/format/http/HttpResponseParserTest.java b/src/test/java/org/archive/format/http/HttpResponseParserTest.java
index ea076a69..631d67c7 100644
--- a/src/test/java/org/archive/format/http/HttpResponseParserTest.java
+++ b/src/test/java/org/archive/format/http/HttpResponseParserTest.java
@@ -5,16 +5,14 @@
import org.archive.util.IAUtils;
import org.archive.util.TestUtils;
-import org.archive.format.http.HttpHeader;
-import org.archive.format.http.HttpHeaders;
-import org.archive.format.http.HttpParseException;
-import org.archive.format.http.HttpResponse;
-import org.archive.format.http.HttpResponseParser;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class HttpResponseParserTest extends TestCase {
+import static org.junit.jupiter.api.Assertions.*;
+public class HttpResponseParserTest {
+
+ @Test
public void testParse() throws IOException {
HttpResponseParser parser = new HttpResponseParser();
@@ -38,6 +36,7 @@ public void testParse() throws IOException {
}
+ @Test
public void testParseWithLf() throws IOException {
HttpResponseParser parser = new HttpResponseParser();
@@ -57,6 +56,7 @@ public void testParseWithLf() throws IOException {
}
+ @Test
public void testParseEmptyHeaderField() throws IOException {
HttpResponseParser parser = new HttpResponseParser();
diff --git a/src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java b/src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java
index 57c21965..ef8c2fa0 100644
--- a/src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java
+++ b/src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java
@@ -6,11 +6,12 @@
import org.json.JSONException;
import org.json.JSONObject;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class CompoundORJSONPathSpecTest extends TestCase {
+public class CompoundORJSONPathSpecTest {
String json1S = "{\"a\":\"A\"}";
String json2S = "{\"b\":\"B\"}";
+ @Test
public void testExtract() throws JSONException {
JSONObject json1 = new JSONObject(json1S);
JSONObject json2 = new JSONObject(json2S);
diff --git a/src/test/java/org/archive/format/json/JSONPathSpecFactoryTest.java b/src/test/java/org/archive/format/json/JSONPathSpecFactoryTest.java
index ab999dca..257cb112 100644
--- a/src/test/java/org/archive/format/json/JSONPathSpecFactoryTest.java
+++ b/src/test/java/org/archive/format/json/JSONPathSpecFactoryTest.java
@@ -4,9 +4,9 @@
import org.json.JSONException;
import org.json.JSONObject;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class JSONPathSpecFactoryTest extends TestCase {
+public class JSONPathSpecFactoryTest {
String json1S = "{\"a\":\"A\"}";
String json2S = "{\"b\":\"B\"}";
@@ -14,6 +14,7 @@ public class JSONPathSpecFactoryTest extends TestCase {
String json4S = "{\"b\":[{\"x\":\"x1\", \"y\":\"y1\"},{\"x\":\"x2\", \"y\":\"y2\"}]}";
+ @Test
public void testGet() throws JSONException {
JSONObject json1 = new JSONObject(json1S);
JSONObject json2 = new JSONObject(json2S);
diff --git a/src/test/java/org/archive/format/json/JSONViewTest.java b/src/test/java/org/archive/format/json/JSONViewTest.java
index 20bd4fe6..aabbe7df 100644
--- a/src/test/java/org/archive/format/json/JSONViewTest.java
+++ b/src/test/java/org/archive/format/json/JSONViewTest.java
@@ -4,14 +4,15 @@
import org.json.JSONException;
import org.json.JSONObject;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class JSONViewTest extends TestCase {
+public class JSONViewTest {
public int getInt(byte b[]) {
return b[0] & 0xff;
}
-
+
+ @Test
public void testBytes() throws JSONException {
JSONObject o = new JSONObject();
o.append("name1", "val\\rue1");
@@ -28,6 +29,8 @@ public void testBytes() throws JSONException {
System.out.format("I(%d) gi(%d)\n",i,gi);
}
}
+
+ @Test
public void testApply() throws JSONException {
String json1S = "{\"url\":\"a\",\"link\":[{\"zz\":\"1\",\"qq\":\"qa\"},{\"zz2\":\"2\",\"qq\":\"qb\"},{\"zz\":\"3\",\"qq\":\"qc\"},{\"zz\":\"4\"}]}";
JSONObject json1 = new JSONObject(json1S);
diff --git a/src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java b/src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java
index a703b49a..640a5a80 100644
--- a/src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java
+++ b/src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java
@@ -4,15 +4,16 @@
import org.json.JSONException;
import org.json.JSONObject;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class SimpleJSONPathSpecTest extends TestCase {
+public class SimpleJSONPathSpecTest {
String json1 = "{\"a\": { \"b\": \"Foo\" }}";
String json2 = "{\"a\": { \"b\": [{\"a\":\"1\"},{\"a\":\"2\"}] }}";
String json3 = "{\"a\": { \"b\": {\"A\":\"11\",\"B\":\"22\"} }}";
String json4 = "{\"a\": { \"b\": [{\"A\":\"11\",\"B\":\"22\"},{\"A\":\"33\",\"B\":\"44\"}] }}";
+ @Test
public void testExtract() throws JSONException {
JSONObject json = new JSONObject(json1);
JSONPathSpec spec = new SimpleJSONPathSpec("a.b");
diff --git a/src/test/java/org/archive/format/text/html/CDATALexerTest.java b/src/test/java/org/archive/format/text/html/CDATALexerTest.java
index 481a3eda..856576ba 100644
--- a/src/test/java/org/archive/format/text/html/CDATALexerTest.java
+++ b/src/test/java/org/archive/format/text/html/CDATALexerTest.java
@@ -1,17 +1,16 @@
package org.archive.format.text.html;
-import org.archive.format.text.html.CDATALexer;
-import org.archive.format.text.html.NodeUtils;
import org.htmlparser.Node;
import org.htmlparser.lexer.Page;
-//import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.ParserException;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class CDATALexerTest extends TestCase {
+import static org.junit.jupiter.api.Assertions.*;
+
+public class CDATALexerTest {
CDATALexer l;
Node n;
private CDATALexer makeLexer(String html) {
@@ -19,7 +18,8 @@ private CDATALexer makeLexer(String html) {
t.setPage(new Page(html));
return t;
}
-
+
+ @Test
public void testNextNode() throws ParserException {
l = makeLexer("blem");
n = l.nextNode();
@@ -35,6 +35,7 @@ public void testNextNode() throws ParserException {
assertNull(l.nextNode());
}
+ @Test
public void testInJS() throws ParserException {
l = makeLexer("");
assertFalse(l.inCSS());
@@ -54,6 +55,7 @@ public void testInJS() throws ParserException {
assertTrue(NodeUtils.isCloseTagNodeNamed(n, "SCRIPT"));
}
+ @Test
public void testInCSS() throws ParserException {
l = makeLexer("");
assertFalse(l.inCSS());
diff --git a/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java b/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java
index 2313868c..f7ad75d2 100644
--- a/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java
+++ b/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java
@@ -21,29 +21,34 @@
import java.io.File;
import java.io.IOException;
-import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import org.apache.commons.lang.StringUtils;
-import org.archive.io.ArchiveRecord;
import org.archive.io.arc.ARCWriterTest;
-import org.archive.util.TmpDirTestCase;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class ArchiveReaderFactoryTest {
+ @TempDir
+ File tempDir;
-public class ArchiveReaderFactoryTest extends TmpDirTestCase {
/**
* Test local file as URL
* @throws IOException
*/
+ @Test
public void testGetFileURL() throws IOException {
- File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
+ File arc = ARCWriterTest.createARCFile(tempDir, true);
ArchiveReader reader = null;
try {
reader = ArchiveReaderFactory.
get(new URL("file:////" + arc.getAbsolutePath()));
for (Iterator i = reader.iterator(); i.hasNext();) {
ArchiveRecord r = (ArchiveRecord)i.next();
- assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype()));
+ assertTrue(StringUtils.isNotBlank(r.getHeader().getMimetype()),"mime unread");
}
} finally {
if (reader != null) {
@@ -56,14 +61,15 @@ public void testGetFileURL() throws IOException {
* Test local file as File
* @throws IOException
*/
+ @Test
public void testGetFile() throws IOException {
- File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
+ File arc = ARCWriterTest.createARCFile(tempDir, true);
ArchiveReader reader = null;
try {
reader = ArchiveReaderFactory.get(arc.getAbsoluteFile());
for (Iterator i = reader.iterator(); i.hasNext();) {
ArchiveRecord r = (ArchiveRecord)i.next();
- assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype()));
+ assertTrue(StringUtils.isNotBlank(r.getHeader().getMimetype()),"mime unread");
}
} finally {
if (reader != null) {
@@ -76,14 +82,15 @@ public void testGetFile() throws IOException {
* Test local file as String path
* @throws IOException
*/
+ @Test
public void testGetPath() throws IOException {
- File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
+ File arc = ARCWriterTest.createARCFile(tempDir, true);
ArchiveReader reader = null;
try {
reader = ArchiveReaderFactory.get(arc.getAbsoluteFile().getAbsolutePath());
for (Iterator i = reader.iterator(); i.hasNext();) {
ArchiveRecord r = (ArchiveRecord)i.next();
- assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype()));
+ assertTrue(StringUtils.isNotBlank(r.getHeader().getMimetype()),"mime unread");
}
} finally {
if (reader != null) {
diff --git a/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java b/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java
index 270e45e0..f7e8e0b2 100644
--- a/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java
+++ b/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java
@@ -18,9 +18,11 @@
*/
package org.archive.io;
+import org.junit.jupiter.api.Test;
+
import java.util.Random;
-import junit.framework.TestCase;
+import static org.junit.jupiter.api.Assertions.assertEquals;
/**
@@ -29,11 +31,12 @@
*
* @author pjack
*/
-public class BufferedSeekInputStreamTest extends TestCase {
+public class BufferedSeekInputStreamTest {
private static byte[] TEST_DATA = makeTestData();
-
+
+ @Test
public void testPosition() throws Exception {
Random random = new Random();
ArraySeekInputStream asis = new ArraySeekInputStream(TEST_DATA);
diff --git a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
index 9f7e2a15..7988cb2b 100644
--- a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
+++ b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
@@ -26,13 +26,15 @@
import java.util.Map;
import java.util.Set;
-import junit.framework.TestCase;
-
import org.apache.commons.httpclient.Header;
import org.archive.io.arc.ARCRecord;
import org.archive.io.warc.WARCRecord;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
-public class HeaderedArchiveRecordTest extends TestCase {
+public class HeaderedArchiveRecordTest {
private static final String HTTPHEADER = "HTTP/1.1 200 OK\r\n"
+ "Last-Modified: Sun, 28 Aug 2005 14:10:55 GMT\r\n"
+ "Content-Length: 108\r\n" + "Connection: close\r\n"
@@ -41,6 +43,7 @@ public class HeaderedArchiveRecordTest extends TestCase {
+ " Neue Seite 1\r\n" + " \r\n"
+ " \r\n" + " \r\n" + "