From a2cc42cac2777d06ab40e09811cdc883773775b9 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 11 Jun 2020 14:24:03 +0200
Subject: [PATCH 001/174] WAT extractor: do not fail on missing WARC-Filename in warcinfo record, fixes #88

- do not throw IOException if there is no WARC-Filename in warcinfo record
- write metadata record (corresponding to warcinfo) without WARC-Target-URI
---
src/main/java/org/archive/extract/WATExtractorOutput.java | 2 +-
src/main/java/org/archive/format/warc/WARCRecordWriter.java | 5 ++++-
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java
index 3bcfa924..4b5f72ed 100644
--- a/src/main/java/org/archive/extract/WATExtractorOutput.java
+++ b/src/main/java/org/archive/extract/WATExtractorOutput.java
@@ -151,7 +151,7 @@ private void writeWARC(OutputStream recOut, MetaData md) throws IOException {
String warcType = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Type");
String targetURI;
if(warcType.equals("warcinfo")) {
- targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Filename");
+ targetURI = JSONUtils.extractSingle(md, "Envelope.WARC-Header-Metadata.WARC-Filename");
} else {
targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI");
}
diff --git a/src/main/java/org/archive/format/warc/WARCRecordWriter.java b/src/main/java/org/archive/format/warc/WARCRecordWriter.java
index 0aab83b7..3278b289 100644
--- a/src/main/java/org/archive/format/warc/WARCRecordWriter.java
+++ b/src/main/java/org/archive/format/warc/WARCRecordWriter.java
@@ -88,7 +88,10 @@ public void writeJSONMetadataRecord( OutputStream out,
{
HttpHeaders headers = new HttpHeaders();
headers.add(HEADER_KEY_TYPE, WARCRecordType.metadata.name());
- headers.add(HEADER_KEY_URI, targetURI);
+ if (targetURI != null) {
+ // WARC-Target-URI is optional in metadata records
+ headers.add(HEADER_KEY_URI, targetURI);
+ }
headers.add(HEADER_KEY_DATE, DateUtils.getLog14Date(originalDate));
headers.add(HEADER_KEY_ID, makeRecordId());
headers.add(HEADER_KEY_REFERS_TO, origRecordId);
From 04e10397b9137a36812c17276826bc60d1a37ede Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Mon, 15 Jun 2020 13:29:25 +0200
Subject: [PATCH 002/174] Update change log to include #83, #85, #86 and #89
---
CHANGES.md | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/CHANGES.md b/CHANGES.md
index dcb598d9..bf985ada 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,10 @@
+1.1.10
+------
+* [WAT extractor: do not fail on missing WARC-Filename in warcinfo record](https://github.com/iipc/webarchive-commons/pull/89)
+* [ExtractingParseObserver: extract rel, hreflang and type attributes](https://github.com/iipc/webarchive-commons/pull/86)
+* [ExtractingParseObserver: extract links from onClick attributes](https://github.com/iipc/webarchive-commons/pull/85)
+* [Update TravisCI config](https://github.com/iipc/webarchive-commons/pull/83)
+
1.1.9
-----
* [Use commons-collections v3.2.2 to avoid v3.2.1 vulnerability](https://github.com/iipc/webarchive-commons/pull/77)
From 9041ff4e96f6554658742affe490223dc0241d06 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 13 Oct 2020 01:28:48 +0000
Subject: [PATCH 003/174] Bump junit from 3.8.1 to 4.13.1
Bumps [junit](https://github.com/junit-team/junit4) from 3.8.1 to 4.13.1.
- [Release notes](https://github.com/junit-team/junit4/releases)
- [Changelog](https://github.com/junit-team/junit4/blob/main/doc/ReleaseNotes4.13.1.md)
- [Commits](https://github.com/junit-team/junit4/commits/r4.13.1)
Signed-off-by: dependabot[bot]
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 1cbeb99a..5ca7e1a3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -64,7 +64,7 @@
<groupId>junit</groupId>
<artifactId>junit</artifactId>
- <version>3.8.1</version>
+ <version>4.13.1</version>
From c2530d77b73838c31f4e83f2be941ec61032ebb2 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 16 Mar 2021 11:58:11 +0100
Subject: [PATCH 004/174] Fix InterruptibleCharSequenceTest (testInterruptibility) to run on JDK 11

- if the thread running the regexp matching has already finished after the initial/current sleeping time, rerun the test with a shorter sleeping time until the expected RuntimeException is hit
---
.../util/InterruptibleCharSequenceTest.java | 26 +++++++++++++------
1 file changed, 18 insertions(+), 8 deletions(-)
diff --git a/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java b/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java
index a3a5f180..8b5c5d1b 100644
--- a/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java
+++ b/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java
@@ -107,14 +107,24 @@ public void testNoninterruptible() throws InterruptedException {
}
public void testInterruptibility() throws InterruptedException {
- BlockingQueue