From c2530d77b73838c31f4e83f2be941ec61032ebb2 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 16 Mar 2021 11:58:11 +0100
Subject: [PATCH 001/169] Fix InterruptibleCharSequenceTest
(testInterruptibility) to run on JDK 11 - if thread running the regexp
matching has already finished after the initial/current sleeping time, rerun
the test with a shorter sleeping time until the expected
RuntimeException is hit
---
.../util/InterruptibleCharSequenceTest.java | 26 +++++++++++++------
1 file changed, 18 insertions(+), 8 deletions(-)
diff --git a/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java b/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java
index a3a5f180..8b5c5d1b 100644
--- a/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java
+++ b/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java
@@ -107,14 +107,24 @@ public void testNoninterruptible() throws InterruptedException {
}
public void testInterruptibility() throws InterruptedException {
- BlockingQueue q = new LinkedBlockingQueue();
- Thread t = tryMatchInThread(new InterruptibleCharSequence(INPUT), BACKTRACKER, q);
- Thread.sleep(500);
- t.interrupt();
- Object result = q.take();
- if(result instanceof Boolean) {
- System.err.println(result+" match beat interrupt");
+ long sleepMillis = 512;
+ while (sleepMillis > 0) {
+ BlockingQueue q = new LinkedBlockingQueue();
+ Thread t = tryMatchInThread(new InterruptibleCharSequence(INPUT), BACKTRACKER, q);
+ Thread.sleep(sleepMillis);
+ if (t.getState() == Thread.State.TERMINATED) {
+ sleepMillis /= 2;
+ System.err.println("already done, retrying with shorter sleep time: " + sleepMillis + "ms");
+ continue;
+ }
+ t.interrupt();
+ Object result = q.take();
+ if(result instanceof Boolean) {
+ System.err.println(result+" match beat interrupt");
+ }
+ assertTrue("exception not thrown",result instanceof RuntimeException);
+ return;
}
- assertTrue("exception not thrown",result instanceof RuntimeException);
+ fail("failed to interrupt InterruptibleCharSequence with given sleeping intervals");
}
}
From ad6904bc0d43538806d3a4c00c636183e40392a4 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 26 Apr 2021 17:20:03 +0000
Subject: [PATCH 002/169] Bump commons-io from 2.4 to 2.7
Bumps commons-io from 2.4 to 2.7.
Signed-off-by: dependabot[bot]
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 5ca7e1a3..67785dd2 100644
--- a/pom.xml
+++ b/pom.xml
@@ -152,7 +152,7 @@
commons-io
commons-io
- 2.4
+ 2.7
From efbd7616bbc4b7d700b3923a52ade2de2f5a00e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristinn=20Sigur=C3=B0sson?=
Date: Fri, 3 Nov 2023 09:32:36 +0000
Subject: [PATCH 003/169] Update to dsiutils 2.2.8
Oldest version to not depend on log4j 1
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 5ca7e1a3..067ae72b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -163,7 +163,7 @@
it.unimi.dsi
dsiutils
- 2.0.12
+ 2.2.8
compile
From 4a9f2807396a6199b9681a02dceb1b63e59f4863 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 10 Sep 2024 14:03:58 +0900
Subject: [PATCH 004/169] Add github action to run tests
---
.github/workflows/maven.yml | 38 +++++++++++++++++++++++++++++++++++++
1 file changed, 38 insertions(+)
create mode 100644 .github/workflows/maven.yml
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
new file mode 100644
index 00000000..ea8a34e9
--- /dev/null
+++ b/.github/workflows/maven.yml
@@ -0,0 +1,38 @@
+name: Java CI with Maven
+
+on:
+ push:
+ branches: [ "master" ]
+ pull_request:
+ branches: [ "master" ]
+
+jobs:
+ build:
+ strategy:
+ matrix:
+ jdk: [8, 11, 17, 21, 22]
+
+ runs-on: ubuntu-latest
+ timeout-minutes: 30
+
+ steps:
+ - uses: actions/checkout@v4
+ - name: Set up JDK ${{ matrix.jdk }}
+ uses: actions/setup-java@v4
+ with:
+ java-version: ${{ matrix.jdk }}
+ distribution: 'temurin'
+ cache: maven
+ - name: Cache local Maven repository
+ uses: actions/cache@v2
+ with:
+ path: ~/.m2/repository
+ key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
+ restore-keys: |
+ ${{ runner.os }}-maven-
+ - name: Build with Maven
+ run: mvn -B package --file pom.xml
+
+ # Optional: Uploads the full dependency graph to GitHub to improve the quality of Dependabot alerts this repository can receive
+ - name: Update dependency graph
+ uses: advanced-security/maven-dependency-submission-action@571e99aab1055c2e71a1e2309b9691de18d6b7d6
From b389fe5b3b880f9eeb7a6b3612a91724ad968347 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 10 Sep 2024 14:10:57 +0900
Subject: [PATCH 005/169] Update source & target version from 1.6 to 8
1.6 isn't supported on newer JDK versions.
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 5ca7e1a3..3272c7b6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -204,8 +204,8 @@
maven-compiler-plugin
2.3.2
- 1.6
- 1.6
+ 8
+ 8
From 7f9dc992fe374d3230f77e979971e6971414f86e Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 10 Sep 2024 14:38:48 +0900
Subject: [PATCH 006/169] Add matrix config to
maven-dependency-submission-action
---
.github/workflows/maven.yml | 3 +++
1 file changed, 3 insertions(+)
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
index ea8a34e9..8b675913 100644
--- a/.github/workflows/maven.yml
+++ b/.github/workflows/maven.yml
@@ -36,3 +36,6 @@ jobs:
# Optional: Uploads the full dependency graph to GitHub to improve the quality of Dependabot alerts this repository can receive
- name: Update dependency graph
uses: advanced-security/maven-dependency-submission-action@571e99aab1055c2e71a1e2309b9691de18d6b7d6
+ with:
+ directory: ${{ matrix.directory }}
+ correlator: ${{ github.job }}-${{ matrix.directory }}
\ No newline at end of file
From db88f33fb492f0f0a98ff7d851ff9cb06e0a5ec5 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 10 Sep 2024 14:41:47 +0900
Subject: [PATCH 007/169] Update to maven-dependency-submission-action@v4.1.1
and remove matrix stuff
I don't think the matrix stuff actually fixes the problem.
---
.github/workflows/maven.yml | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
index 8b675913..db0d8677 100644
--- a/.github/workflows/maven.yml
+++ b/.github/workflows/maven.yml
@@ -35,7 +35,4 @@ jobs:
# Optional: Uploads the full dependency graph to GitHub to improve the quality of Dependabot alerts this repository can receive
- name: Update dependency graph
- uses: advanced-security/maven-dependency-submission-action@571e99aab1055c2e71a1e2309b9691de18d6b7d6
- with:
- directory: ${{ matrix.directory }}
- correlator: ${{ github.job }}-${{ matrix.directory }}
\ No newline at end of file
+ uses: advanced-security/maven-dependency-submission-action@v4.1.1
\ No newline at end of file
From 0f0d68bb771d5e2fb66590c782aa6da50043599a Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 10 Sep 2024 14:46:03 +0900
Subject: [PATCH 008/169] Only run maven-dependency-submission-action on push
Hopefully fixes test failures on PRs.
---
.github/workflows/maven.yml | 1 +
1 file changed, 1 insertion(+)
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
index db0d8677..8bb55c4e 100644
--- a/.github/workflows/maven.yml
+++ b/.github/workflows/maven.yml
@@ -35,4 +35,5 @@ jobs:
# Optional: Uploads the full dependency graph to GitHub to improve the quality of Dependabot alerts this repository can receive
- name: Update dependency graph
+ if: ${{ github.event_name == 'push' }}
uses: advanced-security/maven-dependency-submission-action@v4.1.1
\ No newline at end of file
From cddea766638c92bd74072346de3466b2a570c714 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 10 Sep 2024 05:51:53 +0000
Subject: [PATCH 009/169] Bump org.json:json from 20131018 to 20231013
Bumps [org.json:json](https://github.com/douglascrockford/JSON-java) from 20131018 to 20231013.
- [Release notes](https://github.com/douglascrockford/JSON-java/releases)
- [Changelog](https://github.com/stleary/JSON-java/blob/master/docs/RELEASES.md)
- [Commits](https://github.com/douglascrockford/JSON-java/commits)
---
updated-dependencies:
- dependency-name: org.json:json
dependency-type: direct:production
...
Signed-off-by: dependabot[bot]
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index fee61789..a496d3b8 100644
--- a/pom.xml
+++ b/pom.xml
@@ -76,7 +76,7 @@
org.json
json
- 20131018
+ 20231013
org.htmlparser
From cb1b2af3de78289c648d3856692a188d19e5a412 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 11 Sep 2024 14:13:02 +0900
Subject: [PATCH 010/169] Bump httpclient from 3.1 to 4.5.14
---
pom.xml | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/pom.xml b/pom.xml
index a496d3b8..4c325df8 100644
--- a/pom.xml
+++ b/pom.xml
@@ -91,9 +91,9 @@
- commons-httpclient
- commons-httpclient
- 3.1
+ org.apache.httpcomponents
+ httpclient
+ 4.5.14
From 573443b0a8d1541fe1a164dbf43ce81aa69e3c04 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 11 Sep 2024 14:16:19 +0900
Subject: [PATCH 011/169] Bump guava from 17.0 to 33.3.0-jre
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 4c325df8..83d34490 100644
--- a/pom.xml
+++ b/pom.xml
@@ -70,7 +70,7 @@
com.google.guava
guava
- 17.0
+ 33.3.0-jre
From 282cecce3fc92cfc891624c4575ce49a63669f8d Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 11 Sep 2024 14:39:18 +0900
Subject: [PATCH 012/169] Bump hadoop from 0.20.2-cdh3u4 to 3.4.0 and mark
optional
Most dependent applications don't actually use Hadoop, and those that do will likely want to specify their own version of it. Hadoop also pulls in a lot of transitive dependencies that consuming projects often have to exclude.
---
pom.xml | 66 +++++++++------------------------------------------------
1 file changed, 10 insertions(+), 56 deletions(-)
diff --git a/pom.xml b/pom.xml
index 83d34490..8f4dba1a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -98,42 +98,16 @@
org.apache.hadoop
- hadoop-core
- 0.20.2-cdh3u4
-
-
- commons-httpclient
- commons-httpclient
-
-
- javax.servlet
- servlet-api
-
-
- javax.servlet.jsp
- jsp-api
-
-
- org.mortbay.jetty
- jetty
-
-
- org.mortbay.jetty
- jetty-util
-
-
- tomcat
- jasper-runtime
-
-
- tomcat
- jasper-compiler
-
-
- hsqldb
- hsqldb
-
-
+ hadoop-common
+ 3.4.0
+ true
+
+
+
+ org.apache.hadoop
+ hadoop-mapreduce-client-core
+ 3.4.0
+ true
@@ -257,26 +231,6 @@
-
-
- cloudera
- Cloudera Hadoop
- https://repository.cloudera.com/artifactory/cloudera-repos/
- default
-
-
- true
- daily
- warn
-
-
- true
- daily
- warn
-
-
-
-
)
+ * |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more-->)
* +---+---+---+---+---+---+---+---+---+---+
*/
public class GZIPStaticHeader implements GZIPConstants {
diff --git a/src/main/java/org/archive/io/ReplayCharSequence.java b/src/main/java/org/archive/io/ReplayCharSequence.java
index aa9b9587..e456e293 100644
--- a/src/main/java/org/archive/io/ReplayCharSequence.java
+++ b/src/main/java/org/archive/io/ReplayCharSequence.java
@@ -59,7 +59,7 @@ public interface ReplayCharSequence extends CharSequence, Closeable {
public long getDecodeExceptionCount();
/**
- * Return the first coding-exception encountered, if the count > 0.
+ * Return the first coding-exception encountered, if the count > 0.
* @return CharacterCodingException
*/
public CharacterCodingException getCodingException();
diff --git a/src/main/java/org/archive/io/arc/ARCWriter.java b/src/main/java/org/archive/io/arc/ARCWriter.java
index 0bd0ef9b..c7042943 100644
--- a/src/main/java/org/archive/io/arc/ARCWriter.java
+++ b/src/main/java/org/archive/io/arc/ARCWriter.java
@@ -86,7 +86,7 @@
* write our own GZIP*Streams, ones that resettable and consious of gzip
* members.
*
- * This class will write until we hit >= maxSize. The check is done at
+ *
This class will write until we hit >= maxSize. The check is done at
* record boundary. Records do not span ARC files. We will then close current
* file and open another and then continue writing.
*
@@ -95,9 +95,9 @@
* alexa
* ARC c-tools :
*
- * % av_procarc hx20040109230030-0.arc.gz | av_ziparc > \
+ * % av_procarc hx20040109230030-0.arc.gz | av_ziparc > \
* /tmp/hx20040109230030-0.dat.gz
- * % av_ripdat /tmp/hx20040109230030-0.dat.gz > /tmp/hx20040109230030-0.cdx
+ * % av_ripdat /tmp/hx20040109230030-0.dat.gz > /tmp/hx20040109230030-0.cdx
*
* Examine the produced cdx file to make sure it makes sense. Search
* for 'no-type 0'. If found, then we're opening a gzip record w/o data to
diff --git a/src/main/java/org/archive/util/DateUtils.java b/src/main/java/org/archive/util/DateUtils.java
index 0be20e63..7d6a7c98 100755
--- a/src/main/java/org/archive/util/DateUtils.java
+++ b/src/main/java/org/archive/util/DateUtils.java
@@ -557,7 +557,7 @@ private static String doubleToString(double val, int maxFractionDigits, int minF
* Takes a byte size and formats it for display with 'friendly' units.
*
* This involves converting it to the largest unit
- * (of B, KiB, MiB, GiB, TiB) for which the amount will be > 1.
+ * (of B, KiB, MiB, GiB, TiB) for which the amount will be > 1.
*
* Additionally, at least 2 significant digits are always displayed.
*
From 0d881e967daf2a023006032dd0d015b714821b11 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 15 Oct 2024 17:42:23 +0900
Subject: [PATCH 023/169] [maven-release-plugin] prepare release
webarchive-commons-1.1.10
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index f0c6ac73..2dd9223b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commons
webarchive-commons
- 1.1.10-SNAPSHOT
+ 1.1.10
jar
webarchive-commons
From 76d95ccd75ddc31c5b8c3e9136f9e422ab528898 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 15 Oct 2024 17:42:28 +0900
Subject: [PATCH 024/169] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 2dd9223b..dc3088f0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commons
webarchive-commons
- 1.1.10
+ 1.1.11-SNAPSHOT
jar
webarchive-commons
From 835f4e115b2cd288bed3f703136a7325c81fa751 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Sat, 9 Nov 2024 20:27:47 +0100
Subject: [PATCH 025/169] Make MetaData multi-valued to preserve values of
repeating WARC and HTTP headers
- code cleanup: fix indentation, remove unneeded return statements
---
src/main/java/org/archive/resource/MetaData.java | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/src/main/java/org/archive/resource/MetaData.java b/src/main/java/org/archive/resource/MetaData.java
index 30ce849b..fb3b24a4 100755
--- a/src/main/java/org/archive/resource/MetaData.java
+++ b/src/main/java/org/archive/resource/MetaData.java
@@ -83,7 +83,6 @@ public int optInt(String key, int defaultValue) {
return super.getInt(key);
} catch(JSONException e) {
LOG.severe(e.getMessage());
- return defaultValue;
}
}
return defaultValue;
@@ -106,7 +105,6 @@ public long optLong(String key, long defaultValue) {
return super.getLong(key);
} catch(JSONException e) {
LOG.severe(e.getMessage());
- return defaultValue;
}
}
return defaultValue;
@@ -167,10 +165,10 @@ public JSONObject put(String key, Object value) {
((JSONArray) super.get(key)).put(value);
return this;
} else {
- JSONArray array = new JSONArray();
- array.put(super.get(key));
- array.put(value);
- super.put(key, array);
+ JSONArray array = new JSONArray();
+ array.put(super.get(key));
+ array.put(value);
+ super.put(key, array);
}
return super.accumulate(key, value);
}
From a4748d9e79abb972a6571f5f4d46951be6049b1a Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 27 Nov 2024 13:24:17 +0100
Subject: [PATCH 026/169] URLParser and WaybackURLKeyMaker fail on URLs with
IPv6 address hostname
---
src/main/java/org/archive/url/URLParser.java | 11 ++++++++++-
.../java/org/archive/url/URLRegexTransformer.java | 4 ++++
src/test/java/org/archive/url/URLParserTest.java | 3 +++
.../java/org/archive/url/WaybackURLKeyMakerTest.java | 3 +++
4 files changed, 20 insertions(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/url/URLParser.java b/src/main/java/org/archive/url/URLParser.java
index a7860b02..bcd0b7fb 100644
--- a/src/main/java/org/archive/url/URLParser.java
+++ b/src/main/java/org/archive/url/URLParser.java
@@ -226,7 +226,16 @@ public static HandyURL parse(String urlString) throws URISyntaxException {
String colonPort = null;
int atIndex = uriAuthority.indexOf(COMMERCIAL_AT);
- int portColonIndex = uriAuthority.indexOf(COLON,(atIndex<0)?0:atIndex);
+ int portColonIndex = -1;
+ int startColonIndex = 0;
+ if (atIndex > -1) {
+ startColonIndex = atIndex;
+ }
+ if (uriAuthority.charAt(startColonIndex) == '[') {
+ // IPv6 address
+ startColonIndex = uriAuthority.indexOf(']', (startColonIndex + 1));
+ }
+ portColonIndex = uriAuthority.indexOf(COLON, startColonIndex);
if(atIndex<0 && portColonIndex<0) {
// most common case: neither userinfo nor port
diff --git a/src/main/java/org/archive/url/URLRegexTransformer.java b/src/main/java/org/archive/url/URLRegexTransformer.java
index 617e0225..5f31c81c 100644
--- a/src/main/java/org/archive/url/URLRegexTransformer.java
+++ b/src/main/java/org/archive/url/URLRegexTransformer.java
@@ -121,6 +121,10 @@ public static String hostToSURT(String host) {
// TODO: ensure we DONT reverse IP addresses!
String parts[] = host.split("\\.",-1);
if(parts.length == 1) {
+ // strip enclosing "[" and "]" from IPv6 hosts
+ if (host.charAt(0) == '[' && host.charAt(host.length() - 1) == ']') {
+ return host.substring(1, host.length() - 1);
+ }
return host;
}
StringBuilder sb = new StringBuilder(host.length());
diff --git a/src/test/java/org/archive/url/URLParserTest.java b/src/test/java/org/archive/url/URLParserTest.java
index b060ffa7..68dfcd23 100644
--- a/src/test/java/org/archive/url/URLParserTest.java
+++ b/src/test/java/org/archive/url/URLParserTest.java
@@ -86,6 +86,9 @@ public void testParse() throws UnsupportedEncodingException, URISyntaxException
checkParse(" \n http://:****@www.archive.org:8080/inde\rx.html?query#foo \r\n \t ",
null, "http", "", "****", "www.archive.org", 8080, "/index.html", "query", "foo",
"http://:****@www.archive.org:8080/index.html?query#foo", "/index.html?query");
+ checkParse("https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/robots.txt", null, "https", null, null,
+ "[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]", -1, "/robots.txt", null, null,
+ "https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/robots.txt", "/robots.txt");
}
private void checkParse(String s, String opaque, String scheme, String authUser,
diff --git a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
index 26161456..1a1403ee 100644
--- a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
+++ b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
@@ -23,6 +23,9 @@ public void testMakeKey() throws URISyntaxException {
assertEquals("org,archive)/goo?a&b", km.makeKey("http://archive.org/goo/?b&a"));
assertEquals("org,archive)/goo?a=1&a=2&b", km.makeKey("http://archive.org/goo/?a=2&b&a=1"));
assertEquals("org,archive)/", km.makeKey("http://archive.org:/"));
+ assertEquals("192,211,203,34)/robots.txt", km.makeKey("https://34.203.211.192/robots.txt"));
+ assertEquals("2600:1f18:200d:fb00:2b74:867c:ab0c:150a)/robots.txt",
+ km.makeKey("https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/robots.txt"));
}
}
From 8e89847d79ea2882bc55e2d00939fd8a2ca21865 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 21:51:58 +0900
Subject: [PATCH 027/169] Update release plugins
---
pom.xml | 110 ++++++++++++++++++++++++++++++++++----------------------
1 file changed, 67 insertions(+), 43 deletions(-)
diff --git a/pom.xml b/pom.xml
index dc3088f0..048787a5 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,12 +1,6 @@
4.0.0
-
- org.sonatype.oss
- oss-parent
- 7
-
-
org.netpreserve.commons
webarchive-commons
1.1.11-SNAPSHOT
@@ -45,19 +39,13 @@
scm:git:git@github.com:iipc/webarchive-commons.git
scm:git:git@github.com:iipc/webarchive-commons.git
- git@github.com:iipc/webarchive-commons.git
+ https://github.com/iipc/webarchive-commons
UTF-8
${maven.build.timestamp}
yyyyMMddhhmmss
-
-
- sonatype-nexus-staging
- https://oss.sonatype.org/service/local/staging/deploy/maven2/
- sonatype-nexus-snapshots
- https://oss.sonatype.org/content/repositories/snapshots/
@@ -201,24 +189,6 @@
8
-
- maven-assembly-plugin
- 2.4
-
-
- jar-with-dependencies
-
- webarchive-commons
-
-
-
- package
-
- single
-
-
-
-
org.apache.maven.plugins
maven-enforcer-plugin
@@ -251,17 +221,71 @@
-
+
+
+ release
+
+
+ ossrh
+ https://oss.sonatype.org/content/repositories/snapshots
+
+
+
+
+
+ org.sonatype.plugins
+ nexus-staging-maven-plugin
+ 1.6.7
+ true
+
+ ossrh
+ https://oss.sonatype.org/
+ true
+
+
+
+ org.apache.maven.plugins
+ maven-source-plugin
+ 2.2.1
+
+
+ attach-sources
+
+ jar-no-fork
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 2.9.1
+
+
+ attach-javadocs
+
+ jar
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-gpg-plugin
+ 1.5
+
+
+ sign-artifacts
+ verify
+
+ sign
+
+
+
+
+
+
+
+
From 829566b1385a8dae6bc9774cd1299469f37e78c3 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 21:53:22 +0900
Subject: [PATCH 028/169] [maven-release-plugin] prepare release
webarchive-commons-1.1.11
---
pom.xml | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 048787a5..28bd9145 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commons
webarchive-commons
- 1.1.11-SNAPSHOT
+ 1.1.11
jar
webarchive-commons
@@ -40,7 +40,8 @@
scm:git:git@github.com:iipc/webarchive-commons.git
scm:git:git@github.com:iipc/webarchive-commons.git
https://github.com/iipc/webarchive-commons
-
+ webarchive-commons-1.1.11
+
UTF-8
From 9b0bbcfdeea7a9c2ac9a28b245bce2f8e9df5dce Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 21:53:27 +0900
Subject: [PATCH 029/169] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 28bd9145..c86add9f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commons
webarchive-commons
- 1.1.11
+ 1.1.12-SNAPSHOT
jar
webarchive-commons
@@ -40,7 +40,7 @@
scm:git:git@github.com:iipc/webarchive-commons.git
scm:git:git@github.com:iipc/webarchive-commons.git
https://github.com/iipc/webarchive-commons
- webarchive-commons-1.1.11
+ HEAD
From 9e4723b313a542320a4f09f4b4e2dbccdc0f58ac Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 21:58:55 +0900
Subject: [PATCH 030/169] Update CHANGES.md
---
CHANGES.md | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/CHANGES.md b/CHANGES.md
index 6fe7c4bd..579b659f 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,7 +1,14 @@
+1.1.11
+------
+
+#### Bug fixes
+
+* Fixed URLParser and WaybackURLKeyMaker failing on URLs with IPv6 address hostnames [#100](https://github.com/iipc/webarchive-commons/pull/100)
+
1.1.10
------
-#### Fixes
+#### Bug fixes
* [WAT extractor: do not fail on missing WARC-Filename in warcinfo record](https://github.com/iipc/webarchive-commons/pull/89)
* [ExtractingParseObserver: extract rel, hreflang and type attributes](https://github.com/iipc/webarchive-commons/pull/86)
From cd2da63f1f56d41705e014e2c3290635fcc99099 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 22:00:18 +0900
Subject: [PATCH 031/169] Add description to pom.xml (now mandatory for
central)
---
pom.xml | 1 +
1 file changed, 1 insertion(+)
diff --git a/pom.xml b/pom.xml
index c86add9f..18aca329 100644
--- a/pom.xml
+++ b/pom.xml
@@ -8,6 +8,7 @@
webarchive-commons
https://github.com/iipc/webarchive-commons
+ Common web archive utility code
The International Internet Preservation Consortium
From 7b6df0c619899ae70e350fb0d955c00b59ba68e5 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 22:02:31 +0900
Subject: [PATCH 032/169] [maven-release-plugin] prepare release
webarchive-commons-1.1.11
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 18aca329..a57230d9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commons
webarchive-commons
- 1.1.12-SNAPSHOT
+ 1.1.11
jar
webarchive-commons
@@ -41,7 +41,7 @@
scm:git:git@github.com:iipc/webarchive-commons.git
scm:git:git@github.com:iipc/webarchive-commons.git
https://github.com/iipc/webarchive-commons
- HEAD
+ webarchive-commons-1.1.11
From a70f23e8b654d3a661877641f2fa7e51d696ceeb Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 22:02:36 +0900
Subject: [PATCH 033/169] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index a57230d9..18aca329 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commons
webarchive-commons
- 1.1.11
+ 1.1.12-SNAPSHOT
jar
webarchive-commons
@@ -41,7 +41,7 @@
scm:git:git@github.com:iipc/webarchive-commons.git
scm:git:git@github.com:iipc/webarchive-commons.git
https://github.com/iipc/webarchive-commons
- webarchive-commons-1.1.11
+ HEAD
From 0514b2387decaf5e40e24bcda0f7c70b438d0997 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 27 Nov 2024 22:08:04 +0900
Subject: [PATCH 034/169] Add Maven Central and Javadoc shields to README
---
README.md | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 72858a52..55be6e68 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,6 @@
IIPC Web Archive Commons
========================
-
-[](https://travis-ci.org/iipc/webarchive-commons/)
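+[![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.netpreserve.commons/webarchive-commons/badge.svg)](https://maven-badges.herokuapp.com/maven-central/org.netpreserve.commons/webarchive-commons) [![Javadoc](https://javadoc.io/badge2/org.netpreserve.commons/webarchive-commons/javadoc.svg)](https://www.javadoc.io/doc/org.netpreserve.commons/webarchive-commons)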
This repository contains common utility code for [OpenWayback][1] and other projects.
From c6095082fdecadd6882456a51c5f91b8a3d4faa5 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 15:42:05 +0900
Subject: [PATCH 035/169] Bump guava from 33.3.0-jre to 33.3.1-jre
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 18aca329..0ac11df9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -60,7 +60,7 @@
com.google.guava
guava
- 33.3.0-jre
+ 33.3.1-jre
From 23c8887c2a3eb4d4d5b0bac0cf805c71fcaeabaf Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 15:42:41 +0900
Subject: [PATCH 036/169] Bump commons-io from 2.14.0 to 2.18.0
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 0ac11df9..84822a4f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -140,7 +140,7 @@
commons-io
commons-io
- 2.14.0
+ 2.18.0
From f13c7b2a3b254a83827ad5a1c27131c6980c79eb Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 15:47:28 +0900
Subject: [PATCH 037/169] Bump commons-lang from 2.5 to 2.6
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 84822a4f..3d5f995f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -134,7 +134,7 @@
commons-lang
commons-lang
- 2.5
+ 2.6
From 5528afc05f77189b7ef59dbb9cdcce2bd35656e7 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:24:04 +0900
Subject: [PATCH 038/169] Bump junit from 4.13.1 to 4.13.2
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 3d5f995f..46f26766 100644
--- a/pom.xml
+++ b/pom.xml
@@ -54,7 +54,7 @@
junit
junit
- 4.13.1
+ 4.13.2
From 7426c563310f73a0820a9af729b5f3621cea57f4 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:24:52 +0900
Subject: [PATCH 039/169] Bump hadoop from 3.4.0 to 3.4.1
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 46f26766..c1bc7798 100644
--- a/pom.xml
+++ b/pom.xml
@@ -95,7 +95,7 @@
org.apache.hadoop
hadoop-common
- 3.4.0
+ 3.4.1
true
@@ -108,7 +108,7 @@
org.apache.hadoop
hadoop-mapreduce-client-core
- 3.4.0
+ 3.4.1
true
From 88607b2ed67c8c73e8b199adf85ac1ddf2fcdddb Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:26:16 +0900
Subject: [PATCH 040/169] Bump httpcore from 4.3 to 4.4.16
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index c1bc7798..a993945e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -176,7 +176,7 @@
org.apache.httpcomponents
httpcore
- 4.3
+ 4.4.16
From 0256ae6131e80c49e1ed4a16e5631ccff0d74e36 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:27:04 +0900
Subject: [PATCH 041/169] Bump htmlparser from 1.6 to 2.1
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index a993945e..ce0a2aec 100644
--- a/pom.xml
+++ b/pom.xml
@@ -71,7 +71,7 @@
org.htmlparser
htmlparser
- 1.6
+ 2.1
From e1d458a86a2203ca1cd5cab967fb17f268994082 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:27:31 +0900
Subject: [PATCH 042/169] Bump json from 20231013 to 20240303
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index ce0a2aec..1023560c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -66,7 +66,7 @@
org.json
json
- 20231013
+ 20240303
org.htmlparser
From c839700d472bac5b4625ea4fe10ef47ee02a5a31 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:35:12 +0900
Subject: [PATCH 043/169] Update CHANGES.md
---
CHANGES.md | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/CHANGES.md b/CHANGES.md
index 579b659f..e3afd137 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,21 @@
+1.2.0
+-----
+
+#### New features
+
+* MetaData is now multivalued to support repeated WARC and HTTP headers. [#98](https://github.com/iipc/webarchive-commons/pull/98/files)
+
+#### Dependency upgrades
+
+* commons-io 2.18.0
+* commons-lang 2.6
+* guava 33.3.1-jre
+* hadoop 3.4.1
+* htmlparser 2.1
+* httpcore 4.4.16
+* json 20240303
+* junit 4.13.2
+
1.1.11
------
From 91c01ddb0561798d204c957fefafa782c0b53921 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:37:15 +0900
Subject: [PATCH 044/169] [maven-release-plugin] prepare release
webarchive-commons-1.2.0
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 1023560c..12dfae9f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commons
webarchive-commons
- 1.1.12-SNAPSHOT
+ 1.2.0
jar
webarchive-commons
@@ -41,7 +41,7 @@
scm:git:git@github.com:iipc/webarchive-commons.git
scm:git:git@github.com:iipc/webarchive-commons.git
https://github.com/iipc/webarchive-commons
- HEAD
+ webarchive-commons-1.2.0
From f37418d08d8fa7fd4ccad4fbb919cc0fc371f2f2 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 29 Nov 2024 16:37:20 +0900
Subject: [PATCH 045/169] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 12dfae9f..0d84b0d2 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commons
webarchive-commons
- 1.2.0
+ 1.2.1-SNAPSHOT
jar
webarchive-commons
@@ -41,7 +41,7 @@
scm:git:git@github.com:iipc/webarchive-commons.git
scm:git:git@github.com:iipc/webarchive-commons.git
https://github.com/iipc/webarchive-commons
- webarchive-commons-1.2.0
+ HEAD
From 3ae5720ad43e2e80b5ab853078e891ee53641a3c Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 3 Dec 2024 20:22:10 +0900
Subject: [PATCH 046/169] Remove dependency on dsiutils
---
pom.xml | 16 ++--------------
.../java/org/archive/url/UsableURIFactory.java | 5 ++---
2 files changed, 4 insertions(+), 17 deletions(-)
diff --git a/pom.xml b/pom.xml
index 0d84b0d2..da2e14da 100644
--- a/pom.xml
+++ b/pom.xml
@@ -150,20 +150,8 @@
it.unimi.dsi
- dsiutils
- 2.2.8
- compile
-
-
- ch.qos.logback
- logback-classic
-
-
-
- commons-collections
- commons-collections
-
-
+ fastutil
+ 7.0.10
diff --git a/src/main/java/org/archive/url/UsableURIFactory.java b/src/main/java/org/archive/url/UsableURIFactory.java
index d44b5c84..3dfc33a7 100644
--- a/src/main/java/org/archive/url/UsableURIFactory.java
+++ b/src/main/java/org/archive/url/UsableURIFactory.java
@@ -20,7 +20,6 @@
import gnu.inet.encoding.IDNA;
import gnu.inet.encoding.IDNAException;
-import it.unimi.dsi.lang.MutableString;
import java.io.UnsupportedEncodingException;
import java.util.BitSet;
@@ -485,7 +484,7 @@ private String fixup(String uri, final URI base, final String charset)
// Preallocate. The '1's and '2's in below are space for ':',
// '//', etc. URI characters.
- MutableString s = new MutableString(
+ StringBuilder s = new StringBuilder(
((uriScheme != null)? uriScheme.length(): 0)
+ 1 // ';'
+ ((uriAuthority != null)? uriAuthority.length(): 0)
@@ -707,7 +706,7 @@ private String checkPort(String uriAuthority)
* @param substr Suffix or prefix to use if str is not null.
* @param suffix True if substr is a suffix.
*/
- private void appendNonNull(MutableString b, String str, String substr,
+ private void appendNonNull(StringBuilder b, String str, String substr,
boolean suffix) {
if (str != null && str.length() > 0) {
if (!suffix) {
From 33556bf741eaa10421b9214bbbd69f40618d27d1 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 3 Dec 2024 20:38:46 +0900
Subject: [PATCH 047/169] Remove pom-cdh4.xml
---
pom-cdh4.xml | 229 ---------------------------------------------------
1 file changed, 229 deletions(-)
delete mode 100644 pom-cdh4.xml
diff --git a/pom-cdh4.xml b/pom-cdh4.xml
deleted file mode 100644
index de19d8d0..00000000
--- a/pom-cdh4.xml
+++ /dev/null
@@ -1,229 +0,0 @@
-
- 4.0.0
-
- org.archive
- ia-web-commons
- 1.0-SNAPSHOT
- jar
-
- ia-web-commons
- http://maven.apache.org
-
-
- UTF-8
- ${maven.build.timestamp}
- yyyyMMddhhmmss
-
-
-
-
- junit
- junit
- 3.8.1
- test
-
-
-
- com.google.guava
- guava
- 14.0.1
-
-
-
- org.json
- json
- 20090211
-
-
- org.htmlparser
- htmlparser
- 1.6
-
-
-
- org.mozilla
- juniversalchardet
- 1.0.3
-
-
-
- commons-httpclient
- commons-httpclient
- 3.1
-
-
-
- org.apache.hadoop
- hadoop-core
- 2.0.0-mr1-cdh4.2.0
-
-
- commons-httpclient
- commons-httpclient
-
-
- javax.servlet
- servlet-api
-
-
- javax.servlet.jsp
- jsp-api
-
-
- org.mortbay.jetty
- jetty
-
-
- org.mortbay.jetty
- jetty-util
-
-
- tomcat
- jasper-runtime
-
-
- tomcat
- jasper-compiler
-
-
-
-
- org.apache.hadoop
- hadoop-common
- 2.0.0-cdh4.2.0
-
-
- org.apache.hadoop
- hadoop-mapreduce-client-common
- 2.0.0-cdh4.2.0
-
-
- org.apache.hadoop
- hadoop-mapreduce-client-core
- 2.0.0-cdh4.2.0
-
-
-
- org.apache.pig
- pig
- 0.11.1
- provided
-
-
-
- commons-lang
- commons-lang
- 2.5
-
-
-
- commons-io
- commons-io
- 2.4
-
-
-
- org.gnu.inet
- libidn
- 1.15
-
-
- it.unimi.dsi
- mg4j
- 1.0.1
- compile
-
-
- org.apache.httpcomponents
- httpcore
- 4.3
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-compiler-plugin
- 2.3.2
-
- 1.6
- 1.6
-
-
-
- maven-assembly-plugin
- 2.4
-
-
- jar-with-dependencies
-
- ia-web-commons
-
-
-
- package
-
- single
-
-
-
-
-
-
-
- src/main/resources
- true
-
-
-
-
-
-
- internetarchive
- Internet Archive Maven Repository
- http://builds.archive.org:8080/maven2
- default
-
-
- true
- daily
- warn
-
-
- true
- daily
- warn
-
-
-
-
- cloudera
- Cloudera Hadoop
- https://repository.cloudera.com/artifactory/cloudera-repos/
- default
-
-
- true
- daily
- warn
-
-
- true
- daily
- warn
-
-
-
-
-
-
-
- repository
-
- ${repository.url}
-
-
-
-
From 4bb03baec41d90795e312e4a2865abb0395670f3 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 3 Dec 2024 20:42:29 +0900
Subject: [PATCH 048/169] Use Files.createLink instead of shelling out to ln
---
.../io/ObjectPlusFilesOutputStream.java | 19 ++++---------------
1 file changed, 4 insertions(+), 15 deletions(-)
diff --git a/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java b/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java
index 224f24e7..bd5c1eea 100644
--- a/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java
+++ b/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java
@@ -18,10 +18,8 @@
*/
package org.archive.io;
-import java.io.File;
-import java.io.IOException;
-import java.io.ObjectOutputStream;
-import java.io.OutputStream;
+import java.io.*;
+import java.nio.file.Files;
import java.util.LinkedList;
import org.archive.util.FileUtils;
@@ -116,19 +114,10 @@ public void snapshotAppendOnlyFile(File file) throws IOException {
* @throws IOException
*/
private void hardlinkOrCopy(File file, File destination) throws IOException {
- // For Linux/UNIX, try a hard link first.
- Process link = Runtime.getRuntime().exec("ln "+file.getAbsolutePath()+" "+destination.getAbsolutePath());
- // TODO NTFS also supports hard links; add appropriate try
try {
- link.waitFor();
- } catch (InterruptedException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- if(link.exitValue()!=0) {
- // hard link failed
+ Files.createLink(destination.toPath(), file.toPath());
+ } catch (UnsupportedOperationException e) {
FileUtils.copyFile(file,destination);
}
}
-
}
From 328aef2788313a2abc6123c385f9c31b863d6f1b Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 4 Dec 2024 15:07:23 +0900
Subject: [PATCH 049/169] Remove dependency on fastutil
Fastutil is our largest dependency and consumes a third of the overall Heritrix distribution size. If we update to the latest version it will be even larger. But we're only using two tiny classes from it: the trivial RepositionableStream interface and the unsynchronized FastBufferedOutputStream.
Some downstream users (e.g. lockss-core) actually implement RepositionableStream, so to preserve API compatibility this change includes a copy of just that interface while keeping the same package name.
Regarding FastBufferedOutputStream, for WARC writing the outer GZIPOutputStream is synchronized anyway. And RecordingOutputStream will typically be doing moderately large writes copying from the network. So in both usages it seems unlikely that there's much practical benefit in using it here over the standard BufferedOutputStream. The JVM JIT has a lot of optimizations for synchronized code these days too.
---
pom.xml | 5 ---
.../dsi/fastutil/io/RepositionableStream.java | 42 +++++++++++++++++++
.../org/archive/io/RecordingOutputStream.java | 5 +--
.../java/org/archive/io/WriterPoolMember.java | 5 +--
4 files changed, 46 insertions(+), 11 deletions(-)
create mode 100644 src/main/java/it/unimi/dsi/fastutil/io/RepositionableStream.java
diff --git a/pom.xml b/pom.xml
index da2e14da..5e5fa419 100644
--- a/pom.xml
+++ b/pom.xml
@@ -148,11 +148,6 @@
libidn
1.15
-
- it.unimi.dsi
- fastutil
- 7.0.10
-
diff --git a/src/main/java/it/unimi/dsi/fastutil/io/RepositionableStream.java b/src/main/java/it/unimi/dsi/fastutil/io/RepositionableStream.java
new file mode 100644
index 00000000..a81645f0
--- /dev/null
+++ b/src/main/java/it/unimi/dsi/fastutil/io/RepositionableStream.java
@@ -0,0 +1,42 @@
+// copied from fastutil, keeping the original package name to avoid breaking
+// compatibility with existing user code that implements this interface
+package it.unimi.dsi.fastutil.io;
+
+/*
+ * Copyright (C) 2005-2015 Sebastiano Vigna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/** A basic interface specifying positioning methods for a byte stream.
+ *
+ * @author Sebastiano Vigna
+ * @since 4.4
+ */
+
+public interface RepositionableStream {
+
+ /** Sets the current stream position.
+ *
+ * @param newPosition the new stream position.
+ */
+ void position( long newPosition ) throws java.io.IOException;
+
+ /** Returns the current stream position.
+ *
+ * @return the current stream position.
+ */
+ long position() throws java.io.IOException;
+
+}
diff --git a/src/main/java/org/archive/io/RecordingOutputStream.java b/src/main/java/org/archive/io/RecordingOutputStream.java
index 7d2ff212..6c77997b 100644
--- a/src/main/java/org/archive/io/RecordingOutputStream.java
+++ b/src/main/java/org/archive/io/RecordingOutputStream.java
@@ -19,8 +19,7 @@
package org.archive.io;
-import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
-
+import java.io.BufferedOutputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
@@ -207,7 +206,7 @@ public void open(OutputStream wrappedStream) throws IOException {
protected OutputStream ensureDiskStream() throws FileNotFoundException {
if (this.diskStream == null) {
FileOutputStream fis = new FileOutputStream(this.backingFilename);
- this.diskStream = new FastBufferedOutputStream(fis);
+ this.diskStream = new BufferedOutputStream(fis);
}
return this.diskStream;
}
diff --git a/src/main/java/org/archive/io/WriterPoolMember.java b/src/main/java/org/archive/io/WriterPoolMember.java
index 893007ec..e10d443b 100644
--- a/src/main/java/org/archive/io/WriterPoolMember.java
+++ b/src/main/java/org/archive/io/WriterPoolMember.java
@@ -19,8 +19,7 @@
package org.archive.io;
-import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
-
+import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
@@ -200,7 +199,7 @@ protected String createFile(final File file) throws IOException {
close();
this.f = file;
FileOutputStream fos = new FileOutputStream(this.f);
- this.countOut = new MiserOutputStream(new FastBufferedOutputStream(fos),settings.getFrequentFlushes());
+ this.countOut = new MiserOutputStream(new BufferedOutputStream(fos),settings.getFrequentFlushes());
this.out = this.countOut;
logger.fine("Opened " + this.f.getAbsolutePath());
return this.f.getName();
From 8988fbbc3528afcc7f792bcc967189311e8a1286 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 4 Dec 2024 16:54:03 +0900
Subject: [PATCH 050/169] Deprecate some classes specific to HttpClient 3
These are intended to be removed in webarchive-commons 2. #78
---
.../java/org/archive/httpclient/HttpRecorderGetMethod.java | 2 ++
src/main/java/org/archive/httpclient/HttpRecorderMethod.java | 2 ++
.../java/org/archive/httpclient/HttpRecorderPostMethod.java | 2 ++
.../org/archive/httpclient/SingleHttpConnectionManager.java | 2 ++
.../archive/httpclient/ThreadLocalHttpConnectionManager.java | 4 +++-
.../util/binsearch/impl/HTTPSeekableLineReaderFactory.java | 1 +
.../archive/util/binsearch/impl/http/ApacheHttp31SLR.java | 4 ++++
.../util/binsearch/impl/http/ApacheHttp31SLRFactory.java | 5 +++++
8 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java
index ef241b48..1a94af1f 100644
--- a/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java
+++ b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java
@@ -70,7 +70,9 @@
*
* @author stack
* @version $Revision$, $Date$
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
*/
+@Deprecated
public class HttpRecorderGetMethod extends GetMethod {
protected static Logger logger =
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java
index 932e7e98..b08bc0bd 100644
--- a/src/main/java/org/archive/httpclient/HttpRecorderMethod.java
+++ b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java
@@ -34,7 +34,9 @@
*
* @author stack
* @version $Revision$, $Date$
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
*/
+@Deprecated
public class HttpRecorderMethod {
protected static Logger logger =
Logger.getLogger(HttpRecorderMethod.class.getName());
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java
index 20f1bfd1..d55d816a 100644
--- a/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java
+++ b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java
@@ -36,7 +36,9 @@
*
* @author stack
* @version $Date$ $Revision$
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
*/
+@Deprecated
public class HttpRecorderPostMethod extends PostMethod {
/**
* Instance of http recorder method.
diff --git a/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java
index 4ba6a837..d6cf27ab 100644
--- a/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java
+++ b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java
@@ -32,7 +32,9 @@
* with external mechanisms.
*
* @author gojomo
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
*/
+@Deprecated
public class SingleHttpConnectionManager extends SimpleHttpConnectionManager {
public SingleHttpConnectionManager() {
diff --git a/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java
index 91e850ea..16821b36 100644
--- a/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java
+++ b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java
@@ -36,8 +36,10 @@
*
* Java >= 1.4 is recommended.
*
- * @author Christian Kohlschuetter
+ * @author Christian Kohlschuetter
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
*/
+@Deprecated
public final class ThreadLocalHttpConnectionManager implements
HttpConnectionManager {
diff --git a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java
index b4a23db0..68ee6551 100644
--- a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java
+++ b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java
@@ -20,6 +20,7 @@ protected HTTPSeekableLineReaderFactory()
public enum HttpLibs
{
+ @Deprecated
APACHE_31,
APACHE_43,
URLCONN,
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
index c4fdbba8..124d3d03 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java
@@ -14,6 +14,10 @@
import org.apache.commons.io.input.CountingInputStream;
import org.archive.util.binsearch.impl.HTTPSeekableLineReader;
+/**
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
+ */
+@Deprecated
public class ApacheHttp31SLR extends HTTPSeekableLineReader {
private HttpClient http;
diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
index bc5b83f4..2af03dab 100644
--- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
+++ b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java
@@ -15,6 +15,11 @@
import org.archive.util.binsearch.impl.HTTPSeekableLineReader;
import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory;
+/**
+ *
+ * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
+ */
+@Deprecated
public class ApacheHttp31SLRFactory extends HTTPSeekableLineReaderFactory {
private final static Logger LOGGER = Logger.getLogger(ApacheHttp31SLRFactory.class.getName());
From b8a91bb3b7e8a36b2162251314ff52b42a379221 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Thu, 5 Dec 2024 07:49:10 +0900
Subject: [PATCH 051/169] Remove unused dependency on commons-collections
---
pom.xml | 7 -------
1 file changed, 7 deletions(-)
diff --git a/pom.xml b/pom.xml
index 5e5fa419..6dec154c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -149,13 +149,6 @@
1.15
-
-
- commons-collections
- commons-collections
- 3.2.2
-
-
org.apache.httpcomponents
httpcore
From a80b98dfe4b1c2a7556e7df2574c16426849f6d9 Mon Sep 17 00:00:00 2001
From: Tom Morris
Date: Sat, 26 Aug 2023 20:05:34 -0400
Subject: [PATCH 052/169] Add failing test from Sebastian's issue
---
src/test/java/org/archive/url/BasicURLCanonicalizerTest.java | 3 +++
src/test/java/org/archive/url/WaybackURLKeyMakerTest.java | 4 ++++
2 files changed, 7 insertions(+)
diff --git a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java
index c21bcbe8..cc100e4c 100644
--- a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java
+++ b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java
@@ -143,6 +143,9 @@ public void testUnescapeRepeatedly() {
assertEquals("%",guc.unescapeRepeatedly("%25%32%35"));
assertEquals("168.188.99.26",guc.unescapeRepeatedly("%31%36%38%2e%31%38%38%2e%39%39%2e%32%36"));
+
+ assertEquals("tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5",
+ guc.unescapeRepeatedly("tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5"));
}
public void testAttemptIPFormats() throws URIException {
diff --git a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
index 1a1403ee..86250972 100644
--- a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
+++ b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
@@ -26,6 +26,10 @@ public void testMakeKey() throws URISyntaxException {
assertEquals("192,211,203,34)/robots.txt", km.makeKey("https://34.203.211.192/robots.txt"));
assertEquals("2600:1f18:200d:fb00:2b74:867c:ab0c:150a)/robots.txt",
km.makeKey("https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/robots.txt"));
+ assertEquals("ua,1kr)/newslist.html?tag=%e4%ee%f8%ea%ee%eb%fc%ed%ee%e5",
+ km.makeKey("http://1kr.ua/newslist.html?tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5"));
+ assertEquals("com,aluroba)/tags/%c3%ce%ca%c7%d1%e5%c7.htm",
+ km.makeKey("http://www.aluroba.com/tags/%C3%CE%CA%C7%D1%E5%C7.htm"));
}
}
From 5161306d9ec993d1986f0d092c056f33ba3abdfe Mon Sep 17 00:00:00 2001
From: Tom Morris
Date: Sun, 27 Aug 2023 13:01:19 -0400
Subject: [PATCH 053/169] Add non-UTF-8 encoded test from mailing list
---
src/test/java/org/archive/url/WaybackURLKeyMakerTest.java | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
index 86250972..26371ba8 100644
--- a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
+++ b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
@@ -30,6 +30,8 @@ public void testMakeKey() throws URISyntaxException {
km.makeKey("http://1kr.ua/newslist.html?tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5"));
assertEquals("com,aluroba)/tags/%c3%ce%ca%c7%d1%e5%c7.htm",
km.makeKey("http://www.aluroba.com/tags/%C3%CE%CA%C7%D1%E5%C7.htm"));
+ assertEquals("ac,insbase)/xoops2/modules/xpwiki?%a4%d5%a4%af%a4%aa%a4%ab%b8%a9%a4%aa%a4%aa%a4%ce%a4%b8%a4%e7%a4%a6%bb%d4",
+ km.makeKey("https://www.insbase.ac/xoops2/modules/xpwiki/?%A4%D5%A4%AF%A4%AA%A4%AB%B8%A9%A4%AA%A4%AA%A4%CE%A4%B8%A4%E7%A4%A6%BB%D4"));
}
}
From f7be47bc523c4d06cc7960dc2d3b1b58f9580906 Mon Sep 17 00:00:00 2001
From: Tom Morris
Date: Sun, 27 Aug 2023 13:11:30 -0400
Subject: [PATCH 054/169] Handle non-UTF-8 encoded characters. Fixes #6
---
.../archive/url/BasicURLCanonicalizer.java | 27 +++++++++++++++----
1 file changed, 22 insertions(+), 5 deletions(-)
diff --git a/src/main/java/org/archive/url/BasicURLCanonicalizer.java b/src/main/java/org/archive/url/BasicURLCanonicalizer.java
index c09ad6e6..37b448c1 100644
--- a/src/main/java/org/archive/url/BasicURLCanonicalizer.java
+++ b/src/main/java/org/archive/url/BasicURLCanonicalizer.java
@@ -15,18 +15,18 @@
/**
* Canonicalizer that does more or less basic fixup. Based initially on rules
* specified at https://developers.google.com/safe-browsing/developers_guide_v2#
- * Canonicalization . These rules are designed for clients of google's
+ * Canonicalization. These rules are designed for clients of Google's
* "experimental" Safe Browsing API to "check URLs against Google's
* constantly-updated blacklists of suspected phishing and malware pages".
*
*
- * This class differs from google in treatment of non-ascii input. Google's
+ * This class differs from Google in treatment of non-ascii input. Google's
* rules don't really address this except with one example test case, which
* seems to suggest taking raw input bytes and pct-encoding them byte for byte.
* Since the input to this class consists of java strings, not raw bytes, that
- * wouldn't be possible, even if deemed preferable. Instead
+ * wouldn't be possible, even if deemed preferable. Instead,
* BasicURLCanonicalizer expresses non-ascii characters pct-encoded UTF-8.
*/
public class BasicURLCanonicalizer implements URLCanonicalizer {
@@ -212,6 +212,10 @@ protected static Charset UTF8() {
return _UTF8;
}
+ /**
+ * @param input String to be percent-encoded. Assumed to be fully unescaped.
+ * @return percent-encoded string
+ */
public String escapeOnce(String input) {
if (input == null) {
return null;
@@ -243,6 +247,19 @@ public String escapeOnce(String input) {
*/
sb = new StringBuilder(input.substring(0, i));
}
+ if (b == '%' && i < utf8bytes.length - 2) {
+ // Any hex escapes left at this point represent non-UTF-8 encoded characters
+ // Unescape them, so they don't get double escaped
+ int hex1 = getHex(utf8bytes[i + 1]);
+ if (hex1 >= 0) {
+ int hex2 = getHex(utf8bytes[i + 2]);
+ if (hex2 >= 0) {
+ i = i+2;
+ b = hex1 * 16 + hex2;
+ }
+ }
+
+ }
sb.append("%");
String hex = Integer.toHexString(b).toUpperCase();
if (hex.length() == 1) {
@@ -337,7 +354,7 @@ public String decode(String input) {
* Decodes bytes in bbuf as utf-8 and appends decoded characters to sb. If
* decoding of any portion fails, appends the un-decodable %xx%xx sequence
* extracted from inputStr instead of decoded characters. See "bad unicode"
- * tests in GoogleCanonicalizerTest#testDecode(). Variables only make sense
+ * tests in BasicURLCanonicalizerTest#testDecode(). Variables only make sense
* within context of {@link #decode(String)}.
*
* @param sb
From 6a3cf1b317c87305d05faee73d2c3ee3f5ec08b0 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 11 Dec 2024 21:14:06 +0100
Subject: [PATCH 055/169] WAT: Duplicated payload metadata values for
"Actual-Content-Length" and "Trailing-Slop-Length"
---
.../org/archive/resource/arc/ARCResource.java | 2 +
.../http/HTTPHeadersResourceFactory.java | 11 +++--
.../archive/resource/warc/WARCResource.java | 14 ++++--
.../record/WARCMetaDataResourceFactory.java | 10 +++-
.../archive/resource/arc/ARCResourceTest.java | 48 +++++++++++++++++++
.../resource/warc/WARCResourceTest.java | 46 ++++++++++++++++++
6 files changed, 123 insertions(+), 8 deletions(-)
create mode 100644 src/test/java/org/archive/resource/arc/ARCResourceTest.java
create mode 100644 src/test/java/org/archive/resource/warc/WARCResourceTest.java
diff --git a/src/main/java/org/archive/resource/arc/ARCResource.java b/src/main/java/org/archive/resource/arc/ARCResource.java
index b6e0a1c1..b0195f08 100644
--- a/src/main/java/org/archive/resource/arc/ARCResource.java
+++ b/src/main/java/org/archive/resource/arc/ARCResource.java
@@ -64,10 +64,12 @@ public ARCResource(MetaData metaData, ResourceContainer container,
}
}
+ @Override
public InputStream getInputStream() {
return new EOFNotifyingInputStream(digIS, this);
}
+ @Override
public void notifyEOF() throws IOException {
metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
String digString = Base32.encode(digIS.getMessageDigest().digest());
diff --git a/src/main/java/org/archive/resource/http/HTTPHeadersResourceFactory.java b/src/main/java/org/archive/resource/http/HTTPHeadersResourceFactory.java
index 79805090..eb25d821 100644
--- a/src/main/java/org/archive/resource/http/HTTPHeadersResourceFactory.java
+++ b/src/main/java/org/archive/resource/http/HTTPHeadersResourceFactory.java
@@ -31,6 +31,7 @@ public HTTPHeadersResourceFactory(String name, String type) {
parser = new HttpHeaderParser();
}
+ @Override
public Resource getResource(InputStream is, MetaData parentMetaData,
ResourceContainer container) throws ResourceParseException,
IOException {
@@ -40,9 +41,13 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
if(headers.isCorrupt()) {
parentMetaData.putBoolean(HTTP_HEADERS_CORRUPT, true);
}
- parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
-
- parentMetaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is));
+ if (!parentMetaData.has(PAYLOAD_LENGTH) || bytes != parentMetaData.getLong(PAYLOAD_LENGTH)) {
+ parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
+ }
+ long trailingSlopBytes = StreamCopy.readToEOF(is);
+ if (!parentMetaData.has(PAYLOAD_SLOP_BYTES) || trailingSlopBytes > 0) {
+ parentMetaData.putLong(PAYLOAD_SLOP_BYTES, trailingSlopBytes);
+ }
if(type != null) {
parentMetaData.putString(PAYLOAD_CONTENT_TYPE, type);
}
diff --git a/src/main/java/org/archive/resource/warc/WARCResource.java b/src/main/java/org/archive/resource/warc/WARCResource.java
index d538a25d..a9c3fcc3 100644
--- a/src/main/java/org/archive/resource/warc/WARCResource.java
+++ b/src/main/java/org/archive/resource/warc/WARCResource.java
@@ -53,7 +53,7 @@ public WARCResource(MetaData metaData, ResourceContainer container,
countingIS = new CountingInputStream(
ByteStreams.limit(response, length));
} else {
- throw new ResourceParseException(null);
+ throw new ResourceParseException(new Exception("Zero or negative length: " + length));
}
try {
digIS = new DigestInputStream(countingIS,
@@ -63,14 +63,18 @@ public WARCResource(MetaData metaData, ResourceContainer container,
}
}
+ @Override
public InputStream getInputStream() {
return new EOFNotifyingInputStream(digIS, this);
}
+ @Override
public void notifyEOF() throws IOException {
String digString = Base32.encode(digIS.getMessageDigest().digest());
if(container.isCompressed()) {
- metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
+ if (!metaData.has(PAYLOAD_LENGTH) || countingIS.getCount() != metaData.getLong(PAYLOAD_LENGTH)) {
+ metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
+ }
metaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(response));
metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString);
} else {
@@ -81,13 +85,17 @@ public void notifyEOF() throws IOException {
(PushBackOneByteInputStream) raw;
long numNewlines = StreamCopy.skipChars(pb1bis, CR_NL_CHARS);
if(numNewlines > 0) {
- metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
+ long payloadLength = countingIS.getCount();
+ if (!metaData.has(PAYLOAD_LENGTH) || payloadLength != metaData.getLong(PAYLOAD_LENGTH)) {
+ metaData.putLong(PAYLOAD_LENGTH, payloadLength);
+ }
metaData.putLong(PAYLOAD_SLOP_BYTES, numNewlines);
metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString);
}
}
}
}
+
public MetaData getEnvelopeMetaData() {
return envelope;
}
diff --git a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java
index 0dfb2834..ba8a35da 100644
--- a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java
+++ b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java
@@ -21,6 +21,7 @@ public WARCMetaDataResourceFactory() {
parser = new HttpHeaderParser();
}
+ @Override
public Resource getResource(InputStream is, MetaData parentMetaData,
ResourceContainer container) throws ResourceParseException,
IOException {
@@ -33,8 +34,13 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
if(headers.isCorrupt()) {
md.putBoolean(WARC_META_FIELDS_CORRUPT, true);
}
- parentMetaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is));
- parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
+ long trailingSlopBytes = StreamCopy.readToEOF(is);
+ if (!parentMetaData.has(PAYLOAD_SLOP_BYTES) || trailingSlopBytes > 0) {
+ parentMetaData.putLong(PAYLOAD_SLOP_BYTES, trailingSlopBytes);
+ }
+ if (!parentMetaData.has(PAYLOAD_LENGTH) || bytes != parentMetaData.getLong(PAYLOAD_LENGTH)) {
+ parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
+ }
return new WARCMetaDataResource(md,container, headers);
} catch (HttpParseException e) {
diff --git a/src/test/java/org/archive/resource/arc/ARCResourceTest.java b/src/test/java/org/archive/resource/arc/ARCResourceTest.java
new file mode 100644
index 00000000..43116af7
--- /dev/null
+++ b/src/test/java/org/archive/resource/arc/ARCResourceTest.java
@@ -0,0 +1,48 @@
+package org.archive.resource.arc;
+
+
+import static org.archive.resource.ResourceConstants.PAYLOAD_LENGTH;
+import static org.archive.resource.ResourceConstants.PAYLOAD_SLOP_BYTES;
+
+import java.io.IOException;
+
+import org.archive.extract.ExtractingResourceFactoryMapper;
+import org.archive.extract.ExtractingResourceProducer;
+import org.archive.extract.ProducerUtils;
+import org.archive.extract.ResourceFactoryMapper;
+import org.archive.resource.Resource;
+import org.archive.resource.ResourceParseException;
+import org.archive.resource.ResourceProducer;
+import org.archive.util.StreamCopy;
+
+import org.json.JSONObject;
+
+import junit.framework.TestCase;
+
+public class ARCResourceTest extends TestCase {
+
+ public void testARCResource() throws ResourceParseException, IOException {
+ String testFileName = "../../format/arc/IAH-20080430204825-00000-blackbook-truncated.arc";
+ ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
+ ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
+ ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
+
+ Resource resource = extractor.getNext();
+
+ while (resource != null) {
+ JSONObject payloadMD = resource.getMetaData().getTopMetaData().getJSONObject("Envelope")
+ .getJSONObject("Payload-Metadata");
+ System.err.println(payloadMD);
+
+ if (payloadMD.has(PAYLOAD_LENGTH)) {
+ assertTrue(payloadMD.getLong(PAYLOAD_LENGTH) != -1);
+ }
+ if (payloadMD.has(PAYLOAD_SLOP_BYTES)) {
+ // does not occur with the tested ARC file
+ }
+
+ StreamCopy.readToEOF(resource.getInputStream());
+ resource = extractor.getNext();
+ }
+ }
+}
diff --git a/src/test/java/org/archive/resource/warc/WARCResourceTest.java b/src/test/java/org/archive/resource/warc/WARCResourceTest.java
new file mode 100644
index 00000000..1b935405
--- /dev/null
+++ b/src/test/java/org/archive/resource/warc/WARCResourceTest.java
@@ -0,0 +1,46 @@
+package org.archive.resource.warc;
+
+import static org.archive.resource.ResourceConstants.PAYLOAD_LENGTH;
+import static org.archive.resource.ResourceConstants.PAYLOAD_SLOP_BYTES;
+
+import java.io.IOException;
+
+import org.archive.extract.ExtractingResourceFactoryMapper;
+import org.archive.extract.ExtractingResourceProducer;
+import org.archive.extract.ProducerUtils;
+import org.archive.extract.ResourceFactoryMapper;
+import org.archive.resource.Resource;
+import org.archive.resource.ResourceParseException;
+import org.archive.resource.ResourceProducer;
+import org.archive.util.StreamCopy;
+
+import org.json.JSONObject;
+
+import junit.framework.TestCase;
+
+public class WARCResourceTest extends TestCase {
+
+ public void testWARCResource() throws ResourceParseException, IOException {
+ String testFileName = "../../format/warc/IAH-urls-wget.warc";
+ ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
+ ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
+ ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
+
+ Resource resource = extractor.getNext();
+
+ while (resource != null) {
+ JSONObject payloadMD = resource.getMetaData().getTopMetaData().getJSONObject("Envelope")
+ .getJSONObject("Payload-Metadata");
+
+ if (payloadMD.has(PAYLOAD_LENGTH)) {
+ assertTrue(payloadMD.getLong(PAYLOAD_LENGTH) != -1);
+ }
+ if (payloadMD.has(PAYLOAD_SLOP_BYTES)) {
+ assertEquals(4, payloadMD.getLong(PAYLOAD_SLOP_BYTES));
+ }
+
+ StreamCopy.readToEOF(resource.getInputStream());
+ resource = extractor.getNext();
+ }
+ }
+}
From c5b779128edd1f0fad2709d4ab1b797326c2cb6c Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 20 Dec 2024 14:10:44 +0900
Subject: [PATCH 056/169] Update CHANGES.md for 1.3.0
---
CHANGES.md | 37 +++++++++++++++++++++++++++++++++++++
1 file changed, 37 insertions(+)
diff --git a/CHANGES.md b/CHANGES.md
index e3afd137..8a0a7d20 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,40 @@
+1.3.0
+-----
+
+#### URL Canonicalization Changed
+
+The output of WaybackURLKeyMaker and other canonicalizers based on BasicURLCanonicalizer has changed for URLs that
+contain non-UTF-8 percent-encoded sequences. For example, when a URL contains "%C3%23" it will now be normalised to
+"%c3%23" whereas previous releases produced "%25c3%23". This change brings webarchive-commons more in line with pywb,
+surt (Python), warcio.js and RFC 3986. While CDX file compatibility with these newer tools should improve, note that CDX
+files generated by the new release which contain such URLs may not work correctly with existing versions of
+OpenWayback that use the older webarchive-commons. [#102](https://github.com/iipc/webarchive-commons/pull/102)
+
+#### Bug fixes
+
+* WAT: Fixed duplicated payload metadata values for "Actual-Content-Length" and "Trailing-Slop-Length" [#103](https://github.com/iipc/webarchive-commons/pull/103)
+* ObjectPlusFilesOutputStream.hardlinkOrCopy now uses `Files.createLink()` instead of executing `ln`. This
+ prevents the potential for security vulnerabilities from command line option injection and improves portability.
+
+#### Dependency upgrades
+
+* fastutil removed
+* dsiutils removed
+
+#### Deprecations
+
+The following classes and enum members have been marked deprecated as a step towards removal of the dependency on
+Apache Commons HttpClient 3.1.
+
+* org.archive.httpclient.HttpRecorderGetMethod
+* org.archive.httpclient.HttpRecorderMethod
+* org.archive.httpclient.HttpRecorderPostMethod
+* org.archive.httpclient.SingleHttpConnectionManager
+* org.archive.httpclient.ThreadLocalHttpConnectionManager
+* org.archive.util.binsearch.impl.http.ApacheHttp31SLR
+* org.archive.util.binsearch.impl.http.ApacheHttp31SLRFactory
+* org.archive.util.binsearch.impl.http.HTTPSeekableLineReaderFactory.HttpLibs.APACHE_31
+
1.2.0
-----
From eee48cc18017dde59b1d12f11654a2c752c63d45 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 20 Dec 2024 14:12:09 +0900
Subject: [PATCH 057/169] [maven-release-plugin] prepare release
webarchive-commons-1.3.0
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 6dec154c..f489826c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commons
webarchive-commons
- 1.2.1-SNAPSHOT
+ 1.3.0
jar
webarchive-commons
@@ -41,7 +41,7 @@
scm:git:git@github.com:iipc/webarchive-commons.git
scm:git:git@github.com:iipc/webarchive-commons.git
https://github.com/iipc/webarchive-commons
- HEAD
+ webarchive-commons-1.3.0
From a8fd8a74b83d3327bc074cf783f6315659fbc715 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Fri, 20 Dec 2024 14:12:13 +0900
Subject: [PATCH 058/169] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index f489826c..74a4bbe6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.netpreserve.commons
webarchive-commons
- 1.3.0
+ 1.3.1-SNAPSHOT
jar
webarchive-commons
@@ -41,7 +41,7 @@
scm:git:git@github.com:iipc/webarchive-commons.git
scm:git:git@github.com:iipc/webarchive-commons.git
https://github.com/iipc/webarchive-commons
- webarchive-commons-1.3.0
+ HEAD
From a3a39598fc7b6947e38161e9f27f6842eed95456 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 11 Mar 2025 10:20:00 +0100
Subject: [PATCH 059/169] Upgrade GitHub workflow actions cache
---
.github/workflows/maven.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
index 8bb55c4e..60fac096 100644
--- a/.github/workflows/maven.yml
+++ b/.github/workflows/maven.yml
@@ -24,7 +24,7 @@ jobs:
distribution: 'temurin'
cache: maven
- name: Cache local Maven repository
- uses: actions/cache@v2
+ uses: actions/cache@v4
with:
path: ~/.m2/repository
key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
From c427a12e82f3cebd6ba57152209d0bb5b9de2619 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Sun, 18 May 2025 09:39:48 +0900
Subject: [PATCH 060/169] Upgrade to JUnit 5
---
CHANGES.md | 7 +
pom.xml | 7 +-
.../java/org/archive/util/TmpDirTestCase.java | 119 ----
.../extract/RealCDXExtractorOutputTest.java | 31 +-
.../format/dns/DNSResponseParserTest.java | 10 +-
.../format/gzip/GZIPMemberSeriesTest.java | 38 +-
.../format/gzip/GZIPMemberWriterTest.java | 5 +-
.../format/gzip/zipnum/ZipNumWriterTest.java | 16 +-
.../http/HttpRequestMessageParserTest.java | 12 +-
.../format/http/HttpResponseParserTest.java | 14 +-
.../json/CompoundORJSONPathSpecTest.java | 5 +-
.../format/json/JSONPathSpecFactoryTest.java | 5 +-
.../org/archive/format/json/JSONViewTest.java | 9 +-
.../format/json/SimpleJSONPathSpecTest.java | 5 +-
.../format/text/html/CDATALexerTest.java | 14 +-
.../archive/io/ArchiveReaderFactoryTest.java | 27 +-
.../io/BufferedSeekInputStreamTest.java | 9 +-
.../archive/io/HeaderedArchiveRecordTest.java | 22 +-
.../archive/io/RecordingInputStreamTest.java | 39 +-
.../archive/io/RecordingOutputStreamTest.java | 74 ++-
.../archive/io/ReplayCharSequenceTest.java | 110 ++--
.../io/RepositionableInputStreamTest.java | 20 +-
.../archive/io/arc/ARCReaderFactoryTest.java | 13 +-
.../org/archive/io/arc/ARCWriterPoolTest.java | 41 +-
.../org/archive/io/arc/ARCWriterTest.java | 121 ++--
.../io/warc/WARCReaderFactoryTest.java | 7 +-
.../org/archive/io/warc/WARCWriterTest.java | 67 ++-
.../org/archive/net/PublicSuffixesTest.java | 55 +-
.../org/archive/resource/MetaDataTest.java | 21 +-
.../archive/resource/arc/ARCResourceTest.java | 6 +-
.../html/ExtractingParseObserverTest.java | 24 +-
.../resource/html/HTMLMetaDataTest.java | 12 +-
.../resource/warc/WARCResourceTest.java | 7 +-
.../org/archive/uid/UUIDGeneratorTest.java | 7 +-
.../url/AggressiveIAURLCanonicalizerTest.java | 9 +-
.../url/BasicURLCanonicalizerTest.java | 39 +-
.../java/org/archive/url/HandyURLTest.java | 13 +-
.../archive/url/IAURLCanonicalizerTest.java | 13 +-
.../url/OrdinaryIAURLCanonicalizerTest.java | 10 +-
.../java/org/archive/url/URLParserTest.java | 11 +-
.../archive/url/URLRegexTransformerTest.java | 45 +-
.../org/archive/url/UsableURIFactoryTest.java | 564 +++++++++---------
.../java/org/archive/url/UsableURITest.java | 16 +-
.../archive/url/WaybackURLKeyMakerTest.java | 7 +-
.../org/archive/util/ArchiveUtilsTest.java | 231 ++++---
.../java/org/archive/util/ByteOpTest.java | 14 +-
.../org/archive/util/CrossProductTest.java | 8 +-
.../java/org/archive/util/FileUtilsTest.java | 69 ++-
.../util/InterruptibleCharSequenceTest.java | 21 +-
.../org/archive/util/MimetypeUtilsTest.java | 63 +-
.../org/archive/util/PropertyUtilsTest.java | 11 +-
.../util/StringFieldExtractorTest.java | 10 +-
src/test/java/org/archive/util/TestUtils.java | 17 +-
.../org/archive/util/anvl/ANVLRecordTest.java | 56 +-
.../util/binsearch/SortedTextFileTest.java | 8 +-
.../iterator/CachingStringFilterTest.java | 5 +-
.../iterator/FilterStringIteratorTest.java | 25 +-
.../iterator/SortedCompositeIteratorTest.java | 8 +-
.../util/zip/GZIPMembersInputStreamTest.java | 157 ++---
59 files changed, 1236 insertions(+), 1173 deletions(-)
delete mode 100644 src/main/java/org/archive/util/TmpDirTestCase.java
diff --git a/CHANGES.md b/CHANGES.md
index 8a0a7d20..478238bf 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,10 @@
+Unreleased
+----------
+
+#### Dependency upgrades
+
+- **junit**: 4.13.2 → 5.12.2
+
1.3.0
-----
diff --git a/pom.xml b/pom.xml
index 74a4bbe6..c70a2cd7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -52,9 +52,10 @@
- junit
- junit
- 4.13.2
+ org.junit.jupiter
+ junit-jupiter
+ 5.12.2
+ test
diff --git a/src/main/java/org/archive/util/TmpDirTestCase.java b/src/main/java/org/archive/util/TmpDirTestCase.java
deleted file mode 100644
index 09ec345b..00000000
--- a/src/main/java/org/archive/util/TmpDirTestCase.java
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.archive.util;
-
-import java.io.File;
-import java.io.IOException;
-
-import junit.framework.TestCase;
-
-
-/**
- * Base class for TestCases that want access to a tmp dir for the writing
- * of files.
- *
- * @author stack
- */
-public abstract class TmpDirTestCase extends TestCase
-{
- /**
- * Name of the system property that holds pointer to tmp directory into
- * which we can safely write files.
- */
- public static final String TEST_TMP_SYSTEM_PROPERTY_NAME = "testtmpdir";
-
- /**
- * Default test tmp.
- */
- public static final String DEFAULT_TEST_TMP_DIR = File.separator + "tmp" +
- File.separator + "heritrix-junit-tests";
-
- /**
- * Directory to write temporary files to.
- */
- private File tmpDir = null;
-
-
- public TmpDirTestCase()
- {
- super();
- }
-
- public TmpDirTestCase(String testName)
- {
- super(testName);
- }
-
- /*
- * @see TestCase#setUp()
- */
- protected void setUp() throws Exception {
- super.setUp();
- this.tmpDir = tmpDir();
- }
-
- /**
- * @return Returns the tmpDir.
- */
- public File getTmpDir()
- {
- return this.tmpDir;
- }
-
- /**
- * Delete any files left over from previous run.
- *
- * @param basename Base name of files we're to clean up.
- */
- public void cleanUpOldFiles(String basename) {
- cleanUpOldFiles(getTmpDir(), basename);
- }
-
- /**
- * Delete any files left over from previous run.
- *
- * @param prefix Base name of files we're to clean up.
- * @param basedir Directory to start cleaning in.
- */
- public void cleanUpOldFiles(File basedir, String prefix) {
- File [] files = FileUtils.getFilesWithPrefix(basedir, prefix);
- if (files != null) {
- for (int i = 0; i < files.length; i++) {
- org.apache.commons.io.FileUtils.deleteQuietly(files[i]);
- }
- }
- }
-
-
- public static File tmpDir() throws IOException {
- String tmpDirStr = System.getProperty(TEST_TMP_SYSTEM_PROPERTY_NAME);
- tmpDirStr = (tmpDirStr == null)? DEFAULT_TEST_TMP_DIR: tmpDirStr;
- File tmpDir = new File(tmpDirStr);
- FileUtils.ensureWriteableDirectory(tmpDir);
-
- if (!tmpDir.canWrite())
- {
- throw new IOException(tmpDir.getAbsolutePath() +
- " is unwriteable.");
- }
-
- return tmpDir;
- }
-}
diff --git a/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java b/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java
index 14f8489d..a716df82 100644
--- a/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java
+++ b/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java
@@ -1,28 +1,29 @@
package org.archive.extract;
-import java.net.MalformedURLException;
import java.net.URI;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.net.URLEncoder;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
-public class RealCDXExtractorOutputTest extends TestCase {
+public class RealCDXExtractorOutputTest {
+
+ @Test
public void testEscapeResolvedUrl() throws Exception {
- String context ="http://www.uni-giessen.de/cms/studium/dateien/informationberatung/merkblattpdf";
- String spec = "http://fss.plone.uni-giessen.de/fß/studium/dateien/informationberatung/merkblattpdf/file/Mérkblatt zur Gestaltung von Nachteilsausgleichen.pdf?föo=bar#änchor";
- String escaped = RealCDXExtractorOutput.resolve(context, spec);
- assertTrue(escaped.indexOf(" ") < 0);
- URI parsed = new URI(escaped);
- assertEquals("änchor", parsed.getFragment());
+ String context = "http://www.uni-giessen.de/cms/studium/dateien/informationberatung/merkblattpdf";
+ String spec = "http://fss.plone.uni-giessen.de/fß/studium/dateien/informationberatung/merkblattpdf/file/Mérkblatt zur Gestaltung von Nachteilsausgleichen.pdf?föo=bar#änchor";
+ String escaped = RealCDXExtractorOutput.resolve(context, spec);
+ assertTrue(escaped.indexOf(" ") < 0);
+ URI parsed = new URI(escaped);
+ assertEquals("änchor", parsed.getFragment());
}
+ @Test
public void testNoDoubleEscaping() throws Exception {
- String spec = "https://www.google.com/search?q=java+escape+url+spaces&ie=utf-8&oe=utf-8";
- String resolved = RealCDXExtractorOutput.resolve(spec, spec);
- assertTrue(spec.equals(resolved));
+ String spec = "https://www.google.com/search?q=java+escape+url+spaces&ie=utf-8&oe=utf-8";
+ String resolved = RealCDXExtractorOutput.resolve(spec, spec);
+ assertTrue(spec.equals(resolved));
}
}
diff --git a/src/test/java/org/archive/format/dns/DNSResponseParserTest.java b/src/test/java/org/archive/format/dns/DNSResponseParserTest.java
index 27d0fdad..7ade0ad5 100644
--- a/src/test/java/org/archive/format/dns/DNSResponseParserTest.java
+++ b/src/test/java/org/archive/format/dns/DNSResponseParserTest.java
@@ -3,15 +3,13 @@
import java.io.ByteArrayInputStream;
import java.io.IOException;
-import org.archive.format.dns.DNSParseException;
-import org.archive.format.dns.DNSRecord;
-import org.archive.format.dns.DNSResponse;
-import org.archive.format.dns.DNSResponseParser;
+import org.junit.jupiter.api.Test;
-import junit.framework.TestCase;
+import static org.junit.jupiter.api.Assertions.assertEquals;
-public class DNSResponseParserTest extends TestCase {
+public class DNSResponseParserTest {
DNSResponseParser parser = new DNSResponseParser();
+ @Test
public void testParse() throws DNSParseException, IOException {
verifyResults("20110328212258\nfarm6.static.flickr.a06.yahoodns.net.\t300\tIN\tA\t98.136.170.121\n",
"20110328212258",new String[][] {{"farm6.static.flickr.a06.yahoodns.net.","300","IN","A","98.136.170.121"}});
diff --git a/src/test/java/org/archive/format/gzip/GZIPMemberSeriesTest.java b/src/test/java/org/archive/format/gzip/GZIPMemberSeriesTest.java
index 2eec46ec..6f218ebb 100644
--- a/src/test/java/org/archive/format/gzip/GZIPMemberSeriesTest.java
+++ b/src/test/java/org/archive/format/gzip/GZIPMemberSeriesTest.java
@@ -9,9 +9,6 @@
import org.archive.util.ByteOp;
import org.archive.util.IAUtils;
import org.archive.util.TestUtils;
-import org.archive.format.gzip.GZIPFormatException;
-import org.archive.format.gzip.GZIPMemberSeries;
-import org.archive.format.gzip.GZIPSeriesMember;
import org.archive.streamcontext.ByteArrayWrappedStream;
import org.archive.streamcontext.SimpleStream;
import org.archive.streamcontext.Stream;
@@ -19,10 +16,13 @@
import com.google.common.io.ByteStreams;
import com.google.common.primitives.Bytes;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class GZIPMemberSeriesTest extends TestCase {
+import static org.junit.jupiter.api.Assertions.*;
+public class GZIPMemberSeriesTest {
+
+ @Test
public void testSingle() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
@@ -38,6 +38,7 @@ public void testSingle() throws IndexOutOfBoundsException, FileNotFoundException
assertNull(s.getNextMember());
}
+ @Test
public void testSingleEmpty() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("empty.gz");
@@ -59,6 +60,7 @@ public void testSingleEmpty() throws IndexOutOfBoundsException, FileNotFoundExce
assertTrue(s.gotEOF());
}
+ @Test
public void testDouble() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
@@ -81,14 +83,14 @@ public void testDouble() throws IndexOutOfBoundsException, FileNotFoundException
assertNull(s.getNextMember());
}
-
+ @Test
public void testSingleCRCStrict() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
byte abcd[] = ByteStreams.toByteArray(is);
byte oldb = abcd[abcd.length-1];
abcd[abcd.length-1] = (byte) (abcd[abcd.length-1] + 1);
- assertFalse(oldb == abcd[abcd.length-1]);
+ assertNotEquals(oldb, abcd[abcd.length - 1]);
ByteArrayInputStream bais = new ByteArrayInputStream(abcd);
Stream stream = new SimpleStream(bais);
@@ -117,14 +119,15 @@ public void testSingleCRCStrict() throws IndexOutOfBoundsException, FileNotFound
}
assertNotNull(e);
}
-
+
+ @Test
public void testSingleCRCLAX() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
byte abcd[] = ByteStreams.toByteArray(is);
byte oldb = abcd[abcd.length-1];
abcd[abcd.length-1] = (byte) (abcd[abcd.length-1] + 1);
- assertFalse(oldb == abcd[abcd.length-1]);
+ assertNotEquals(oldb, abcd[abcd.length - 1]);
ByteArrayInputStream bais = new ByteArrayInputStream(abcd);
Stream stream = new SimpleStream(bais);
@@ -154,7 +157,8 @@ public void testSingleCRCLAX() throws IndexOutOfBoundsException, FileNotFoundExc
assertNull(e);
assertNull(s.getNextMember());
}
-
+
+ @Test
public void testDoubleCRC1LAX() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
@@ -162,7 +166,7 @@ public void testDoubleCRC1LAX() throws IndexOutOfBoundsException, FileNotFoundEx
byte abcdorig[] = ByteOp.copy(abcd);
byte oldb = abcd[abcd.length-1];
abcd[abcd.length-1] = (byte) (abcd[abcd.length-1] + 1);
- assertFalse(oldb == abcd[abcd.length-1]);
+ assertNotEquals(oldb, abcd[abcd.length - 1]);
byte both[] = Bytes.concat(abcd,abcdorig);
@@ -195,7 +199,8 @@ public void testDoubleCRC1LAX() throws IndexOutOfBoundsException, FileNotFoundEx
assertNotNull(m);
TestUtils.assertStreamEquals(m,"abcd".getBytes(IAUtils.UTF8));
}
-
+
+ @Test
public void testSingleDeflateError() throws IndexOutOfBoundsException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
@@ -240,7 +245,7 @@ public void testSingleDeflateError() throws IndexOutOfBoundsException, IOExcepti
assertNull(m);
}
-
+ @Test
public void testDoubleDeflateError() throws IndexOutOfBoundsException, IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
@@ -290,7 +295,8 @@ public void testDoubleDeflateError() throws IndexOutOfBoundsException, IOExcepti
assertFalse(s.gotIOError());
}
-
+
+ @Test
public void testDoubleBiggerDeflateErrOnFirst() throws IOException {
String resource = "double-single-inflate-error.gz";
InputStream is = getClass().getResourceAsStream(resource);
@@ -333,7 +339,8 @@ public void testDoubleBiggerDeflateErrOnFirst() throws IOException {
}
-
+
+ @Test
public void testAutoSkip() throws IOException {
InputStream is = getClass().getResourceAsStream("abcd.gz");
byte abcd[] = ByteStreams.toByteArray(is);
@@ -375,6 +382,7 @@ public void testAutoSkip() throws IOException {
assertTrue(s.gotEOF());
}
+ @Test
public void testWgetProblem() throws IndexOutOfBoundsException, FileNotFoundException, IOException {
InputStream is = getClass().getResourceAsStream("IAH-urls-wget.warc.gz");
new GZIPDecoder().parseHeader(is);
diff --git a/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java b/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java
index 483d2baf..45bc18e4 100644
--- a/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java
+++ b/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java
@@ -7,10 +7,11 @@
import org.archive.util.IAUtils;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class GZIPMemberWriterTest extends TestCase {
+public class GZIPMemberWriterTest {
+ @Test
public void testWrite() throws IOException {
File outFile = File.createTempFile("tmp", ".gz");
GZIPMemberWriter gzw = new GZIPMemberWriter(new FileOutputStream(outFile));
diff --git a/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java b/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java
index cfadbd79..25a5eaa7 100644
--- a/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java
+++ b/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java
@@ -10,19 +10,21 @@
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
-import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import org.archive.format.gzip.GZIPMemberSeries;
import org.archive.format.gzip.GZIPSeriesMember;
import org.archive.streamcontext.SimpleStream;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class ZipNumWriterTest extends TestCase {
+import static org.junit.jupiter.api.Assertions.assertEquals;
+public class ZipNumWriterTest {
+
+ @Test
public void testAddRecord() throws IOException {
- Charset UTF8 = Charset.forName("UTF-8");
- File main = File.createTempFile("test-znw",".main");
+ File main = File.createTempFile("test-znw",".main");
File summ = File.createTempFile("test-znw",".summ");
main.deleteOnExit();
summ.deleteOnExit();
@@ -31,11 +33,11 @@ public void testAddRecord() throws IOException {
ZipNumWriter znw = new ZipNumWriter(new FileOutputStream(main,false),
new FileOutputStream(summ,false), limit);
for(int i = 0; i < 1000; i++) {
- znw.addRecord(String.format("%06d\n",i).getBytes(UTF8));
+ znw.addRecord(String.format("%06d\n",i).getBytes(StandardCharsets.UTF_8));
}
znw.close();
InputStreamReader isr =
- new InputStreamReader(new FileInputStream(summ),UTF8);
+ new InputStreamReader(new FileInputStream(summ), StandardCharsets.UTF_8);
BufferedReader br = new BufferedReader(isr);
String line = null;
int count = 0;
diff --git a/src/test/java/org/archive/format/http/HttpRequestMessageParserTest.java b/src/test/java/org/archive/format/http/HttpRequestMessageParserTest.java
index 50df9dde..9a5d69af 100644
--- a/src/test/java/org/archive/format/http/HttpRequestMessageParserTest.java
+++ b/src/test/java/org/archive/format/http/HttpRequestMessageParserTest.java
@@ -3,16 +3,16 @@
import java.io.ByteArrayInputStream;
import java.io.IOException;
-import org.archive.format.http.HttpConstants;
-import org.archive.format.http.HttpParseException;
-import org.archive.format.http.HttpRequestMessage;
-import org.archive.format.http.HttpRequestMessageParser;
import org.archive.util.IAUtils;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class HttpRequestMessageParserTest extends TestCase implements HttpConstants {
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class HttpRequestMessageParserTest implements HttpConstants {
HttpRequestMessageParser parser = new HttpRequestMessageParser();
+
+ @Test
public void testParse() throws IOException {
assertParse("GET / HTTP/1.0\r\n", METHOD_GET, "/", VERSION_0);
assertParse("GET / HTTP/1.1\r\n", METHOD_GET, "/", VERSION_1);
diff --git a/src/test/java/org/archive/format/http/HttpResponseParserTest.java b/src/test/java/org/archive/format/http/HttpResponseParserTest.java
index ea076a69..631d67c7 100644
--- a/src/test/java/org/archive/format/http/HttpResponseParserTest.java
+++ b/src/test/java/org/archive/format/http/HttpResponseParserTest.java
@@ -5,16 +5,14 @@
import org.archive.util.IAUtils;
import org.archive.util.TestUtils;
-import org.archive.format.http.HttpHeader;
-import org.archive.format.http.HttpHeaders;
-import org.archive.format.http.HttpParseException;
-import org.archive.format.http.HttpResponse;
-import org.archive.format.http.HttpResponseParser;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class HttpResponseParserTest extends TestCase {
+import static org.junit.jupiter.api.Assertions.*;
+public class HttpResponseParserTest {
+
+ @Test
public void testParse() throws IOException {
HttpResponseParser parser = new HttpResponseParser();
@@ -38,6 +36,7 @@ public void testParse() throws IOException {
}
+ @Test
public void testParseWithLf() throws IOException {
HttpResponseParser parser = new HttpResponseParser();
@@ -57,6 +56,7 @@ public void testParseWithLf() throws IOException {
}
+ @Test
public void testParseEmptyHeaderField() throws IOException {
HttpResponseParser parser = new HttpResponseParser();
diff --git a/src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java b/src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java
index 57c21965..ef8c2fa0 100644
--- a/src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java
+++ b/src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java
@@ -6,11 +6,12 @@
import org.json.JSONException;
import org.json.JSONObject;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class CompoundORJSONPathSpecTest extends TestCase {
+public class CompoundORJSONPathSpecTest {
String json1S = "{\"a\":\"A\"}";
String json2S = "{\"b\":\"B\"}";
+ @Test
public void testExtract() throws JSONException {
JSONObject json1 = new JSONObject(json1S);
JSONObject json2 = new JSONObject(json2S);
diff --git a/src/test/java/org/archive/format/json/JSONPathSpecFactoryTest.java b/src/test/java/org/archive/format/json/JSONPathSpecFactoryTest.java
index ab999dca..257cb112 100644
--- a/src/test/java/org/archive/format/json/JSONPathSpecFactoryTest.java
+++ b/src/test/java/org/archive/format/json/JSONPathSpecFactoryTest.java
@@ -4,9 +4,9 @@
import org.json.JSONException;
import org.json.JSONObject;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class JSONPathSpecFactoryTest extends TestCase {
+public class JSONPathSpecFactoryTest {
String json1S = "{\"a\":\"A\"}";
String json2S = "{\"b\":\"B\"}";
@@ -14,6 +14,7 @@ public class JSONPathSpecFactoryTest extends TestCase {
String json4S = "{\"b\":[{\"x\":\"x1\", \"y\":\"y1\"},{\"x\":\"x2\", \"y\":\"y2\"}]}";
+ @Test
public void testGet() throws JSONException {
JSONObject json1 = new JSONObject(json1S);
JSONObject json2 = new JSONObject(json2S);
diff --git a/src/test/java/org/archive/format/json/JSONViewTest.java b/src/test/java/org/archive/format/json/JSONViewTest.java
index 20bd4fe6..aabbe7df 100644
--- a/src/test/java/org/archive/format/json/JSONViewTest.java
+++ b/src/test/java/org/archive/format/json/JSONViewTest.java
@@ -4,14 +4,15 @@
import org.json.JSONException;
import org.json.JSONObject;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class JSONViewTest extends TestCase {
+public class JSONViewTest {
public int getInt(byte b[]) {
return b[0] & 0xff;
}
-
+
+ @Test
public void testBytes() throws JSONException {
JSONObject o = new JSONObject();
o.append("name1", "val\\rue1");
@@ -28,6 +29,8 @@ public void testBytes() throws JSONException {
System.out.format("I(%d) gi(%d)\n",i,gi);
}
}
+
+ @Test
public void testApply() throws JSONException {
String json1S = "{\"url\":\"a\",\"link\":[{\"zz\":\"1\",\"qq\":\"qa\"},{\"zz2\":\"2\",\"qq\":\"qb\"},{\"zz\":\"3\",\"qq\":\"qc\"},{\"zz\":\"4\"}]}";
JSONObject json1 = new JSONObject(json1S);
diff --git a/src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java b/src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java
index a703b49a..640a5a80 100644
--- a/src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java
+++ b/src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java
@@ -4,15 +4,16 @@
import org.json.JSONException;
import org.json.JSONObject;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class SimpleJSONPathSpecTest extends TestCase {
+public class SimpleJSONPathSpecTest {
String json1 = "{\"a\": { \"b\": \"Foo\" }}";
String json2 = "{\"a\": { \"b\": [{\"a\":\"1\"},{\"a\":\"2\"}] }}";
String json3 = "{\"a\": { \"b\": {\"A\":\"11\",\"B\":\"22\"} }}";
String json4 = "{\"a\": { \"b\": [{\"A\":\"11\",\"B\":\"22\"},{\"A\":\"33\",\"B\":\"44\"}] }}";
+ @Test
public void testExtract() throws JSONException {
JSONObject json = new JSONObject(json1);
JSONPathSpec spec = new SimpleJSONPathSpec("a.b");
diff --git a/src/test/java/org/archive/format/text/html/CDATALexerTest.java b/src/test/java/org/archive/format/text/html/CDATALexerTest.java
index 481a3eda..856576ba 100644
--- a/src/test/java/org/archive/format/text/html/CDATALexerTest.java
+++ b/src/test/java/org/archive/format/text/html/CDATALexerTest.java
@@ -1,17 +1,16 @@
package org.archive.format.text.html;
-import org.archive.format.text.html.CDATALexer;
-import org.archive.format.text.html.NodeUtils;
import org.htmlparser.Node;
import org.htmlparser.lexer.Page;
-//import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.ParserException;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
-public class CDATALexerTest extends TestCase {
+import static org.junit.jupiter.api.Assertions.*;
+
+public class CDATALexerTest {
CDATALexer l;
Node n;
private CDATALexer makeLexer(String html) {
@@ -19,7 +18,8 @@ private CDATALexer makeLexer(String html) {
t.setPage(new Page(html));
return t;
}
-
+
+ @Test
public void testNextNode() throws ParserException {
l = makeLexer("blem ");
n = l.nextNode();
@@ -35,6 +35,7 @@ public void testNextNode() throws ParserException {
assertNull(l.nextNode());
}
+ @Test
public void testInJS() throws ParserException {
l = makeLexer("");
assertFalse(l.inCSS());
@@ -54,6 +55,7 @@ public void testInJS() throws ParserException {
assertTrue(NodeUtils.isCloseTagNodeNamed(n, "SCRIPT"));
}
+ @Test
public void testInCSS() throws ParserException {
l = makeLexer("");
assertFalse(l.inCSS());
diff --git a/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java b/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java
index 2313868c..f7ad75d2 100644
--- a/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java
+++ b/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java
@@ -21,29 +21,34 @@
import java.io.File;
import java.io.IOException;
-import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import org.apache.commons.lang.StringUtils;
-import org.archive.io.ArchiveRecord;
import org.archive.io.arc.ARCWriterTest;
-import org.archive.util.TmpDirTestCase;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class ArchiveReaderFactoryTest {
+ @TempDir
+ File tempDir;
-public class ArchiveReaderFactoryTest extends TmpDirTestCase {
/**
* Test local file as URL
* @throws IOException
*/
+ @Test
public void testGetFileURL() throws IOException {
- File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
+ File arc = ARCWriterTest.createARCFile(tempDir, true);
ArchiveReader reader = null;
try {
reader = ArchiveReaderFactory.
get(new URL("file:////" + arc.getAbsolutePath()));
for (Iterator i = reader.iterator(); i.hasNext();) {
ArchiveRecord r = (ArchiveRecord)i.next();
- assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype()));
+ assertTrue(StringUtils.isNotBlank(r.getHeader().getMimetype()),"mime unread");
}
} finally {
if (reader != null) {
@@ -56,14 +61,15 @@ public void testGetFileURL() throws IOException {
* Test local file as File
* @throws IOException
*/
+ @Test
public void testGetFile() throws IOException {
- File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
+ File arc = ARCWriterTest.createARCFile(tempDir, true);
ArchiveReader reader = null;
try {
reader = ArchiveReaderFactory.get(arc.getAbsoluteFile());
for (Iterator i = reader.iterator(); i.hasNext();) {
ArchiveRecord r = (ArchiveRecord)i.next();
- assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype()));
+ assertTrue(StringUtils.isNotBlank(r.getHeader().getMimetype()),"mime unread");
}
} finally {
if (reader != null) {
@@ -76,14 +82,15 @@ public void testGetFile() throws IOException {
* Test local file as String path
* @throws IOException
*/
+ @Test
public void testGetPath() throws IOException {
- File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
+ File arc = ARCWriterTest.createARCFile(tempDir, true);
ArchiveReader reader = null;
try {
reader = ArchiveReaderFactory.get(arc.getAbsoluteFile().getAbsolutePath());
for (Iterator i = reader.iterator(); i.hasNext();) {
ArchiveRecord r = (ArchiveRecord)i.next();
- assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype()));
+ assertTrue(StringUtils.isNotBlank(r.getHeader().getMimetype()),"mime unread");
}
} finally {
if (reader != null) {
diff --git a/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java b/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java
index 270e45e0..f7e8e0b2 100644
--- a/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java
+++ b/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java
@@ -18,9 +18,11 @@
*/
package org.archive.io;
+import org.junit.jupiter.api.Test;
+
import java.util.Random;
-import junit.framework.TestCase;
+import static org.junit.jupiter.api.Assertions.assertEquals;
/**
@@ -29,11 +31,12 @@
*
* @author pjack
*/
-public class BufferedSeekInputStreamTest extends TestCase {
+public class BufferedSeekInputStreamTest {
private static byte[] TEST_DATA = makeTestData();
-
+
+ @Test
public void testPosition() throws Exception {
Random random = new Random();
ArraySeekInputStream asis = new ArraySeekInputStream(TEST_DATA);
diff --git a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
index 9f7e2a15..7988cb2b 100644
--- a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
+++ b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
@@ -26,13 +26,15 @@
import java.util.Map;
import java.util.Set;
-import junit.framework.TestCase;
-
import org.apache.commons.httpclient.Header;
import org.archive.io.arc.ARCRecord;
import org.archive.io.warc.WARCRecord;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
-public class HeaderedArchiveRecordTest extends TestCase {
+public class HeaderedArchiveRecordTest {
private static final String HTTPHEADER = "HTTP/1.1 200 OK\r\n"
+ "Last-Modified: Sun, 28 Aug 2005 14:10:55 GMT\r\n"
+ "Content-Length: 108\r\n" + "Connection: close\r\n"
@@ -41,6 +43,7 @@ public class HeaderedArchiveRecordTest extends TestCase {
+ " Neue Seite 1 \r\n" + " \r\n"
+ " \r\n" + " \r\n" + "