From 9515fb42642ba86a7e73eba4b3952491f1308427 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 23 Apr 2026 22:27:21 +0100 Subject: [PATCH 01/15] fix: patch the HostName URL to avoid empty Host --- .gitignore | 3 +++ .../java/org/commoncrawl/net/WarcUri.java | 3 +++ .../java/org/commoncrawl/net/WarcUriTest.java | 24 +++++++++++++++++++ 3 files changed, 30 insertions(+) create mode 100644 src/test/java/org/commoncrawl/net/WarcUriTest.java diff --git a/.gitignore b/.gitignore index 2c6546c..17913b3 100644 --- a/.gitignore +++ b/.gitignore @@ -39,3 +39,6 @@ metastore_db/ # Python __pycache__/ + +# Intellij Idea +.idea \ No newline at end of file diff --git a/src/main/java/org/commoncrawl/net/WarcUri.java b/src/main/java/org/commoncrawl/net/WarcUri.java index 8f658b7..3c5a1f9 100644 --- a/src/main/java/org/commoncrawl/net/WarcUri.java +++ b/src/main/java/org/commoncrawl/net/WarcUri.java @@ -35,6 +35,9 @@ public class WarcUri { public WarcUri(String uriString) { this.uriString = uriString; try { + //LF: hot fix to work around malformed robot.txt urls such as https:////sites.google.com/robots.txt + uriString = uriString.replaceFirst("^(https?:)/{2,}", "$1//"); + try { url = new java.net.URL(uriString); scheme = url.getProtocol(); diff --git a/src/test/java/org/commoncrawl/net/WarcUriTest.java b/src/test/java/org/commoncrawl/net/WarcUriTest.java new file mode 100644 index 0000000..7c60946 --- /dev/null +++ b/src/test/java/org/commoncrawl/net/WarcUriTest.java @@ -0,0 +1,24 @@ +package org.commoncrawl.net; + +import org.apache.commons.lang3.StringUtils; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class WarcUriTest { + + @Test + void getHostName_malformedHttps_shouldNotBeEmpty(){ + WarcUri warcUri = new WarcUri("https:////www.google.com/robot.txt"); + assertNotNull(warcUri.getHostName()); + 
assertTrue(StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), "getHostName() should not return an empty string."); + } + + @Test + void getHostName_malformedHttp_shouldNotBeEmpty(){ + WarcUri warcUri = new WarcUri("http:////www.google.com/robot.txt"); + assertNotNull(warcUri.getHostName()); + assertTrue(StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), "getHostName() should not return an empty string."); + } +} \ No newline at end of file From 98fa960077094a087428058150c656ebd91e137d Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 23 Apr 2026 22:32:50 +0100 Subject: [PATCH 02/15] feat: add more tests --- .../java/org/commoncrawl/net/WarcUriTest.java | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/test/java/org/commoncrawl/net/WarcUriTest.java b/src/test/java/org/commoncrawl/net/WarcUriTest.java index 7c60946..2467648 100644 --- a/src/test/java/org/commoncrawl/net/WarcUriTest.java +++ b/src/test/java/org/commoncrawl/net/WarcUriTest.java @@ -3,8 +3,7 @@ import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.*; class WarcUriTest { @@ -13,6 +12,7 @@ void getHostName_malformedHttps_shouldNotBeEmpty(){ WarcUri warcUri = new WarcUri("https:////www.google.com/robot.txt"); assertNotNull(warcUri.getHostName()); assertTrue(StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), "getHostName() should not return an empty string."); + assertEquals("www.google.com", warcUri.getHostName().getHostName(), "getHostName() should return 'www.google.com' for the malformed URL."); } @Test @@ -20,5 +20,22 @@ void getHostName_malformedHttp_shouldNotBeEmpty(){ WarcUri warcUri = new WarcUri("http:////www.google.com/robot.txt"); assertNotNull(warcUri.getHostName()); 
assertTrue(StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), "getHostName() should not return an empty string."); + assertEquals("www.google.com", warcUri.getHostName().getHostName(), "getHostName() should return 'www.google.com' for the malformed URL."); + } + + @Test + void getHostName_validHttpHost_shouldNotBeEmpty(){ + WarcUri warcUri = new WarcUri("http://sites.google.com////robot.txt"); + assertNotNull(warcUri.getHostName()); + assertTrue(StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), "getHostName() should not return an empty string."); + assertEquals("sites.google.com", warcUri.getHostName().getHostName(), "getHostName() should return 'www.google.com' for the malformed URL."); + } + + @Test + void getHostName_validHttpsHost_shouldNotBeEmpty(){ + WarcUri warcUri = new WarcUri("https://sites.google.com////robot.txt"); + assertNotNull(warcUri.getHostName()); + assertTrue(StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), "getHostName() should not return an empty string."); + assertEquals("sites.google.com", warcUri.getHostName().getHostName(), "getHostName() should return 'www.google.com' for the malformed URL."); } } \ No newline at end of file From 083383f54919e000df22c09c62b50573470f0390 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 24 Apr 2026 05:47:41 +0100 Subject: [PATCH 03/15] fix: add logging for WARC URI parsing errors --- src/main/java/org/commoncrawl/net/WarcUri.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/commoncrawl/net/WarcUri.java b/src/main/java/org/commoncrawl/net/WarcUri.java index 3c5a1f9..3f9951f 100644 --- a/src/main/java/org/commoncrawl/net/WarcUri.java +++ b/src/main/java/org/commoncrawl/net/WarcUri.java @@ -19,6 +19,9 @@ import java.net.MalformedURLException; import java.net.URISyntaxException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + /** * Parses a string representation of a URI or URL found as WARC-Target-URI and * 
provides access to the parts of the URL/URI. Cf. {@link java.net.URL}, @@ -26,6 +29,8 @@ */ public class WarcUri { + private static final Logger LOG = LoggerFactory.getLogger(WarcUri.class); + private String uriString; private java.net.URL url; private java.net.URI uri; @@ -51,7 +56,7 @@ public WarcUri(String uriString) { hostName = new HostName(uri); } } catch (URISyntaxException uriExc) { - // failed to be parsed into parts + LOG.warn("Failed to parse WARC URI '{}': {}", this.uriString, uriExc); } } From 56e04060fe05dac37d505eb49eee7825a734b326 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 24 Apr 2026 14:21:49 +0100 Subject: [PATCH 04/15] fix: messages --- src/main/java/org/commoncrawl/net/WarcUri.java | 2 +- src/test/java/org/commoncrawl/net/WarcUriTest.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/commoncrawl/net/WarcUri.java b/src/main/java/org/commoncrawl/net/WarcUri.java index 3f9951f..a449076 100644 --- a/src/main/java/org/commoncrawl/net/WarcUri.java +++ b/src/main/java/org/commoncrawl/net/WarcUri.java @@ -56,7 +56,7 @@ public WarcUri(String uriString) { hostName = new HostName(uri); } } catch (URISyntaxException uriExc) { - LOG.warn("Failed to parse WARC URI '{}': {}", this.uriString, uriExc); + LOG.warn("Failed to parse WARC URI '{}'", this.uriString, uriExc); } } diff --git a/src/test/java/org/commoncrawl/net/WarcUriTest.java b/src/test/java/org/commoncrawl/net/WarcUriTest.java index 2467648..1030b0e 100644 --- a/src/test/java/org/commoncrawl/net/WarcUriTest.java +++ b/src/test/java/org/commoncrawl/net/WarcUriTest.java @@ -28,7 +28,7 @@ void getHostName_validHttpHost_shouldNotBeEmpty(){ WarcUri warcUri = new WarcUri("http://sites.google.com////robot.txt"); assertNotNull(warcUri.getHostName()); assertTrue(StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), "getHostName() should not return an empty string."); - assertEquals("sites.google.com", warcUri.getHostName().getHostName(), 
"getHostName() should return 'www.google.com' for the malformed URL."); + assertEquals("sites.google.com", warcUri.getHostName().getHostName(), "getHostName() should return 'sites.google.com' for the URL with extra path slashes."); } @Test @@ -36,6 +36,6 @@ void getHostName_validHttpsHost_shouldNotBeEmpty(){ WarcUri warcUri = new WarcUri("https://sites.google.com////robot.txt"); assertNotNull(warcUri.getHostName()); assertTrue(StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), "getHostName() should not return an empty string."); - assertEquals("sites.google.com", warcUri.getHostName().getHostName(), "getHostName() should return 'www.google.com' for the malformed URL."); + assertEquals("sites.google.com", warcUri.getHostName().getHostName(), "getHostName() should return 'sites.google.com' for the URL with extra path slashes."); } } \ No newline at end of file From 9d3bf117821f3e3323ba0112377961c1a8e4fdec Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 24 Apr 2026 14:21:49 +0100 Subject: [PATCH 05/15] chore: spotless --- .../java/org/commoncrawl/net/WarcUri.java | 3 +- .../java/org/commoncrawl/net/WarcUriTest.java | 76 ++++++++++++------- 2 files changed, 50 insertions(+), 29 deletions(-) diff --git a/src/main/java/org/commoncrawl/net/WarcUri.java b/src/main/java/org/commoncrawl/net/WarcUri.java index a449076..d042abe 100644 --- a/src/main/java/org/commoncrawl/net/WarcUri.java +++ b/src/main/java/org/commoncrawl/net/WarcUri.java @@ -40,7 +40,8 @@ public class WarcUri { public WarcUri(String uriString) { this.uriString = uriString; try { - //LF: hot fix to work around malformed robot.txt urls such as https:////sites.google.com/robots.txt + // LF: hot fix to work around malformed robot.txt urls such as + // https:////sites.google.com/robots.txt uriString = uriString.replaceFirst("^(https?:)/{2,}", "$1//"); try { diff --git a/src/test/java/org/commoncrawl/net/WarcUriTest.java b/src/test/java/org/commoncrawl/net/WarcUriTest.java index 1030b0e..fcbff2f 
100644 --- a/src/test/java/org/commoncrawl/net/WarcUriTest.java +++ b/src/test/java/org/commoncrawl/net/WarcUriTest.java @@ -7,35 +7,55 @@ class WarcUriTest { - @Test - void getHostName_malformedHttps_shouldNotBeEmpty(){ - WarcUri warcUri = new WarcUri("https:////www.google.com/robot.txt"); - assertNotNull(warcUri.getHostName()); - assertTrue(StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), "getHostName() should not return an empty string."); - assertEquals("www.google.com", warcUri.getHostName().getHostName(), "getHostName() should return 'www.google.com' for the malformed URL."); - } + @Test + void getHostName_malformedHttps_shouldNotBeEmpty() { + WarcUri warcUri = new WarcUri("https:////www.google.com/robot.txt"); + assertNotNull(warcUri.getHostName()); + assertTrue( + StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), + "getHostName() should not return an empty string."); + assertEquals( + "www.google.com", + warcUri.getHostName().getHostName(), + "getHostName() should return 'www.google.com' for the malformed URL."); + } - @Test - void getHostName_malformedHttp_shouldNotBeEmpty(){ - WarcUri warcUri = new WarcUri("http:////www.google.com/robot.txt"); - assertNotNull(warcUri.getHostName()); - assertTrue(StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), "getHostName() should not return an empty string."); - assertEquals("www.google.com", warcUri.getHostName().getHostName(), "getHostName() should return 'www.google.com' for the malformed URL."); - } + @Test + void getHostName_malformedHttp_shouldNotBeEmpty() { + WarcUri warcUri = new WarcUri("http:////www.google.com/robot.txt"); + assertNotNull(warcUri.getHostName()); + assertTrue( + StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), + "getHostName() should not return an empty string."); + assertEquals( + "www.google.com", + warcUri.getHostName().getHostName(), + "getHostName() should return 'www.google.com' for the malformed URL."); + } - @Test - void 
getHostName_validHttpHost_shouldNotBeEmpty(){ - WarcUri warcUri = new WarcUri("http://sites.google.com////robot.txt"); - assertNotNull(warcUri.getHostName()); - assertTrue(StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), "getHostName() should not return an empty string."); - assertEquals("sites.google.com", warcUri.getHostName().getHostName(), "getHostName() should return 'sites.google.com' for the URL with extra path slashes."); - } + @Test + void getHostName_validHttpHost_shouldNotBeEmpty() { + WarcUri warcUri = new WarcUri("http://sites.google.com////robot.txt"); + assertNotNull(warcUri.getHostName()); + assertTrue( + StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), + "getHostName() should not return an empty string."); + assertEquals( + "sites.google.com", + warcUri.getHostName().getHostName(), + "getHostName() should return 'sites.google.com' for the URL with extra path slashes."); + } - @Test - void getHostName_validHttpsHost_shouldNotBeEmpty(){ - WarcUri warcUri = new WarcUri("https://sites.google.com////robot.txt"); - assertNotNull(warcUri.getHostName()); - assertTrue(StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), "getHostName() should not return an empty string."); - assertEquals("sites.google.com", warcUri.getHostName().getHostName(), "getHostName() should return 'sites.google.com' for the URL with extra path slashes."); - } + @Test + void getHostName_validHttpsHost_shouldNotBeEmpty() { + WarcUri warcUri = new WarcUri("https://sites.google.com////robot.txt"); + assertNotNull(warcUri.getHostName()); + assertTrue( + StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), + "getHostName() should not return an empty string."); + assertEquals( + "sites.google.com", + warcUri.getHostName().getHostName(), + "getHostName() should return 'sites.google.com' for the URL with extra path slashes."); + } } \ No newline at end of file From fb0605b119cb869db2ce35c6607098fe848e3725 Mon Sep 17 00:00:00 2001 From: Luca Foppiano 
Date: Wed, 29 Apr 2026 21:08:07 +0100 Subject: [PATCH 06/15] fix: robots.txt --- src/test/java/org/commoncrawl/net/WarcUriTest.java | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/test/java/org/commoncrawl/net/WarcUriTest.java b/src/test/java/org/commoncrawl/net/WarcUriTest.java index fcbff2f..af0895f 100644 --- a/src/test/java/org/commoncrawl/net/WarcUriTest.java +++ b/src/test/java/org/commoncrawl/net/WarcUriTest.java @@ -3,13 +3,15 @@ import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; class WarcUriTest { @Test void getHostName_malformedHttps_shouldNotBeEmpty() { - WarcUri warcUri = new WarcUri("https:////www.google.com/robot.txt"); + WarcUri warcUri = new WarcUri("https:////www.google.com/robots.txt"); assertNotNull(warcUri.getHostName()); assertTrue( StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), @@ -22,7 +24,7 @@ void getHostName_malformedHttps_shouldNotBeEmpty() { @Test void getHostName_malformedHttp_shouldNotBeEmpty() { - WarcUri warcUri = new WarcUri("http:////www.google.com/robot.txt"); + WarcUri warcUri = new WarcUri("http:////www.google.com/robots.txt"); assertNotNull(warcUri.getHostName()); assertTrue( StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), @@ -35,7 +37,7 @@ void getHostName_malformedHttp_shouldNotBeEmpty() { @Test void getHostName_validHttpHost_shouldNotBeEmpty() { - WarcUri warcUri = new WarcUri("http://sites.google.com////robot.txt"); + WarcUri warcUri = new WarcUri("http://sites.google.com////robots.txt"); assertNotNull(warcUri.getHostName()); assertTrue( StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), @@ -48,7 +50,7 @@ void getHostName_validHttpHost_shouldNotBeEmpty() { @Test void 
getHostName_validHttpsHost_shouldNotBeEmpty() { - WarcUri warcUri = new WarcUri("https://sites.google.com////robot.txt"); + WarcUri warcUri = new WarcUri("https://sites.google.com////robots.txt"); assertNotNull(warcUri.getHostName()); assertTrue( StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), From c8dd879929d7fc45f8468a3be83595a384506531 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 29 Apr 2026 21:09:16 +0100 Subject: [PATCH 07/15] feat: use more efficient approach --- .../java/org/commoncrawl/net/WarcUri.java | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/commoncrawl/net/WarcUri.java b/src/main/java/org/commoncrawl/net/WarcUri.java index d042abe..f4a3dfc 100644 --- a/src/main/java/org/commoncrawl/net/WarcUri.java +++ b/src/main/java/org/commoncrawl/net/WarcUri.java @@ -40,9 +40,7 @@ public class WarcUri { public WarcUri(String uriString) { this.uriString = uriString; try { - // LF: hot fix to work around malformed robot.txt urls such as - // https:////sites.google.com/robots.txt - uriString = uriString.replaceFirst("^(https?:)/{2,}", "$1//"); + uriString = normalizeMalformedHttpSlashes(uriString); try { url = new java.net.URL(uriString); @@ -61,6 +59,26 @@ public WarcUri(String uriString) { } } + private static String normalizeMalformedHttpSlashes(String uriString) { + String schemePrefix; + if (uriString.startsWith("http:")) { + schemePrefix = "http:"; + } else if (uriString.startsWith("https:")) { + schemePrefix = "https:"; + } else { + return uriString; + } + int slashStart = schemePrefix.length(); + int slashEnd = slashStart; + while (slashEnd < uriString.length() && uriString.charAt(slashEnd) == '/') { + slashEnd++; + } + if (slashEnd - slashStart < 3) { + return uriString; + } + return schemePrefix + "//" + uriString.substring(slashEnd); + } + public String getScheme() { return scheme; } From 309bb5b4bc52fe1d0eda32997026d51c88855b7c Mon Sep 17 00:00:00 2001 From: Luca Foppiano 
Date: Wed, 29 Apr 2026 21:11:09 +0100 Subject: [PATCH 08/15] fix: forgotten Apache 2 header --- .../java/org/commoncrawl/net/WarcUriTest.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/test/java/org/commoncrawl/net/WarcUriTest.java b/src/test/java/org/commoncrawl/net/WarcUriTest.java index af0895f..2a576b7 100644 --- a/src/test/java/org/commoncrawl/net/WarcUriTest.java +++ b/src/test/java/org/commoncrawl/net/WarcUriTest.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.commoncrawl.net; import org.apache.commons.lang3.StringUtils; From 6d64095ebcc020e78e938273df7e49aa71be4e44 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sat, 2 May 2026 22:18:49 +0100 Subject: [PATCH 09/15] fix: normalize slashes only when the hostname is blank --- .../java/org/commoncrawl/net/WarcUri.java | 38 ++++++++++++------- .../java/org/commoncrawl/net/WarcUriTest.java | 8 ++-- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/src/main/java/org/commoncrawl/net/WarcUri.java b/src/main/java/org/commoncrawl/net/WarcUri.java index f4a3dfc..14cdd29 100644 --- a/src/main/java/org/commoncrawl/net/WarcUri.java +++ b/src/main/java/org/commoncrawl/net/WarcUri.java @@ -19,6 +19,7 @@ import java.net.MalformedURLException; import java.net.URISyntaxException; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -40,22 +41,33 @@ public class WarcUri { public WarcUri(String uriString) { this.uriString = uriString; try { - uriString = normalizeMalformedHttpSlashes(uriString); + parseAndSetURI(uriString); + } catch (URISyntaxException uriExc) { + LOG.warn("Failed to parse WARC URI '{}', trying to normalize slashes", this.uriString, uriExc); + } + if (StringUtils.isBlank(getHostName().getHostName())) { + uriString = normalizeMalformedHttpSlashes(uriString); try { - url = new java.net.URL(uriString); - scheme = url.getProtocol(); - hostName = new HostName(url); - uri = url.toURI(); - } catch (MalformedURLException urlExc) { - // should not happen for HTTP captures (how could the URL have been fetched - // otherwise) but may happen for other schemes - dns, whois, ntp, metadata - uri = new java.net.URI(uriString); - scheme = uri.getScheme(); - hostName = new HostName(uri); + parseAndSetURI(uriString); + } catch (URISyntaxException e) { + LOG.warn("Failed to parse WARC URI '{}' after normalizing slashes", this.uriString, e); } - } catch (URISyntaxException uriExc) { - LOG.warn("Failed to 
parse WARC URI '{}'", this.uriString, uriExc); + } + } + + private void parseAndSetURI(String uriString) throws URISyntaxException { + try { + this.url = new java.net.URL(uriString); + this.scheme = url.getProtocol(); + this.hostName = new HostName(url); + this.uri = url.toURI(); + } catch (MalformedURLException urlExc) { + // should not happen for HTTP captures (how could the URL have been fetched + // otherwise) but may happen for other schemes - dns, whois, ntp, metadata + this.uri = new java.net.URI(uriString); + this.scheme = uri.getScheme(); + this.hostName = new HostName(uri); } } diff --git a/src/test/java/org/commoncrawl/net/WarcUriTest.java b/src/test/java/org/commoncrawl/net/WarcUriTest.java index 2a576b7..8093e6f 100644 --- a/src/test/java/org/commoncrawl/net/WarcUriTest.java +++ b/src/test/java/org/commoncrawl/net/WarcUriTest.java @@ -27,7 +27,7 @@ class WarcUriTest { @Test - void getHostName_malformedHttps_shouldNotBeEmpty() { + void getHostNameWithMalformedHttpsShouldNotBeEmpty() { WarcUri warcUri = new WarcUri("https:////www.google.com/robots.txt"); assertNotNull(warcUri.getHostName()); assertTrue( @@ -40,7 +40,7 @@ void getHostName_malformedHttps_shouldNotBeEmpty() { } @Test - void getHostName_malformedHttp_shouldNotBeEmpty() { + void getHostNameMalformedHttpShouldNotBeEmpty() { WarcUri warcUri = new WarcUri("http:////www.google.com/robots.txt"); assertNotNull(warcUri.getHostName()); assertTrue( @@ -53,7 +53,7 @@ void getHostName_malformedHttp_shouldNotBeEmpty() { } @Test - void getHostName_validHttpHost_shouldNotBeEmpty() { + void getHostNameValidHttpHostShouldNotBeEmpty() { WarcUri warcUri = new WarcUri("http://sites.google.com////robots.txt"); assertNotNull(warcUri.getHostName()); assertTrue( @@ -66,7 +66,7 @@ void getHostName_validHttpHost_shouldNotBeEmpty() { } @Test - void getHostName_validHttpsHost_shouldNotBeEmpty() { + void getHostNameValidHttpsHostShouldNotBeEmpty() { WarcUri warcUri = new 
WarcUri("https://sites.google.com////robots.txt"); assertNotNull(warcUri.getHostName()); assertTrue( From 2cc0c671dda2905bd17475c2cc1a1b003841100e Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sat, 2 May 2026 22:36:01 +0100 Subject: [PATCH 10/15] fix: removed non-direct dependencies --- .../java/org/commoncrawl/net/WarcUri.java | 3 +-- .../java/org/commoncrawl/net/WarcUriTest.java | 21 +++++-------------- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/src/main/java/org/commoncrawl/net/WarcUri.java b/src/main/java/org/commoncrawl/net/WarcUri.java index 14cdd29..94fb745 100644 --- a/src/main/java/org/commoncrawl/net/WarcUri.java +++ b/src/main/java/org/commoncrawl/net/WarcUri.java @@ -19,7 +19,6 @@ import java.net.MalformedURLException; import java.net.URISyntaxException; -import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,7 +45,7 @@ public WarcUri(String uriString) { LOG.warn("Failed to parse WARC URI '{}', trying to normalize slashes", this.uriString, uriExc); } - if (StringUtils.isBlank(getHostName().getHostName())) { + if (this.hostName == null || this.hostName.getHostName().isEmpty()) { uriString = normalizeMalformedHttpSlashes(uriString); try { parseAndSetURI(uriString); diff --git a/src/test/java/org/commoncrawl/net/WarcUriTest.java b/src/test/java/org/commoncrawl/net/WarcUriTest.java index 8093e6f..c9b244c 100644 --- a/src/test/java/org/commoncrawl/net/WarcUriTest.java +++ b/src/test/java/org/commoncrawl/net/WarcUriTest.java @@ -17,12 +17,9 @@ package org.commoncrawl.net; -import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.*; class WarcUriTest { @@ -30,9 +27,7 @@ class WarcUriTest { void 
getHostNameWithMalformedHttpsShouldNotBeEmpty() { WarcUri warcUri = new WarcUri("https:////www.google.com/robots.txt"); assertNotNull(warcUri.getHostName()); - assertTrue( - StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), - "getHostName() should not return an empty string."); + assertNotEquals("", warcUri.getHostName().getHostName(), "getHostName() should not return an empty string."); assertEquals( "www.google.com", warcUri.getHostName().getHostName(), @@ -43,9 +38,7 @@ void getHostNameWithMalformedHttpsShouldNotBeEmpty() { void getHostNameMalformedHttpShouldNotBeEmpty() { WarcUri warcUri = new WarcUri("http:////www.google.com/robots.txt"); assertNotNull(warcUri.getHostName()); - assertTrue( - StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), - "getHostName() should not return an empty string."); + assertNotEquals("", warcUri.getHostName().getHostName(), "getHostName() should not return an empty string."); assertEquals( "www.google.com", warcUri.getHostName().getHostName(), @@ -56,9 +49,7 @@ void getHostNameMalformedHttpShouldNotBeEmpty() { void getHostNameValidHttpHostShouldNotBeEmpty() { WarcUri warcUri = new WarcUri("http://sites.google.com////robots.txt"); assertNotNull(warcUri.getHostName()); - assertTrue( - StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), - "getHostName() should not return an empty string."); + assertNotEquals("", warcUri.getHostName().getHostName(), "getHostName() should not return an empty string."); assertEquals( "sites.google.com", warcUri.getHostName().getHostName(), @@ -69,9 +60,7 @@ void getHostNameValidHttpHostShouldNotBeEmpty() { void getHostNameValidHttpsHostShouldNotBeEmpty() { WarcUri warcUri = new WarcUri("https://sites.google.com////robots.txt"); assertNotNull(warcUri.getHostName()); - assertTrue( - StringUtils.isNotEmpty(warcUri.getHostName().getHostName()), - "getHostName() should not return an empty string."); + assertNotEquals("", warcUri.getHostName().getHostName(), "getHostName() should 
not return an empty string."); assertEquals( "sites.google.com", warcUri.getHostName().getHostName(), From 81f5815556e195c8fdc3c888d77f1e126af01031 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sat, 2 May 2026 22:43:31 +0100 Subject: [PATCH 11/15] test: fix broken test --- src/main/java/org/commoncrawl/net/WarcUri.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/commoncrawl/net/WarcUri.java b/src/main/java/org/commoncrawl/net/WarcUri.java index 94fb745..27e2957 100644 --- a/src/main/java/org/commoncrawl/net/WarcUri.java +++ b/src/main/java/org/commoncrawl/net/WarcUri.java @@ -45,7 +45,7 @@ public WarcUri(String uriString) { LOG.warn("Failed to parse WARC URI '{}', trying to normalize slashes", this.uriString, uriExc); } - if (this.hostName == null || this.hostName.getHostName().isEmpty()) { + if (this.hostName == null || (this.hostName.getHostName() != null && this.hostName.getHostName().isEmpty())) { uriString = normalizeMalformedHttpSlashes(uriString); try { parseAndSetURI(uriString); From 22deb33a43746fd3bad3415f5ebc49d950cd8078 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 4 May 2026 14:27:11 +0100 Subject: [PATCH 12/15] fix: also URLs with a single / --- src/main/java/org/commoncrawl/net/WarcUri.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/commoncrawl/net/WarcUri.java b/src/main/java/org/commoncrawl/net/WarcUri.java index 27e2957..16a0bc9 100644 --- a/src/main/java/org/commoncrawl/net/WarcUri.java +++ b/src/main/java/org/commoncrawl/net/WarcUri.java @@ -84,7 +84,7 @@ private static String normalizeMalformedHttpSlashes(String uriString) { while (slashEnd < uriString.length() && uriString.charAt(slashEnd) == '/') { slashEnd++; } - if (slashEnd - slashStart < 3) { + if (slashEnd - slashStart == 2) { return uriString; } return schemePrefix + "//" + uriString.substring(slashEnd); From 000b3a3ef14abb104d01fd434bb924de02f9df92 Mon Sep 17 00:00:00 2001 From: Luca 
Foppiano Date: Mon, 4 May 2026 14:33:09 +0100 Subject: [PATCH 13/15] fix: also URLs with a single / and add tests --- .../java/org/commoncrawl/net/WarcUri.java | 2 +- .../java/org/commoncrawl/net/WarcUriTest.java | 46 +++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/commoncrawl/net/WarcUri.java b/src/main/java/org/commoncrawl/net/WarcUri.java index 16a0bc9..f832138 100644 --- a/src/main/java/org/commoncrawl/net/WarcUri.java +++ b/src/main/java/org/commoncrawl/net/WarcUri.java @@ -70,7 +70,7 @@ private void parseAndSetURI(String uriString) throws URISyntaxException { } } - private static String normalizeMalformedHttpSlashes(String uriString) { + static String normalizeMalformedHttpSlashes(String uriString) { String schemePrefix; if (uriString.startsWith("http:")) { schemePrefix = "http:"; diff --git a/src/test/java/org/commoncrawl/net/WarcUriTest.java b/src/test/java/org/commoncrawl/net/WarcUriTest.java index c9b244c..e31b5f6 100644 --- a/src/test/java/org/commoncrawl/net/WarcUriTest.java +++ b/src/test/java/org/commoncrawl/net/WarcUriTest.java @@ -66,4 +66,50 @@ void getHostNameValidHttpsHostShouldNotBeEmpty() { warcUri.getHostName().getHostName(), "getHostName() should return 'sites.google.com' for the URL with extra path slashes."); } + + @Test + void testNormalizeMalformedHttpUrlOK() { + + String url = "http://www.google.com/robots.txt"; + + assertEquals( + url, + WarcUri.normalizeMalformedHttpSlashes(url), + "Normalizer should not change a well-formed URL."); + } + + @Test + void testNormalizeMalformedHttpsUrlOK() { + + String url = "https://www.google.com/robots.txt"; + + assertEquals( + url, + WarcUri.normalizeMalformedHttpSlashes(url), + "Normalizer should not change a well-formed URL."); + } + + @Test + void testNormalizeMalformedHttps3SlashesIsFixed() { + assertEquals( + "http://www.google.com/robots.txt", + WarcUri.normalizeMalformedHttpSlashes("http:///www.google.com/robots.txt"), + "Normalizer should 
fix change a malformed URL with three slashes."); + } + + @Test + void testNormalizeMalformedHttps4SlashesIsFixed() { + assertEquals( + "http://www.google.com/robots.txt", + WarcUri.normalizeMalformedHttpSlashes("http:////www.google.com/robots.txt"), + "Normalizer should fix change a malformed URL with four slashes."); + } + + @Test + void testNormalizeMalformedHttps1SlashIsFixed() { + assertEquals( + "http://www.google.com/robots.txt", + WarcUri.normalizeMalformedHttpSlashes("http:/www.google.com/robots.txt"), + "Normalizer should fix change a malformed URL with one slash."); + } } \ No newline at end of file From 52252f2bef99e65510148827f1fb94d992e13399 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 4 May 2026 14:52:38 +0100 Subject: [PATCH 14/15] test: add test with a different scheme --- src/test/java/org/commoncrawl/net/WarcUriTest.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/test/java/org/commoncrawl/net/WarcUriTest.java b/src/test/java/org/commoncrawl/net/WarcUriTest.java index e31b5f6..f121500 100644 --- a/src/test/java/org/commoncrawl/net/WarcUriTest.java +++ b/src/test/java/org/commoncrawl/net/WarcUriTest.java @@ -112,4 +112,12 @@ void testNormalizeMalformedHttps1SlashIsFixed() { WarcUri.normalizeMalformedHttpSlashes("http:/www.google.com/robots.txt"), "Normalizer should fix change a malformed URL with one slash."); } + + @Test + void testNormalizeMalformedFtpShouldIgnore() { + assertEquals( + "ftp://////ftp.google.com", + WarcUri.normalizeMalformedHttpSlashes("ftp://////ftp.google.com"), + "Normalizer should fix change a malformed URL with one slash."); + } } \ No newline at end of file From 8447fcd134af9837e8f6638de198e564cb6c14a4 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 5 May 2026 07:41:24 +0100 Subject: [PATCH 15/15] test: simplify --- src/test/java/org/commoncrawl/net/WarcUriTest.java | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/test/java/org/commoncrawl/net/WarcUriTest.java 
b/src/test/java/org/commoncrawl/net/WarcUriTest.java index f121500..36cae13 100644 --- a/src/test/java/org/commoncrawl/net/WarcUriTest.java +++ b/src/test/java/org/commoncrawl/net/WarcUriTest.java @@ -26,8 +26,6 @@ class WarcUriTest { @Test void getHostNameWithMalformedHttpsShouldNotBeEmpty() { WarcUri warcUri = new WarcUri("https:////www.google.com/robots.txt"); - assertNotNull(warcUri.getHostName()); - assertNotEquals("", warcUri.getHostName().getHostName(), "getHostName() should not return an empty string."); assertEquals( "www.google.com", warcUri.getHostName().getHostName(), @@ -37,8 +35,6 @@ void getHostNameWithMalformedHttpsShouldNotBeEmpty() { @Test void getHostNameMalformedHttpShouldNotBeEmpty() { WarcUri warcUri = new WarcUri("http:////www.google.com/robots.txt"); - assertNotNull(warcUri.getHostName()); - assertNotEquals("", warcUri.getHostName().getHostName(), "getHostName() should not return an empty string."); assertEquals( "www.google.com", warcUri.getHostName().getHostName(), @@ -48,8 +44,6 @@ void getHostNameMalformedHttpShouldNotBeEmpty() { @Test void getHostNameValidHttpHostShouldNotBeEmpty() { WarcUri warcUri = new WarcUri("http://sites.google.com////robots.txt"); - assertNotNull(warcUri.getHostName()); - assertNotEquals("", warcUri.getHostName().getHostName(), "getHostName() should not return an empty string."); assertEquals( "sites.google.com", warcUri.getHostName().getHostName(), @@ -59,8 +53,6 @@ void getHostNameValidHttpHostShouldNotBeEmpty() { @Test void getHostNameValidHttpsHostShouldNotBeEmpty() { WarcUri warcUri = new WarcUri("https://sites.google.com////robots.txt"); - assertNotNull(warcUri.getHostName()); - assertNotEquals("", warcUri.getHostName().getHostName(), "getHostName() should not return an empty string."); assertEquals( "sites.google.com", warcUri.getHostName().getHostName(),