diff --git a/.gitignore b/.gitignore
index 2c6546c..17913b3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,3 +39,6 @@ metastore_db/
 
 # Python
 __pycache__/
+
+# Intellij Idea
+.idea
\ No newline at end of file
diff --git a/src/main/java/org/commoncrawl/net/WarcUri.java b/src/main/java/org/commoncrawl/net/WarcUri.java
index 8f658b7..f832138 100644
--- a/src/main/java/org/commoncrawl/net/WarcUri.java
+++ b/src/main/java/org/commoncrawl/net/WarcUri.java
@@ -19,6 +19,9 @@
 import java.net.MalformedURLException;
 import java.net.URISyntaxException;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 /**
  * Parses a string representation of a URI or URL found as WARC-Target-URI and
  * provides access to the parts of the URL/URI. Cf. {@link java.net.URL},
@@ -26,6 +29,8 @@
  */
 public class WarcUri {
 
+	private static final Logger LOG = LoggerFactory.getLogger(WarcUri.class);
+
 	private String uriString;
 	private java.net.URL url;
 	private java.net.URI uri;
@@ -35,23 +40,56 @@ public class WarcUri {
 	public WarcUri(String uriString) {
 		this.uriString = uriString;
 		try {
+			parseAndSetURI(uriString);
+		} catch (URISyntaxException uriExc) {
+			LOG.warn("Failed to parse WARC URI '{}', trying to normalize slashes", this.uriString, uriExc);
+		}
+
+		if (this.hostName == null || (this.hostName.getHostName() != null && this.hostName.getHostName().isEmpty())) {
+			uriString = normalizeMalformedHttpSlashes(uriString);
 			try {
-				url = new java.net.URL(uriString);
-				scheme = url.getProtocol();
-				hostName = new HostName(url);
-				uri = url.toURI();
-			} catch (MalformedURLException urlExc) {
-				// should not happen for HTTP captures (how could the URL have been fetched
-				// otherwise) but may happen for other schemes - dns, whois, ntp, metadata
-				uri = new java.net.URI(uriString);
-				scheme = uri.getScheme();
-				hostName = new HostName(uri);
+				parseAndSetURI(uriString);
+			} catch (URISyntaxException e) {
+				LOG.warn("Failed to parse WARC URI '{}' after normalizing slashes", this.uriString, e);
 			}
-		} catch (URISyntaxException uriExc) {
-			// failed to be parsed into parts
 		}
 	}
 
+	private void parseAndSetURI(String uriString) throws URISyntaxException {
+		try {
+			this.url = new java.net.URL(uriString);
+			this.scheme = url.getProtocol();
+			this.hostName = new HostName(url);
+			this.uri = url.toURI();
+		} catch (MalformedURLException urlExc) {
+			// should not happen for HTTP captures (how could the URL have been fetched
+			// otherwise) but may happen for other schemes - dns, whois, ntp, metadata
+			this.uri = new java.net.URI(uriString);
+			this.scheme = uri.getScheme();
+			this.hostName = new HostName(uri);
+		}
+	}
+
+	static String normalizeMalformedHttpSlashes(String uriString) {
+		String schemePrefix;
+		if (uriString.startsWith("http:")) {
+			schemePrefix = "http:";
+		} else if (uriString.startsWith("https:")) {
+			schemePrefix = "https:";
+		} else {
+			return uriString;
+		}
+		int slashStart = schemePrefix.length();
+		int slashEnd = slashStart;
+		while (slashEnd < uriString.length() && uriString.charAt(slashEnd) == '/') {
+			slashEnd++;
+		}
+		if (slashEnd - slashStart == 2) {
+			return uriString;
+		}
+		return schemePrefix + "//" + uriString.substring(slashEnd);
+	}
+
 	public String getScheme() {
 		return scheme;
 	}
diff --git a/src/test/java/org/commoncrawl/net/WarcUriTest.java b/src/test/java/org/commoncrawl/net/WarcUriTest.java
new file mode 100644
index 0000000..36cae13
--- /dev/null
+++ b/src/test/java/org/commoncrawl/net/WarcUriTest.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.commoncrawl.net;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class WarcUriTest {
+
+    @Test
+    void getHostNameWithMalformedHttpsShouldNotBeEmpty() {
+        WarcUri warcUri = new WarcUri("https:////www.google.com/robots.txt");
+        assertEquals(
+                "www.google.com",
+                warcUri.getHostName().getHostName(),
+                "getHostName() should return 'www.google.com' for the malformed URL.");
+    }
+
+    @Test
+    void getHostNameMalformedHttpShouldNotBeEmpty() {
+        WarcUri warcUri = new WarcUri("http:////www.google.com/robots.txt");
+        assertEquals(
+                "www.google.com",
+                warcUri.getHostName().getHostName(),
+                "getHostName() should return 'www.google.com' for the malformed URL.");
+    }
+
+    @Test
+    void getHostNameValidHttpHostShouldNotBeEmpty() {
+        WarcUri warcUri = new WarcUri("http://sites.google.com////robots.txt");
+        assertEquals(
+                "sites.google.com",
+                warcUri.getHostName().getHostName(),
+                "getHostName() should return 'sites.google.com' for the URL with extra path slashes.");
+    }
+
+    @Test
+    void getHostNameValidHttpsHostShouldNotBeEmpty() {
+        WarcUri warcUri = new WarcUri("https://sites.google.com////robots.txt");
+        assertEquals(
+                "sites.google.com",
+                warcUri.getHostName().getHostName(),
+                "getHostName() should return 'sites.google.com' for the URL with extra path slashes.");
+    }
+
+    @Test
+    void testNormalizeMalformedHttpUrlOK() {
+
+        String url = "http://www.google.com/robots.txt";
+
+        assertEquals(
+                url,
+                WarcUri.normalizeMalformedHttpSlashes(url),
+                "Normalizer should not change a well-formed URL.");
+    }
+
+    @Test
+    void testNormalizeMalformedHttpsUrlOK() {
+
+        String url = "https://www.google.com/robots.txt";
+
+        assertEquals(
+                url,
+                WarcUri.normalizeMalformedHttpSlashes(url),
+                "Normalizer should not change a well-formed URL.");
+    }
+
+    @Test
+    void testNormalizeMalformedHttps3SlashesIsFixed() {
+        assertEquals(
+                "http://www.google.com/robots.txt",
+                WarcUri.normalizeMalformedHttpSlashes("http:///www.google.com/robots.txt"),
+                "Normalizer should fix a malformed URL with three slashes.");
+    }
+
+    @Test
+    void testNormalizeMalformedHttps4SlashesIsFixed() {
+        assertEquals(
+                "http://www.google.com/robots.txt",
+                WarcUri.normalizeMalformedHttpSlashes("http:////www.google.com/robots.txt"),
+                "Normalizer should fix a malformed URL with four slashes.");
+    }
+
+    @Test
+    void testNormalizeMalformedHttps1SlashIsFixed() {
+        assertEquals(
+                "http://www.google.com/robots.txt",
+                WarcUri.normalizeMalformedHttpSlashes("http:/www.google.com/robots.txt"),
+                "Normalizer should fix a malformed URL with one slash.");
+    }
+
+    @Test
+    void testNormalizeMalformedFtpShouldIgnore() {
+        assertEquals(
+                "ftp://////ftp.google.com",
+                WarcUri.normalizeMalformedHttpSlashes("ftp://////ftp.google.com"),
+                "Normalizer should not change a non-HTTP(S) URL.");
+    }
+}
\ No newline at end of file