From 3ef28b5dd76249ae7e9f9fdc028e94edef566d0f Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 11 Nov 2025 15:45:52 +0100 Subject: [PATCH 1/2] BasicURLCanonicalizer: more efficient normalization of dots in host name --- .../archive/url/BasicURLCanonicalizer.java | 35 +++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/url/BasicURLCanonicalizer.java b/src/main/java/org/archive/url/BasicURLCanonicalizer.java index 37b448c1..fe2e0d42 100644 --- a/src/main/java/org/archive/url/BasicURLCanonicalizer.java +++ b/src/main/java/org/archive/url/BasicURLCanonicalizer.java @@ -34,7 +34,9 @@ public class BasicURLCanonicalizer implements URLCanonicalizer { .compile("^(0[0-7]*)(\\.[0-7]+)?(\\.[0-7]+)?(\\.[0-7]+)?$"); Pattern DECIMAL_IP = Pattern .compile("^([1-9][0-9]*)(\\.[0-9]+)?(\\.[0-9]+)?(\\.[0-9]+)?$"); + Pattern MULTIDOT = Pattern.compile("\\.{2,}"); + @Override public void canonicalize(HandyURL url) { url.setHash(null); url.setAuthUser(minimalEscape(url.getAuthUser())); @@ -55,8 +57,7 @@ public void canonicalize(HandyURL url) { host = hostE; } - host = host.replaceAll("^\\.+", "").replaceAll("\\.\\.+", ".") - .replaceAll("\\.$", ""); + host = normalizeDots(host); } String ip = null; @@ -74,6 +75,36 @@ public void canonicalize(HandyURL url) { url.setPath(escapeOnce(normalizePath(path))); } + /** + * Normalize dots in the host name. 
+ * + * @param host + * @return host name with all sequences of dots replaced with a single dot, + * and all leading and trailing dots removed + */ + private String normalizeDots(String host) { + if (host.indexOf('.') == -1) { + return host; + } + int start = 0, end = host.length(); + boolean changed = false; + while (host.charAt(start) == '.') { + start++; + changed = true; + } + while (host.charAt(end - 1) == '.') { + end--; + changed = true; + } + if (changed) { + host = host.substring(start, end); + } + if (host.contains("..")) { + host = MULTIDOT.matcher(host).replaceAll("."); + } + return host; + } + private static final Pattern SINGLE_FORWARDSLASH_PATTERN = Pattern .compile("/"); From 44ec22772a24d5dc916b0b0730625988e20cc865 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 13 Nov 2025 21:13:58 +0100 Subject: [PATCH 2/2] BasicURLCanonicalizer: more efficient normalization of dots in host name Add unit test and prevent StringIndexOutOfBoundsException when the host consists only of dots. --- .../java/org/archive/url/BasicURLCanonicalizer.java | 4 ++-- .../org/archive/url/BasicURLCanonicalizerTest.java | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/url/BasicURLCanonicalizer.java b/src/main/java/org/archive/url/BasicURLCanonicalizer.java index fe2e0d42..f2cee60f 100644 --- a/src/main/java/org/archive/url/BasicURLCanonicalizer.java +++ b/src/main/java/org/archive/url/BasicURLCanonicalizer.java @@ -88,11 +88,11 @@ private String normalizeDots(String host) { } int start = 0, end = host.length(); boolean changed = false; - while (host.charAt(start) == '.') { + while (start < end && host.charAt(start) == '.') { start++; changed = true; } - while (host.charAt(end - 1) == '.') { + while (end > start && host.charAt(end - 1) == '.') { end--; changed = true; } diff --git a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java index 19b1984f..0ab1e3b9 100644 --- 
a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java +++ b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java @@ -286,6 +286,16 @@ public void testUnicodeEscaping() throws URISyntaxException { checkCanonicalization("http://example.org/%F0%9F%82%A1", "http://example.org/%F0%9F%82%A1"); } + @Test + public void testHostDots() throws URISyntaxException { + checkCanonicalization("https://foobar.org./", "https://foobar.org/"); + checkCanonicalization("https://.foobar.org/", "https://foobar.org/"); + checkCanonicalization("https://foo...bar.org/", "https://foo.bar.org/"); + checkCanonicalization("https://...foo...bar.org.../", "https://foo.bar.org/"); + checkCanonicalization("https://localhost/path/file.txt", "https://localhost/path/file.txt"); + checkCanonicalization("https://....../path/file.txt", "https:///path/file.txt"); + } + private void checkCanonicalization(String in, String want) throws URISyntaxException { HandyURL h = URLParser.parse(in); guc.canonicalize(h);