From a4748d9e79abb972a6571f5f4d46951be6049b1a Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 27 Nov 2024 13:24:17 +0100 Subject: [PATCH] URLParser and WaybackURLKeyMaker fail on URLs with IPv6 address hostname --- src/main/java/org/archive/url/URLParser.java | 11 ++++++++++- .../java/org/archive/url/URLRegexTransformer.java | 4 ++++ src/test/java/org/archive/url/URLParserTest.java | 3 +++ .../java/org/archive/url/WaybackURLKeyMakerTest.java | 3 +++ 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/archive/url/URLParser.java b/src/main/java/org/archive/url/URLParser.java index a7860b02..bcd0b7fb 100644 --- a/src/main/java/org/archive/url/URLParser.java +++ b/src/main/java/org/archive/url/URLParser.java @@ -226,7 +226,16 @@ public static HandyURL parse(String urlString) throws URISyntaxException { String colonPort = null; int atIndex = uriAuthority.indexOf(COMMERCIAL_AT); - int portColonIndex = uriAuthority.indexOf(COLON,(atIndex<0)?0:atIndex); + int portColonIndex = -1; + int startColonIndex = 0; + if (atIndex > -1) { + startColonIndex = atIndex; + } + if (uriAuthority.charAt(startColonIndex) == '[') { + // IPv6 address + startColonIndex = uriAuthority.indexOf(']', (startColonIndex + 1)); + } + portColonIndex = uriAuthority.indexOf(COLON, startColonIndex); if(atIndex<0 && portColonIndex<0) { // most common case: neither userinfo nor port diff --git a/src/main/java/org/archive/url/URLRegexTransformer.java b/src/main/java/org/archive/url/URLRegexTransformer.java index 617e0225..5f31c81c 100644 --- a/src/main/java/org/archive/url/URLRegexTransformer.java +++ b/src/main/java/org/archive/url/URLRegexTransformer.java @@ -121,6 +121,10 @@ public static String hostToSURT(String host) { // TODO: ensure we DONT reverse IP addresses! String parts[] = host.split("\\.",-1); if(parts.length == 1) { + // strip enclosing "[" and "]" from IPv6 hosts + if (host.charAt(0) == '[' && host.charAt(host.length() - 1) == ']') { + return host.substring(1, host.length() - 1); + } return host; } StringBuilder sb = new StringBuilder(host.length()); diff --git a/src/test/java/org/archive/url/URLParserTest.java b/src/test/java/org/archive/url/URLParserTest.java index b060ffa7..68dfcd23 100644 --- a/src/test/java/org/archive/url/URLParserTest.java +++ b/src/test/java/org/archive/url/URLParserTest.java @@ -86,6 +86,9 @@ public void testParse() throws UnsupportedEncodingException, URISyntaxException checkParse(" \n http://:****@www.archive.org:8080/inde\rx.html?query#foo \r\n \t ", null, "http", "", "****", "www.archive.org", 8080, "/index.html", "query", "foo", "http://:****@www.archive.org:8080/index.html?query#foo", "/index.html?query"); + checkParse("https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/robots.txt", null, "https", null, null, + "[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]", -1, "/robots.txt", null, null, + "https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/robots.txt", "/robots.txt"); } private void checkParse(String s, String opaque, String scheme, String authUser, diff --git a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java index 26161456..1a1403ee 100644 --- a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java +++ b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java @@ -23,6 +23,9 @@ public void testMakeKey() throws URISyntaxException { assertEquals("org,archive)/goo?a&b", km.makeKey("http://archive.org/goo/?b&a")); assertEquals("org,archive)/goo?a=1&a=2&b", km.makeKey("http://archive.org/goo/?a=2&b&a=1")); assertEquals("org,archive)/", km.makeKey("http://archive.org:/")); + assertEquals("192,211,203,34)/robots.txt", km.makeKey("https://34.203.211.192/robots.txt")); + assertEquals("2600:1f18:200d:fb00:2b74:867c:ab0c:150a)/robots.txt", + km.makeKey("https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/robots.txt")); } }