diff --git a/src/main/java/org/archive/url/BasicURLCanonicalizer.java b/src/main/java/org/archive/url/BasicURLCanonicalizer.java index c09ad6e6..37b448c1 100644 --- a/src/main/java/org/archive/url/BasicURLCanonicalizer.java +++ b/src/main/java/org/archive/url/BasicURLCanonicalizer.java @@ -15,18 +15,18 @@ /** * Canonicalizer that does more or less basic fixup. Based initially on rules * specified at https://developers.google.com/safe-browsing/developers_guide_v2# - * Canonicalization. These rules are designed for clients of google's + * Canonicalization. These rules are designed for clients of Google's * "experimental" Safe Browsing API to "check URLs against Google's * constantly-updated blacklists of suspected phishing and malware pages". * *
- * This class differs from google in treatment of non-ascii input. Google's + * This class differs from Google in treatment of non-ascii input. Google's * rules don't really address this except with one example test case, which * seems to suggest taking raw input bytes and pct-encoding them byte for byte. * Since the input to this class consists of java strings, not raw bytes, that - * wouldn't be possible, even if deemed preferable. Instead + * wouldn't be possible, even if deemed preferable. Instead, * BasicURLCanonicalizer expresses non-ascii characters pct-encoded UTF-8. */ public class BasicURLCanonicalizer implements URLCanonicalizer { @@ -212,6 +212,10 @@ protected static Charset UTF8() { return _UTF8; } + /** + * @param input String to be percent-encoded. Expected to have been unescaped already as far as possible; any %XX escape still present is taken to be a non-UTF-8 byte and is re-emitted as a single upper-case %XX escape rather than being escaped a second time. + * @return percent-encoded string + */ public String escapeOnce(String input) { if (input == null) { return null; @@ -243,6 +247,19 @@ public String escapeOnce(String input) { */ sb = new StringBuilder(input.substring(0, i)); } + if (b == '%' && i < utf8bytes.length - 2) { + // Any hex escapes left at this point represent non-UTF-8 encoded characters. + // Unescape them, so they don't get double-escaped. + int hex1 = getHex(utf8bytes[i + 1]); + if (hex1 >= 0) { + int hex2 = getHex(utf8bytes[i + 2]); + if (hex2 >= 0) { + i = i+2; + b = hex1 * 16 + hex2; + } + } + + } sb.append("%"); String hex = Integer.toHexString(b).toUpperCase(); if (hex.length() == 1) { @@ -337,7 +354,7 @@ public String decode(String input) { * Decodes bytes in bbuf as utf-8 and appends decoded characters to sb. If * decoding of any portion fails, appends the un-decodable %xx%xx sequence * extracted from inputStr instead of decoded characters. See "bad unicode" - * tests in GoogleCanonicalizerTest#testDecode(). Variables only make sense + * tests in BasicURLCanonicalizerTest#testDecode(). Variables only make sense * within context of {@link #decode(String)}. 
* * @param sb diff --git a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java index c21bcbe8..cc100e4c 100644 --- a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java +++ b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java @@ -143,6 +143,9 @@ public void testUnescapeRepeatedly() { assertEquals("%",guc.unescapeRepeatedly("%25%32%35")); assertEquals("168.188.99.26",guc.unescapeRepeatedly("%31%36%38%2e%31%38%38%2e%39%39%2e%32%36")); + + assertEquals("tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5", + guc.unescapeRepeatedly("tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5")); } public void testAttemptIPFormats() throws URIException { diff --git a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java index 26161456..7da046da 100644 --- a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java +++ b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java @@ -23,6 +23,12 @@ public void testMakeKey() throws URISyntaxException { assertEquals("org,archive)/goo?a&b", km.makeKey("http://archive.org/goo/?b&a")); assertEquals("org,archive)/goo?a=1&a=2&b", km.makeKey("http://archive.org/goo/?a=2&b&a=1")); assertEquals("org,archive)/", km.makeKey("http://archive.org:/")); + assertEquals("ua,1kr)/newslist.html?tag=%e4%ee%f8%ea%ee%eb%fc%ed%ee%e5", + km.makeKey("http://1kr.ua/newslist.html?tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5")); + assertEquals("com,aluroba)/tags/%c3%ce%ca%c7%d1%e5%c7.htm", + km.makeKey("http://www.aluroba.com/tags/%C3%CE%CA%C7%D1%E5%C7.htm")); + assertEquals("ac,insbase)/xoops2/modules/xpwiki?%a4%d5%a4%af%a4%aa%a4%ab%b8%a9%a4%aa%a4%aa%a4%ce%a4%b8%a4%e7%a4%a6%bb%d4", + km.makeKey("https://www.insbase.ac/xoops2/modules/xpwiki/?%A4%D5%A4%AF%A4%AA%A4%AB%B8%A9%A4%AA%A4%AA%A4%CE%A4%B8%A4%E7%A4%A6%BB%D4")); } }