From a80b98dfe4b1c2a7556e7df2574c16426849f6d9 Mon Sep 17 00:00:00 2001
From: Tom Morris
Date: Sat, 26 Aug 2023 20:05:34 -0400
Subject: [PATCH 1/3] Add failing test from Sebastian's issue
---
src/test/java/org/archive/url/BasicURLCanonicalizerTest.java | 3 +++
src/test/java/org/archive/url/WaybackURLKeyMakerTest.java | 4 ++++
2 files changed, 7 insertions(+)
diff --git a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java
index c21bcbe8..cc100e4c 100644
--- a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java
+++ b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java
@@ -143,6 +143,9 @@ public void testUnescapeRepeatedly() {
assertEquals("%",guc.unescapeRepeatedly("%25%32%35"));
assertEquals("168.188.99.26",guc.unescapeRepeatedly("%31%36%38%2e%31%38%38%2e%39%39%2e%32%36"));
+
+ assertEquals("tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5",
+ guc.unescapeRepeatedly("tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5"));
}
public void testAttemptIPFormats() throws URIException {
diff --git a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
index 1a1403ee..86250972 100644
--- a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
+++ b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
@@ -26,6 +26,10 @@ public void testMakeKey() throws URISyntaxException {
assertEquals("192,211,203,34)/robots.txt", km.makeKey("https://34.203.211.192/robots.txt"));
assertEquals("2600:1f18:200d:fb00:2b74:867c:ab0c:150a)/robots.txt",
km.makeKey("https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/robots.txt"));
+ assertEquals("ua,1kr)/newslist.html?tag=%e4%ee%f8%ea%ee%eb%fc%ed%ee%e5",
+ km.makeKey("http://1kr.ua/newslist.html?tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5"));
+ assertEquals("com,aluroba)/tags/%c3%ce%ca%c7%d1%e5%c7.htm",
+ km.makeKey("http://www.aluroba.com/tags/%C3%CE%CA%C7%D1%E5%C7.htm"));
}
}
From 5161306d9ec993d1986f0d092c056f33ba3abdfe Mon Sep 17 00:00:00 2001
From: Tom Morris
Date: Sun, 27 Aug 2023 13:01:19 -0400
Subject: [PATCH 2/3] Add non-UTF-8 encoded test from mailing list
---
src/test/java/org/archive/url/WaybackURLKeyMakerTest.java | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
index 86250972..26371ba8 100644
--- a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
+++ b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
@@ -30,6 +30,8 @@ public void testMakeKey() throws URISyntaxException {
km.makeKey("http://1kr.ua/newslist.html?tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5"));
assertEquals("com,aluroba)/tags/%c3%ce%ca%c7%d1%e5%c7.htm",
km.makeKey("http://www.aluroba.com/tags/%C3%CE%CA%C7%D1%E5%C7.htm"));
+ assertEquals("ac,insbase)/xoops2/modules/xpwiki?%a4%d5%a4%af%a4%aa%a4%ab%b8%a9%a4%aa%a4%aa%a4%ce%a4%b8%a4%e7%a4%a6%bb%d4",
+ km.makeKey("https://www.insbase.ac/xoops2/modules/xpwiki/?%A4%D5%A4%AF%A4%AA%A4%AB%B8%A9%A4%AA%A4%AA%A4%CE%A4%B8%A4%E7%A4%A6%BB%D4"));
}
}
From f7be47bc523c4d06cc7960dc2d3b1b58f9580906 Mon Sep 17 00:00:00 2001
From: Tom Morris
Date: Sun, 27 Aug 2023 13:11:30 -0400
Subject: [PATCH 3/3] Handle non-UTF-8 encoded characters. Fixes #6
---
.../archive/url/BasicURLCanonicalizer.java | 27 +++++++++++++++----
1 file changed, 22 insertions(+), 5 deletions(-)
diff --git a/src/main/java/org/archive/url/BasicURLCanonicalizer.java b/src/main/java/org/archive/url/BasicURLCanonicalizer.java
index c09ad6e6..37b448c1 100644
--- a/src/main/java/org/archive/url/BasicURLCanonicalizer.java
+++ b/src/main/java/org/archive/url/BasicURLCanonicalizer.java
@@ -15,18 +15,18 @@
/**
* Canonicalizer that does more or less basic fixup. Based initially on rules
* specified at https://developers.google.com/safe-browsing/developers_guide_v2#
- * Canonicalization. These rules are designed for clients of google's
+ * Canonicalization. These rules are designed for clients of Google's
* "experimental" Safe Browsing API to "check URLs against Google's
* constantly-updated blacklists of suspected phishing and malware pages".
*
*
- * This class differs from google in treatment of non-ascii input. Google's
+ * This class differs from Google in treatment of non-ascii input. Google's
* rules don't really address this except with one example test case, which
* seems to suggest taking raw input bytes and pct-encoding them byte for byte.
* Since the input to this class consists of java strings, not raw bytes, that
- * wouldn't be possible, even if deemed preferable. Instead
+ * wouldn't be possible, even if deemed preferable. Instead,
* BasicURLCanonicalizer expresses non-ascii characters pct-encoded UTF-8.
*/
public class BasicURLCanonicalizer implements URLCanonicalizer {
@@ -212,6 +212,10 @@ protected static Charset UTF8() {
return _UTF8;
}
+ /**
+ * @param input String to be percent-encoded. Assumed to be unescaped, except for any remaining %XX escapes that are not valid UTF-8.
+ * @return percent-encoded string, or null if input is null
+ */
public String escapeOnce(String input) {
if (input == null) {
return null;
@@ -243,6 +247,19 @@ public String escapeOnce(String input) {
*/
sb = new StringBuilder(input.substring(0, i));
}
+ if (b == '%' && i < utf8bytes.length - 2) {
+ // Any hex escapes left at this point represent non-UTF-8 encoded characters.
+ // Decode each one to its byte value so it is emitted once as %XX below, not double-escaped as %25XX.
+ int hex1 = getHex(utf8bytes[i + 1]);
+ if (hex1 >= 0) {
+ int hex2 = getHex(utf8bytes[i + 2]);
+ if (hex2 >= 0) {
+ i = i+2;
+ b = hex1 * 16 + hex2;
+ }
+ }
+
+ }
sb.append("%");
String hex = Integer.toHexString(b).toUpperCase();
if (hex.length() == 1) {
@@ -337,7 +354,7 @@ public String decode(String input) {
* Decodes bytes in bbuf as utf-8 and appends decoded characters to sb. If
* decoding of any portion fails, appends the un-decodable %xx%xx sequence
* extracted from inputStr instead of decoded characters. See "bad unicode"
- * tests in GoogleCanonicalizerTest#testDecode(). Variables only make sense
+ * tests in BasicURLCanonicalizerTest#testDecode(). Variables only make sense
* within context of {@link #decode(String)}.
*
* @param sb