diff --git a/src/main/java/org/archive/url/UsableURIFactory.java b/src/main/java/org/archive/url/UsableURIFactory.java index 9118b850..1059bfbd 100644 --- a/src/main/java/org/archive/url/UsableURIFactory.java +++ b/src/main/java/org/archive/url/UsableURIFactory.java @@ -49,8 +49,8 @@ * @author stack */ public class UsableURIFactory extends URI { - - private static final long serialVersionUID = -6146295130382209042L; + + private static final long serialVersionUID = 2L; /** * Logging instance. @@ -395,9 +395,6 @@ private String fixup(String uri, final URI base, final String charset) } TextUtils.recycleMatcher(matcher); - // now, minimally escape any whitespace - uri = escapeWhitespace(uri); - // For further processing, get uri elements. See the RFC2396REGEX // comment above for explanation of group indices used in the below. // matcher = RFC2396REGEX.matcher(uri); @@ -663,51 +660,6 @@ private String ensureMinimalEscaping(String u, final String charset, return u; } - /** - * Escape any whitespace found. - * - * The parent class takes care of the bulk of escaping. But if any - * instance of escaping is found in the URI, then we ask for parent - * to do NO escaping. Here we escape any whitespace found irrespective - * of whether the uri has already been escaped. We do this for - * case where uri has been judged already-escaped only, its been - * incompletly done and whitespace remains. Spaces, etc., in the URI are - * a real pain. Their presence will break log file and ARC parsing. - * @param uri URI string to check. - * @return uri with spaces escaped if any found. - */ - protected String escapeWhitespace(String uri) { - // Just write a new string anyways. The perl '\s' is not - // as inclusive as the Character.isWhitespace so there are - // whitespace characters we could miss. So, rather than - // write some awkward regex, just go through the string - // a character at a time. Only create buffer first time - // we find a space. - MutableString buffer = null; - for (int i = 0; i < uri.length(); i++) { - char c = uri.charAt(i); - if (Character.isWhitespace(c)) { - if (buffer == null) { - buffer = new MutableString(uri.length() + - 2 /*If space, two extra characters (at least)*/); - buffer.append(uri.substring(0, i)); - } - buffer.append("%"); - String hexStr = Integer.toHexString(c); - if ((hexStr.length() % 2) > 0) { - buffer.append("0"); - } - buffer.append(hexStr); - - } else { - if (buffer != null) { - buffer.append(c); - } - } - } - return (buffer != null)? buffer.toString(): uri; - } - /** * Check port on passed http authority. Make sure the size is not larger * than allowed: See the 'port' definition on this diff --git a/src/test/java/org/archive/url/UsableURIFactoryTest.java b/src/test/java/org/archive/url/UsableURIFactoryTest.java index af190957..73f2b6db 100644 --- a/src/test/java/org/archive/url/UsableURIFactoryTest.java +++ b/src/test/java/org/archive/url/UsableURIFactoryTest.java @@ -174,7 +174,7 @@ public final void testWhitespaceEscaped() throws URIException { assertTrue("Not equal " + uuri.toString(), uuri.toString().equals(tgtUri)); uri = "http://archive.org/index%25\u001D.html"; - tgtUri = "http://archive.org/index%25%1D.html".toLowerCase(); + tgtUri = "http://archive.org/index%25%1D.html"; uuri = UsableURIFactory.getInstance(uri); assertEquals("whitespace escaping", tgtUri, uuri.toString()); uri = "http://gemini.info.usaid.gov/directory/" + @@ -185,6 +185,12 @@ public final void testWhitespaceEscaped() throws URIException { "faxResults.cfm?name=Ebenezer +Rumplestiltskin,&location=" + "RRB%20%20%20%205%2E08%2D006"); assertEquals("whitespace escaping", tgtUri, uuri.toString()); + + // https://webarchive.jira.com/browse/HER-2089 + uri = "http://archive.org/index%25\u3000.html"; + tgtUri = "http://archive.org/index%25%E3%80%80.html"; + uuri = UsableURIFactory.getInstance(uri); + assertEquals("U+3000 ideographic space escaping", tgtUri, uuri.toString()); } // public final void testFailedGetPath() throws URIException {