Skip to content

Commit c1545bc

Browse files
committed
fix for HER-2089 - get rid of broken, seemingly unnecessary escapeWhitespace() step of uri fixup
1 parent 7a7cf08 commit c1545bc

2 files changed

Lines changed: 9 additions & 51 deletions

File tree

src/main/java/org/archive/url/UsableURIFactory.java

Lines changed: 2 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@
4949
* @author stack
5050
*/
5151
public class UsableURIFactory extends URI {
52-
53-
private static final long serialVersionUID = -6146295130382209042L;
52+
53+
private static final long serialVersionUID = 2L;
5454

5555
/**
5656
* Logging instance.
@@ -395,9 +395,6 @@ private String fixup(String uri, final URI base, final String charset)
395395
}
396396
TextUtils.recycleMatcher(matcher);
397397

398-
// now, minimally escape any whitespace
399-
uri = escapeWhitespace(uri);
400-
401398
// For further processing, get uri elements. See the RFC2396REGEX
402399
// comment above for explanation of group indices used in the below.
403400
// matcher = RFC2396REGEX.matcher(uri);
@@ -663,51 +660,6 @@ private String ensureMinimalEscaping(String u, final String charset,
663660
return u;
664661
}
665662

666-
/**
667-
* Escape any whitespace found.
668-
*
669-
* The parent class takes care of the bulk of escaping. But if any
670-
* instance of escaping is found in the URI, then we ask for parent
671-
* to do NO escaping. Here we escape any whitespace found irrespective
672-
* of whether the uri has already been escaped. We do this for
673-
* case where uri has been judged already-escaped only, it's been
674-
* incompletely done and whitespace remains. Spaces, etc., in the URI are
675-
* a real pain. Their presence will break log file and ARC parsing.
676-
* @param uri URI string to check.
677-
* @return uri with spaces escaped if any found.
678-
*/
679-
protected String escapeWhitespace(String uri) {
680-
// Just write a new string anyways. The perl '\s' is not
681-
// as inclusive as the Character.isWhitespace so there are
682-
// whitespace characters we could miss. So, rather than
683-
// write some awkward regex, just go through the string
684-
// a character at a time. Only create buffer first time
685-
// we find a space.
686-
MutableString buffer = null;
687-
for (int i = 0; i < uri.length(); i++) {
688-
char c = uri.charAt(i);
689-
if (Character.isWhitespace(c)) {
690-
if (buffer == null) {
691-
buffer = new MutableString(uri.length() +
692-
2 /*If space, two extra characters (at least)*/);
693-
buffer.append(uri.substring(0, i));
694-
}
695-
buffer.append("%");
696-
String hexStr = Integer.toHexString(c);
697-
if ((hexStr.length() % 2) > 0) {
698-
buffer.append("0");
699-
}
700-
buffer.append(hexStr);
701-
702-
} else {
703-
if (buffer != null) {
704-
buffer.append(c);
705-
}
706-
}
707-
}
708-
return (buffer != null)? buffer.toString(): uri;
709-
}
710-
711663
/**
712664
* Check port on passed http authority. Make sure the size is not larger
713665
* than allowed: See the 'port' definition on this

src/test/java/org/archive/url/UsableURIFactoryTest.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ public final void testWhitespaceEscaped() throws URIException {
174174
assertTrue("Not equal " + uuri.toString(),
175175
uuri.toString().equals(tgtUri));
176176
uri = "http://archive.org/index%25\u001D.html";
177-
tgtUri = "http://archive.org/index%25%1D.html".toLowerCase();
177+
tgtUri = "http://archive.org/index%25%1D.html";
178178
uuri = UsableURIFactory.getInstance(uri);
179179
assertEquals("whitespace escaping", tgtUri, uuri.toString());
180180
uri = "http://gemini.info.usaid.gov/directory/" +
@@ -185,6 +185,12 @@ public final void testWhitespaceEscaped() throws URIException {
185185
"faxResults.cfm?name=Ebenezer +Rumplestiltskin,&location=" +
186186
"RRB%20%20%20%205%2E08%2D006");
187187
assertEquals("whitespace escaping", tgtUri, uuri.toString());
188+
189+
// https://webarchive.jira.com/browse/HER-2089
190+
uri = "http://archive.org/index%25\u3000.html";
191+
tgtUri = "http://archive.org/index%25%E3%80%80.html";
192+
uuri = UsableURIFactory.getInstance(uri);
193+
assertEquals("U+3000 ideographic space escaping", tgtUri, uuri.toString());
188194
}
189195

190196
// public final void testFailedGetPath() throws URIException {

0 commit comments

Comments
 (0)