diff --git a/CHANGES.md b/CHANGES.md index 8e787634..7fb2f7c4 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,6 @@ 1.1.5 ----- +* [Escape redirect URLs in RealCDXExtractorOutput](https://github.com/iipc/webarchive-commons/pull/36) * [Tests fail on Windows](https://github.com/iipc/webarchive-commons/issues/2) * [Test fails on Java 8](https://github.com/iipc/webarchive-commons/issues/31) diff --git a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java index 62a423c5..8ca3ff82 100644 --- a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java +++ b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java @@ -4,6 +4,7 @@ import java.io.OutputStream; import java.io.PrintWriter; import java.net.MalformedURLException; +import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.util.List; @@ -307,12 +308,14 @@ private String extractHTMLMetaRefresh(String origUrl, MetaData m) { return "-"; } - private String resolve(String context, String spec) { + static String resolve(String context, String spec) { // TODO: test! 
try { URL cUrl = new URL(context); - URL resolved = new URL(cUrl,spec); - return resolved.toURI().toASCIIString(); + URL url = new URL(cUrl, spec); + // this constructor escapes its arguments, if necessary; its second argument is the full authority, so pass getAuthority() (not getHost()) to keep port and user-info. NOTE: '%' is always quoted here, so already percent-encoded input is encoded again. + URI uri = new URI(url.getProtocol(), url.getAuthority(), url.getPath(), url.getQuery(), url.getRef()); + return uri.toASCIIString(); } catch (URISyntaxException e) { } catch (MalformedURLException e) { diff --git a/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java b/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java new file mode 100644 index 00000000..14f8489d --- /dev/null +++ b/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java @@ -0,0 +1,34 @@ +package org.archive.extract; + +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.net.URLEncoder; + +import junit.framework.TestCase; + + +public class RealCDXExtractorOutputTest extends TestCase { + + public void testEscapeResolvedUrl() throws Exception { + String context ="http://www.uni-giessen.de/cms/studium/dateien/informationberatung/merkblattpdf"; + String spec = "http://fss.plone.uni-giessen.de/fß/studium/dateien/informationberatung/merkblattpdf/file/Mérkblatt zur Gestaltung von Nachteilsausgleichen.pdf?föo=bar#änchor"; + String escaped = RealCDXExtractorOutput.resolve(context, spec); + assertTrue(escaped.indexOf(" ") < 0); + URI parsed = new URI(escaped); + assertEquals("änchor", parsed.getFragment()); + } + + public void testNoDoubleEscaping() throws Exception { + String spec = "https://www.google.com/search?q=java+escape+url+spaces&ie=utf-8&oe=utf-8"; + String resolved = RealCDXExtractorOutput.resolve(spec, spec); + assertTrue(spec.equals(resolved)); + } + + public void testPortPreserved() throws Exception { + String spec = "http://example.org:8080/some path"; + String resolved = RealCDXExtractorOutput.resolve(spec, spec); + assertEquals("http://example.org:8080/some%20path", resolved); + } +}