Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
1.1.5
-----
* [Escape redirect URLs in RealCDXExtractorOutput](https://github.com/iipc/webarchive-commons/pull/36)
* [Tests fail on Windows](https://github.com/iipc/webarchive-commons/issues/2)
* [Test fails on Java 8](https://github.com/iipc/webarchive-commons/issues/31)

Expand Down
9 changes: 6 additions & 3 deletions src/main/java/org/archive/extract/RealCDXExtractorOutput.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import java.io.OutputStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.List;
Expand Down Expand Up @@ -307,12 +308,14 @@ private String extractHTMLMetaRefresh(String origUrl, MetaData m) {
return "-";
}

private String resolve(String context, String spec) {
static String resolve(String context, String spec) {
// TODO: test!
try {
URL cUrl = new URL(context);
URL resolved = new URL(cUrl,spec);
return resolved.toURI().toASCIIString();
URL url = new URL(cUrl, spec);
// this constructor escapes its arguments, if necessary
URI uri = new URI(url.getProtocol(), url.getHost(), url.getPath(), url.getQuery(), url.getRef());
return uri.toASCIIString();

} catch (URISyntaxException e) {
} catch (MalformedURLException e) {
Expand Down
28 changes: 28 additions & 0 deletions src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package org.archive.extract;

import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLEncoder;

import junit.framework.TestCase;


/**
 * Tests for the URL resolution and escaping performed by
 * {@code RealCDXExtractorOutput.resolve(String, String)}.
 */
public class RealCDXExtractorOutputTest extends TestCase {

    /**
     * Resolves a URL whose path, query and fragment contain spaces and
     * non-ASCII characters, then checks that the result contains no literal
     * space and parses as a URI whose decoded fragment is unchanged.
     */
    public void testEscapeResolvedUrl() throws Exception {
        String context = "http://www.uni-giessen.de/cms/studium/dateien/informationberatung/merkblattpdf";
        String spec = "http://fss.plone.uni-giessen.de/fß/studium/dateien/informationberatung/merkblattpdf/file/Mérkblatt zur Gestaltung von Nachteilsausgleichen.pdf?föo=bar#änchor";
        String escaped = RealCDXExtractorOutput.resolve(context, spec);
        assertTrue(escaped.indexOf(" ") < 0);
        // new URI(String) rejects illegal characters with URISyntaxException,
        // so a successful parse also confirms the result is syntactically valid.
        URI parsed = new URI(escaped);
        // getFragment() returns the decoded fragment, so it must round-trip.
        assertEquals("änchor", parsed.getFragment());
    }

    /**
     * Resolves an already well-formed URL against itself and checks that it
     * comes back unchanged, i.e. nothing in it is escaped a second time.
     */
    public void testNoDoubleEscaping() throws Exception {
        String spec = "https://www.google.com/search?q=java+escape+url+spaces&ie=utf-8&oe=utf-8";
        String resolved = RealCDXExtractorOutput.resolve(spec, spec);
        // assertEquals (rather than assertTrue(a.equals(b))) reports both the
        // expected and the actual URL when the comparison fails.
        assertEquals(spec, resolved);
    }
}