From d6b90e34ff7d95b384a9e272499f43d6f2692b9f Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 20 May 2026 09:50:16 +0200 Subject: [PATCH 1/5] feat: add raw URL handling and normalization for HTTP responses --- .../apache/nutch/net/protocols/Response.java | 17 ++++++++++++++- .../commoncrawl/util/WarcRecordWriter.java | 2 +- .../nutch/protocol/http/api/HttpBase.java | 21 ++++++++++++++++++- .../http/api/HttpRobotRulesParser.java | 3 ++- .../nutch/protocol/htmlunit/HttpResponse.java | 5 +++++ .../nutch/protocol/http/HttpResponse.java | 5 +++++ .../protocol/httpclient/HttpResponse.java | 18 ++++++++++++++++ .../interactiveselenium/HttpResponse.java | 5 +++++ .../apache/nutch/protocol/okhttp/OkHttp.java | 18 ++++++++++++++++ .../nutch/protocol/okhttp/OkHttpResponse.java | 15 +++++++++++-- .../nutch/protocol/selenium/HttpResponse.java | 5 +++++ 11 files changed, 108 insertions(+), 6 deletions(-) diff --git a/src/java/org/apache/nutch/net/protocols/Response.java b/src/java/org/apache/nutch/net/protocols/Response.java index 3fbe932667..3b1ca23f8b 100644 --- a/src/java/org/apache/nutch/net/protocols/Response.java +++ b/src/java/org/apache/nutch/net/protocols/Response.java @@ -85,11 +85,26 @@ public static enum TruncatedContentReason { }; /** - * Get the URL used to retrieve this response. + * Get the URL the protocol actually used to retrieve this response. + * May differ from the URL passed to {@code getResponse(...)} if the protocol + * library normalized or transformed it (e.g. IDN→punycode via OkHttp's + * HttpUrl, or post-redirect URI from Commons HttpClient when + * followRedirects=true). For the originally-requested URL see + * {@link #getRawUrl()}. * @return {@link java.net.URL} */ public URL getUrl(); + /** + * Get the URL the caller originally requested, verbatim, as passed to + * {@code getResponse(...)}. 
Implementers that do not transform URLs may + * simply return {@link #getUrl()} — but they must say so explicitly, so + * that any future plugin which does transform is forced by the compiler + * to consider what raw means for it. + * @return {@link java.net.URL} + */ + public URL getRawUrl(); + /** * Get the response code. * @return protocol response code (int) diff --git a/src/java/org/commoncrawl/util/WarcRecordWriter.java b/src/java/org/commoncrawl/util/WarcRecordWriter.java index 05f2a304f6..9593a61734 100644 --- a/src/java/org/commoncrawl/util/WarcRecordWriter.java +++ b/src/java/org/commoncrawl/util/WarcRecordWriter.java @@ -556,7 +556,7 @@ public synchronized void write(Text key, WarcCapture value) try { targetUri = new URI(url); } catch (URISyntaxException e) { - if (value.datum != null + if (value.datum != null // not a robots.txt && value.datum.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) { // if a successful capture, try to normalize the URL String urlNorm = null; diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java index 63b9224b0d..4a6d172ba5 100755 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -20,6 +20,7 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.Reader; +import java.net.MalformedURLException; import java.net.Proxy; import java.net.URI; import java.net.URL; @@ -49,7 +50,6 @@ import org.apache.nutch.util.GZIPUtils; import org.apache.nutch.util.MimeUtil; import org.apache.nutch.util.DeflateUtils; -import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.IntWritable; @@ -722,6 +722,25 @@ protected static void main(HttpBase http, String[] args) throws Exception { protected abstract Response getResponse(URL url, 
CrawlDatum datum, boolean followRedirects) throws ProtocolException, IOException; + /** + * Resolve a relative URL against a base URL using the protocol's URL + * library. The default uses Java's {@link URL} constructor. Subclasses + * with a more lenient URL parser (e.g. OkHttp's HttpUrl, which handles + * malformed slashes and IDN normalization) should override. + * + * Used by {@link HttpRobotRulesParser} when following robots.txt redirects: + * delegating to this hook means each protocol gets to use the same URL + * parser for redirect resolution that it would use to actually fetch. + * + * @param base the base URL the relative URL is resolved against + * @param relative the relative URL string (typically a Location: header value) + * @return resolved absolute URL + * @throws MalformedURLException if the URL cannot be parsed + */ + protected URL resolveUrl(URL base, String relative) throws MalformedURLException { + return new URL(base, relative); + } + @Override public BaseRobotRules getRobotRules(Text url, CrawlDatum datum, List robotsTxtContent) { diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java index 4f4bd99774..349541ff5a 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java @@ -171,7 +171,8 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url, LOG.debug("Following robots.txt redirect: {} -> {}", robotsUrlRedir, redirectionLocation); try { - robotsUrlRedir = new URL(robotsUrlRedir, redirectionLocation); + robotsUrlRedir = ((HttpBase) http).resolveUrl(robotsUrlRedir, + redirectionLocation); } catch (MalformedURLException e) { LOG.info( "Failed to resolve redirect location for robots.txt: {} -> {} ({})", diff --git 
a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java index 3afcbb25cf..1aeeee2fae 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java +++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java @@ -308,6 +308,11 @@ public URL getUrl() { return url; } + @Override + public URL getRawUrl() { + return url; // htmlunit does not transform URLs; raw equals effective + } + @Override public int getCode() { return code; diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java index 377b784c57..9585ad2607 100644 --- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java +++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java @@ -353,6 +353,11 @@ public URL getUrl() { return url; } + @Override + public URL getRawUrl() { + return url; // raw-socket plugin does not transform; raw equals effective + } + @Override public int getCode() { return code; diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java index 87ee0bb8ac..a83ab3c2fd 100644 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java +++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java @@ -46,6 +46,7 @@ public class HttpResponse implements Response { private URL url; + private URL rawUrl; private byte[] content; private int code; private Metadata headers = new SpellCheckedMetadata(); @@ -71,6 +72,7 @@ public class HttpResponse implements Response { // Prepare GET method for HTTP 
request this.url = url; + this.rawUrl = url; GetMethod get = new GetMethod(url.toString()); get.setFollowRedirects(followRedirects); get.setDoAuthentication(true); @@ -121,6 +123,17 @@ public class HttpResponse implements Response { client.getParams().setParameter("http.useragent", http.getUserAgent()); // NUTCH-1941 code = client.executeMethod(get); + // When followRedirects=true HC3 walks the redirect chain internally; + // getURI() returns the final URI. Capture it so getUrl() honors the + // contract — without this, robots.txt redirects via this plugin + // report the original URL even though a different URL was fetched. + try { + this.url = new URL(get.getURI().toString()); + } catch (org.apache.commons.httpclient.URIException + | java.net.MalformedURLException e) { + // Keep the input URL or try to normalize it? + } + Header[] heads = get.getResponseHeaders(); for (int i = 0; i < heads.length; i++) { @@ -214,6 +227,11 @@ public URL getUrl() { return url; } + @Override + public URL getRawUrl() { + return rawUrl; + } + @Override public int getCode() { return code; diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java index 9081f14b81..83a3ba8f8c 100644 --- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java +++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java @@ -334,6 +334,11 @@ public URL getUrl() { return url; } + @Override + public URL getRawUrl() { + return url; // interactiveselenium does not transform URLs; raw equals effective + } + @Override public int getCode() { return code; diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java 
b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java index b47142cb24..e3480ceda5 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java @@ -20,6 +20,7 @@ import java.lang.invoke.MethodHandles; import java.net.InetAddress; import java.net.InetSocketAddress; +import java.net.MalformedURLException; import java.net.Proxy; import java.net.ProxySelector; import java.net.SocketAddress; @@ -58,6 +59,7 @@ import okhttp3.Gzip; import okhttp3.Handshake; import okhttp3.Headers; +import okhttp3.HttpUrl; import okhttp3.Interceptor; import okhttp3.OkHttpClient; import okhttp3.Protocol; @@ -441,6 +443,22 @@ protected Response getResponse(URL url, CrawlDatum datum, boolean redirect) return new OkHttpResponse(this, url, datum); } + /** + * Resolve a relative URL using OkHttp's {@link HttpUrl} parser, which is + * more lenient than Java's {@link URL} (handles malformed protocol-relative + * slashes such as {@code https:////host/path}, IDN→punycode, host case + * normalization, etc.). Falls back to Java URL if HttpUrl cannot parse. 
+ */ + @Override + protected URL resolveUrl(URL base, String relative) throws MalformedURLException { + HttpUrl baseHttpUrl = HttpUrl.get(base); + HttpUrl resolved = baseHttpUrl.resolve(relative); + if (resolved != null) { + return resolved.url(); + } + return super.resolveUrl(base, relative); + } + public static void main(String[] args) throws Exception { OkHttp okhttp = new OkHttp(); okhttp.setConf(NutchConfiguration.create()); diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java index 9aa1526157..5ae5a901cd 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java @@ -44,6 +44,7 @@ public class OkHttpResponse implements Response { .getLogger(MethodHandles.lookup().lookupClass()); private URL url; + private URL rawUrl; private byte[] content; private int code; private Metadata headers = new Metadata(); @@ -72,7 +73,8 @@ public boolean booleanValue() { public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum) throws ProtocolException, IOException { - this.url = url; + this.url = url; // provisional; overwritten below with the normalized form + this.rawUrl = url; Request.Builder rb = new Request.Builder().url(url); @@ -103,7 +105,11 @@ public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum) } Request request = rb.build(); - okhttp3.Call call = okhttp.getClient(url).newCall(request); + + // OkHttp parsed the URL via HttpUrl; that's the form actually going on + // the wire (IDN→punycode, repeated-slash repair, host lowercasing). 
+ this.url = request.url().url(); + okhttp3.Call call = okhttp.getClient(this.url).newCall(request); // ensure that Response and underlying ResponseBody are closed try (okhttp3.Response response = call.execute()) { @@ -233,6 +239,11 @@ public URL getUrl() { return this.url; } + @Override + public URL getRawUrl() { + return this.rawUrl; + } + @Override public int getCode() { return this.code; diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java index ddfcf72417..d5d105baa8 100644 --- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java +++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java @@ -330,6 +330,11 @@ public URL getUrl() { return url; } + @Override + public URL getRawUrl() { + return url; // selenium does not transform URLs; raw equals effective + } + @Override public int getCode() { return code; From a48f4026be68d5559f37d7f747332ccf8f3b71bd Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 22 May 2026 12:05:57 +0100 Subject: [PATCH 2/5] fix: remove rawUrl as the caller still has it --- .../org/apache/nutch/net/protocols/Response.java | 13 +------------ .../nutch/protocol/htmlunit/HttpResponse.java | 5 ----- .../apache/nutch/protocol/http/HttpResponse.java | 5 ----- .../nutch/protocol/httpclient/HttpResponse.java | 7 ------- .../protocol/interactiveselenium/HttpResponse.java | 5 ----- .../nutch/protocol/okhttp/OkHttpResponse.java | 7 ------- .../nutch/protocol/selenium/HttpResponse.java | 5 ----- 7 files changed, 1 insertion(+), 46 deletions(-) diff --git a/src/java/org/apache/nutch/net/protocols/Response.java b/src/java/org/apache/nutch/net/protocols/Response.java index 3b1ca23f8b..c0c0ba4619 100644 --- a/src/java/org/apache/nutch/net/protocols/Response.java +++ b/src/java/org/apache/nutch/net/protocols/Response.java @@ -89,22
+89,11 @@ public static enum TruncatedContentReason { * May differ from the URL passed to {@code getResponse(...)} if the protocol * library normalized or transformed it (e.g. IDN→punycode via OkHttp's * HttpUrl, or post-redirect URI from Commons HttpClient when - * followRedirects=true). For the originally-requested URL see - * {@link #getRawUrl()}. + * followRedirects=true). * @return {@link java.net.URL} */ public URL getUrl(); - /** - * Get the URL the caller originally requested, verbatim, as passed to - * {@code getResponse(...)}. Implementers that do not transform URLs may - * simply return {@link #getUrl()} — but they must say so explicitly, so - * that any future plugin which does transform is forced by the compiler - * to consider what raw means for it. - * @return {@link java.net.URL} - */ - public URL getRawUrl(); - /** * Get the response code. * @return protocol response code (int) diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java index 1aeeee2fae..3afcbb25cf 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java +++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java @@ -308,11 +308,6 @@ public URL getUrl() { return url; } - @Override - public URL getRawUrl() { - return url; // htmlunit does not transform URLs; raw equals effective - } - @Override public int getCode() { return code; diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java index 9585ad2607..377b784c57 100644 --- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java +++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java @@ -353,11 +353,6 @@ public URL getUrl() { return 
url; } - @Override - public URL getRawUrl() { - return url; // raw-socket plugin does not transform; raw equals effective - } - @Override public int getCode() { return code; diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java index a83ab3c2fd..a7109bce4a 100644 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java +++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java @@ -46,7 +46,6 @@ public class HttpResponse implements Response { private URL url; - private URL rawUrl; private byte[] content; private int code; private Metadata headers = new SpellCheckedMetadata(); @@ -72,7 +71,6 @@ public class HttpResponse implements Response { // Prepare GET method for HTTP request this.url = url; - this.rawUrl = url; GetMethod get = new GetMethod(url.toString()); get.setFollowRedirects(followRedirects); get.setDoAuthentication(true); @@ -227,11 +225,6 @@ public URL getUrl() { return url; } - @Override - public URL getRawUrl() { - return rawUrl; - } - @Override public int getCode() { return code; diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java index 83a3ba8f8c..9081f14b81 100644 --- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java +++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java @@ -334,11 +334,6 @@ public URL getUrl() { return url; } - @Override - public URL getRawUrl() { - return url; // interactiveselenium does not transform URLs; raw equals effective - } - @Override public int getCode() { return code; diff 
--git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java index 5ae5a901cd..db5c2edefe 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java @@ -44,7 +44,6 @@ public class OkHttpResponse implements Response { .getLogger(MethodHandles.lookup().lookupClass()); private URL url; - private URL rawUrl; private byte[] content; private int code; private Metadata headers = new Metadata(); @@ -74,7 +73,6 @@ public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum) throws ProtocolException, IOException { this.url = url; // provisional; overwritten below with the normalized form - this.rawUrl = url; Request.Builder rb = new Request.Builder().url(url); @@ -239,11 +237,6 @@ public URL getUrl() { return this.url; } - @Override - public URL getRawUrl() { - return this.rawUrl; - } - @Override public int getCode() { return this.code; diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java index d5d105baa8..ddfcf72417 100644 --- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java +++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java @@ -330,11 +330,6 @@ public URL getUrl() { return url; } - @Override - public URL getRawUrl() { - return url; // selenium does not transform URLs; raw equals effective - } - @Override public int getCode() { return code; From e2bafaf8980cbe04b2de8c836b11ed20ed0a1715 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 22 May 2026 12:06:45 +0100 Subject: [PATCH 3/5] fix: implement resolveUrl as default method in Protocol --- 
.../org/apache/nutch/protocol/Protocol.java | 13 ++++++++++++ .../nutch/protocol/http/api/HttpBase.java | 20 ------------------- .../apache/nutch/protocol/okhttp/OkHttp.java | 2 +- 3 files changed, 14 insertions(+), 21 deletions(-) diff --git a/src/java/org/apache/nutch/protocol/Protocol.java b/src/java/org/apache/nutch/protocol/Protocol.java index 2514eae33e..34805febb7 100644 --- a/src/java/org/apache/nutch/protocol/Protocol.java +++ b/src/java/org/apache/nutch/protocol/Protocol.java @@ -16,6 +16,7 @@ */ package org.apache.nutch.protocol; +import java.net.MalformedURLException; import java.net.URL; import java.util.List; @@ -40,6 +41,18 @@ public interface Protocol extends Pluggable, Configurable { */ ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum); +/** + * Resolve a relative URL against a base URL using the protocol's URL + * library. + * + * @param base the base URL the relative URL is resolved against + * @param relative the relative URL string (typically a Location: header value) + * @return resolved absolute URL + * @throws MalformedURLException if the URL is malformed + */ + default URL resolveUrl(URL base, String relative) throws MalformedURLException { + return new URL(base, relative); + } /** * Retrieve robot rules applicable for this URL. 
* diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java index 4a6d172ba5..e7689187bc 100755 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -20,7 +20,6 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.Reader; -import java.net.MalformedURLException; import java.net.Proxy; import java.net.URI; import java.net.URL; @@ -722,25 +721,6 @@ protected static void main(HttpBase http, String[] args) throws Exception { protected abstract Response getResponse(URL url, CrawlDatum datum, boolean followRedirects) throws ProtocolException, IOException; - /** - * Resolve a relative URL against a base URL using the protocol's URL - * library. The default uses Java's {@link URL} constructor. Subclasses - * with a more lenient URL parser (e.g. OkHttp's HttpUrl, which handles - * malformed slashes and IDN normalization) should override. - * - * Used by {@link HttpRobotRulesParser} when following robots.txt redirects: - * delegating to this hook means each protocol gets to use the same URL - * parser for redirect resolution that it would use to actually fetch. 
- * - * @param base the base URL the relative URL is resolved against - * @param relative the relative URL string (typically a Location: header value) - * @return resolved absolute URL - * @throws MalformedURLException if the URL cannot be parsed - */ - protected URL resolveUrl(URL base, String relative) throws MalformedURLException { - return new URL(base, relative); - } - @Override public BaseRobotRules getRobotRules(Text url, CrawlDatum datum, List robotsTxtContent) { diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java index e3480ceda5..be26886fed 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java @@ -450,7 +450,7 @@ protected Response getResponse(URL url, CrawlDatum datum, boolean redirect) * normalization, etc.). Falls back to Java URL if HttpUrl cannot parse. 
*/ @Override - protected URL resolveUrl(URL base, String relative) throws MalformedURLException { + public URL resolveUrl(URL base, String relative) throws MalformedURLException { HttpUrl baseHttpUrl = HttpUrl.get(base); HttpUrl resolved = baseHttpUrl.resolve(relative); if (resolved != null) { From 12a7f433d5419554a2626adc52a466dda7aa184a Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 22 May 2026 12:07:28 +0100 Subject: [PATCH 4/5] feat: log URL request transformation/normalization at debug level when the URL differs --- .../java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java index db5c2edefe..8d7966199a 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java @@ -107,6 +107,9 @@ public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum) // OkHttp parsed the URL via HttpUrl; that's the form actually going on // the wire (IDN→punycode, repeated-slash repair, host lowercasing).
this.url = request.url().url(); + if (LOG.isDebugEnabled() && !this.url.toString().equals(url.toString())) { + LOG.debug("The normalized URL different from the requested URL: {} -> {}", url, this.url); + } okhttp3.Call call = okhttp.getClient(this.url).newCall(request); // ensure that Response and underlying ResponseBody are closed From ff1fd25af197da42f388d8f5c0471067a92e7753 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 22 May 2026 12:12:38 +0100 Subject: [PATCH 5/5] doc: improve description --- src/java/org/apache/nutch/net/protocols/Response.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/java/org/apache/nutch/net/protocols/Response.java b/src/java/org/apache/nutch/net/protocols/Response.java index c0c0ba4619..2f52784373 100644 --- a/src/java/org/apache/nutch/net/protocols/Response.java +++ b/src/java/org/apache/nutch/net/protocols/Response.java @@ -85,11 +85,7 @@ public static enum TruncatedContentReason { }; /** - * Get the URL the protocol actually used to retrieve this response. - * May differ from the URL passed to {@code getResponse(...)} if the protocol - * library normalized or transformed it (e.g. IDN→punycode via OkHttp's - * HttpUrl, or post-redirect URI from Commons HttpClient when - * followRedirects=true). + * Get the URL the protocol actually used when requesting the Response. * @return {@link java.net.URL} */ public URL getUrl();