diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestOkHttpPunyCodeNormalization.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestOkHttpPunyCodeNormalization.java
new file mode 100644
index 0000000000..9dde13170e
--- /dev/null
+++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestOkHttpPunyCodeNormalization.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.okhttp;
+
+import okhttp3.HttpUrl;
+import okhttp3.Interceptor;
+import okhttp3.OkHttpClient;
+import okhttp3.Request;
+import okhttp3.Response;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.net.IDN;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * Tests for how OkHttp parses and normalizes hosts in three forms:
+ * - Unicode (e.g. "https://🧠.s.country/...")
+ * - Percent-encoded UTF-8 (e.g. "https://%F0%9F%A7%A0.s.country/...")
+ * - Punycode / ACE (e.g. "https://xn--qv9h.s.country/...")
+ */
+public class TestOkHttpPunyCodeNormalization {
+
+    // U+1F9E0 BRAIN as a literal, as percent-encoded UTF-8 and as the matching Punycode label
+    private static final String BRAIN_UNICODE = "🧠";
+    private static final String BRAIN_PCT_UTF8 = "%F0%9F%A7%A0";
+    private static final String BRAIN_PUNYCODE = "xn--qv9h";
+
+    private static final String PARENT = ".s.country";
+    private static final String PATH = "/p/human-protocol-aligning-hearts-bots";
+
+
+    @Test
+    public void testOkHttpVersion() {
+        // Just for mental sanity, will be removed
+        assertEquals("5.3.2", okhttp3.OkHttp.VERSION);
+    }
+
+    @Test
+    public void unicodeHostNormalizesToPunycode() {
+        HttpUrl url = HttpUrl.parse("https://" + BRAIN_UNICODE + PARENT + PATH);
+        assertNotNull(url, "HttpUrl.parse must accept Unicode host");
+        assertEquals(BRAIN_PUNYCODE + PARENT, url.host());
+    }
+
+    @Test
+    public void percentEncodedHostNormalizesToPunycode() {
+        // This is the CC WARC-Target-URI form. The question: does OkHttp
+        // decode the percent-escapes in the host and IDN-normalize, or
+        // does it leave them as literal characters / mis-normalize?
+        HttpUrl url = HttpUrl.parse("https://" + BRAIN_PCT_UTF8 + PARENT + PATH);
+        assertNotNull(url, "HttpUrl.parse must accept percent-encoded host");
+        assertEquals(
+                BRAIN_PUNYCODE + PARENT, url.host(), "Percent-encoded UTF-8 host must normalize to Punycode for the SAME emoji");
+    }
+
+    @Test
+    public void punycodeHostPassesThrough() {
+        HttpUrl url = HttpUrl.parse("https://" + BRAIN_PUNYCODE + PARENT + PATH);
+        assertNotNull(url);
+        assertEquals(BRAIN_PUNYCODE + PARENT, url.host());
+    }
+
+    @Test
+    public void allThreeFormsProduceEquivalentHost() {
+        HttpUrl uni = HttpUrl.parse("https://" + BRAIN_UNICODE + PARENT + PATH);
+        HttpUrl pct = HttpUrl.parse("https://" + BRAIN_PCT_UTF8 + PARENT + PATH);
+        HttpUrl ace = HttpUrl.parse("https://" + BRAIN_PUNYCODE + PARENT + PATH);
+        assertNotNull(uni);
+        assertNotNull(pct);
+        assertNotNull(ace);
+        assertEquals(uni.host(), pct.host());
+        assertEquals(pct.host(), ace.host());
+    }
+
+    @Test
+    public void pathIsNotMangledByHostNormalization() {
+        // Sanity: percent-decoding the host must not bleed into the path.
+        HttpUrl url = HttpUrl.parse("https://" + BRAIN_PCT_UTF8 + PARENT + PATH);
+        assertNotNull(url);
+        assertEquals(PATH, url.encodedPath());
+    }
+
+    @Test
+    public void javaIdnAgreesWithOkHttp() {
+        // Cross-check OkHttp's host() output against the JDK's IDN.toASCII()
+        // so we know which spec OkHttp is following.
+        String jdk = IDN.toASCII(BRAIN_UNICODE + PARENT, IDN.ALLOW_UNASSIGNED);
+        HttpUrl url = HttpUrl.parse("https://" + BRAIN_UNICODE + PARENT + PATH);
+        assertNotNull(url);
+        assertEquals(jdk, url.host());
+    }
+
+
+    @Test
+    public void hostHeaderMatchesNormalizedHost() throws IOException {
+        // Build a request and intercept it BEFORE it hits the network, so
+        // we can read the exact Host header OkHttp would send. We use an
+        // application interceptor that short-circuits with a synthetic
+        // response — no actual DNS / TCP needed.
+        AtomicReference<String> seenHost = new AtomicReference<>();
+        AtomicReference<String> seenUrl = new AtomicReference<>();
+
+        Interceptor capture = chain -> {
+            Request req = chain.request();
+            seenHost.set(req.header("Host") != null
+                    ? req.header("Host")
+                    : req.url().host()); // OkHttp adds Host at the network layer
+            seenUrl.set(req.url().toString());
+            return new Response.Builder()
+                    .request(req)
+                    .protocol(okhttp3.Protocol.HTTP_1_1)
+                    .code(204)
+                    .message("No Content (synthetic)")
+                    .build();
+        };
+
+        OkHttpClient client = new OkHttpClient.Builder()
+                .addInterceptor(capture)
+                .callTimeout(2, TimeUnit.SECONDS)
+                .build();
+
+        String input = "https://" + BRAIN_PCT_UTF8 + PARENT + PATH;
+        Request req = new Request.Builder().url(input).head().build();
+        try (Response r = client.newCall(req).execute()) {
+            assertEquals(204, r.code());
+        }
+
+        assertEquals(
+                BRAIN_PUNYCODE + PARENT, seenHost.get(),
+                "Effective host derived from a percent-encoded UTF-8 input must be the matching Punycode");
+    }
+
+    // -- Mismatch detector (the CC bug, reproduced if it triggers) -----------
+
+    @Test
+    public void parsedHostMustMatchOriginalEmoji() {
+        // If this ever fails, OkHttp itself is producing a host that
+        // disagrees with the input — which would be the CC WARC bug
+        // happening inside OkHttp. Currently expected to pass.
+        String[] inputs = {
+                "https://" + BRAIN_UNICODE + PARENT + PATH,
+                "https://" + BRAIN_PCT_UTF8 + PARENT + PATH,
+                "https://" + BRAIN_PUNYCODE + PARENT + PATH,
+        };
+        for (String s : inputs) {
+            HttpUrl u = HttpUrl.parse(s);
+            assertNotNull(u, "parse failed for " + s);
+            assertTrue(
+                    u.host().startsWith(BRAIN_PUNYCODE + "."),
+                    "Host for " + s + " was " + u.host() + ", expected to start with " + BRAIN_PUNYCODE);
+        }
+    }
+}
diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestOkHttpRobotsTxtInvalidSlashesNormalization.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestOkHttpRobotsTxtInvalidSlashesNormalization.java
new file mode 100644
index 0000000000..10824f9c36
--- /dev/null
+++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestOkHttpRobotsTxtInvalidSlashesNormalization.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.okhttp;
+
+import okhttp3.*;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.net.IDN;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * Tests how OkHttp parses a URL with superfluous slashes between scheme and host (e.g. "https:////host/path").
+ */
+public class TestOkHttpRobotsTxtInvalidSlashesNormalization {
+
+    @Test
+    public void unicodeHostNormalizesToPunycode() {
+        HttpUrl url = HttpUrl.parse("https:////sites.google.com/bao");
+        assertNotNull(url, "HttpUrl.parse must accept superfluous slashes after the scheme");
+        assertEquals("sites.google.com", url.host());
+    }
+
+
+}
diff --git a/src/test/org/commoncrawl/util/TestWarcWriter.java b/src/test/org/commoncrawl/util/TestWarcWriter.java
index 4f7344010d..af448245eb 100644
--- a/src/test/org/commoncrawl/util/TestWarcWriter.java
+++ b/src/test/org/commoncrawl/util/TestWarcWriter.java
@@ -20,6 +20,7 @@
 import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.protocol.Content;
 import org.commoncrawl.util.test.SegmenterRecordReader;
+import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import java.io.ByteArrayInputStream;
@@ -39,12 +40,14 @@ public void testWriteRevisitRecordContentType() throws Exception {
         ByteArrayOutputStream bos = new ByteArrayOutputStream();
         WarcWriter writer = new WarcWriter(bos);
 
-        File segmentDir = new File(System.getProperty("test.build.data", "."), "test-segments/20260224170658-revisit");
+        File segmentDir = new File(System.getProperty("test.build.data", "."),
+                "test-segments/20260224170658-revisit");
         assertNotNull(segmentDir, "Missing segment resource");
         String segmentPath = segmentDir.getAbsolutePath();
         String url = "https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025";
 
         Content content = SegmenterRecordReader.retrieveContent(segmentPath, url);
+        assertTrue(content.getContent() == null || content.getContent().length == 0, "Content in revisit records must be null or empty.");
         URI targetUri = new URI(content.getUrl());
 
         Metadata metadata = content.getMetadata();
@@ -80,4 +83,125 @@ public void testWriteRevisitRecordContentType() throws Exception {
         assertTrue(warcOutput.contains("WARC-Profile: " + warcProfile),
                 "WARC record should have WARC-Profile header");
     }
+
+    /** Checks that a response record written for a URL with superfluous slashes carries a normalized WARC-Target-URI. */
+    @Test
+    @Disabled("This test is testing a behaviour we are not sure we will implement - fixing the issue downstream instead of upstream. ")
+    public void testWriteResponseRecordWithMalformedURL() throws Exception {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        WarcWriter writer = new WarcWriter(bos);
+
+        File segmentDir = new File(System.getProperty("test.build.data", "."),
+                "test-segments/20260505091103-malformed-urls");
+        assertTrue(segmentDir.isDirectory(), "Missing segment resource");
+        String segmentPath = segmentDir.getAbsolutePath();
+        String url = "https:////sites.google.com/site/lebercailgiteennormandie/robots.txt";
+
+        Content content = SegmenterRecordReader.retrieveContent(segmentPath, url);
+        assertTrue(content.getContent() != null && content.getContent().length > 0, "Content in fetched 200s records must not be null.");
+        URI targetUri = new URI(content.getUrl());
+
+        Metadata metadata = content.getMetadata();
+        String ip = content.getMetadata().get("_ip_");
+        int httpStatusCode = 200;
+
+        Date date = HttpDateFormat.toDate(metadata.get("date"));
+        URI warcinfoId = writer.getRecordId();
+        URI relatedId = writer.getRecordId();
+        String payloadDigest = "sha1:abc123";
+        String blockDigest = "sha1:def456";
+
+        writer.writeWarcResponseRecord(targetUri, ip, httpStatusCode, date,
+                warcinfoId, relatedId, payloadDigest,
+                blockDigest, "false",
+                null,
+                null, content.getContent(), content);
+
+        byte[] compressed = bos.toByteArray();
+        ByteArrayInputStream bis = new ByteArrayInputStream(compressed);
+        GZIPInputStream gis = new GZIPInputStream(bis);
+        ByteArrayOutputStream decompressed = new ByteArrayOutputStream();
+        gis.transferTo(decompressed);
+
+        String warcOutput = decompressed.toString();
+
+        assertTrue(warcOutput.contains("WARC-Target-URI: https://sites.google.com/site/lebercailgiteennormandie/robots.txt"),
+                "WARC-Target-URI should be normalized to a valid URL");
+    }
+
+    /** Checks that a request record written for a URL with superfluous slashes carries a normalized WARC-Target-URI. */
+    @Test
+    @Disabled("This test is testing a behaviour we are not sure we will implement - fixing the issue downstream instead of upstream. ")
+    public void testWriteRequestRecordWithMalformedURL() throws Exception {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        WarcWriter writer = new WarcWriter(bos);
+
+        File segmentDir = new File(System.getProperty("test.build.data", "."),
+                "test-segments/20260505091103-malformed-urls");
+        assertTrue(segmentDir.isDirectory(), "Missing segment resource");
+        String segmentPath = segmentDir.getAbsolutePath();
+        String url = "https:////sites.google.com/site/lebercailgiteennormandie/robots.txt";
+
+        Content content = SegmenterRecordReader.retrieveContent(segmentPath, url);
+        assertTrue(content.getContent() != null && content.getContent().length > 0, "Content in fetched 200s records must not be null.");
+        URI targetUri = new URI(content.getUrl());
+
+        Metadata metadata = content.getMetadata();
+        String ip = content.getMetadata().get("_ip_");
+
+        Date date = HttpDateFormat.toDate(metadata.get("date"));
+        URI warcinfoId = writer.getRecordId();
+
+        writer.writeWarcRequestRecord(targetUri, ip, date,
+                warcinfoId, null, null, content.getContent());
+
+        byte[] compressed = bos.toByteArray();
+        ByteArrayInputStream bis = new ByteArrayInputStream(compressed);
+        GZIPInputStream gis = new GZIPInputStream(bis);
+        ByteArrayOutputStream decompressed = new ByteArrayOutputStream();
+        gis.transferTo(decompressed);
+
+        String warcOutput = decompressed.toString();
+
+        assertTrue(warcOutput.contains("WARC-Target-URI: https://sites.google.com/site/lebercailgiteennormandie/robots.txt"),
+                "WARC-Target-URI should be normalized to a valid URL");
+    }
+
+    /** Checks that a metadata record written for a URL with superfluous slashes carries a normalized WARC-Target-URI. */
+    @Test
+    @Disabled("This test is testing a behaviour we are not sure we will implement - fixing the issue downstream instead of upstream. ")
+    public void testWriteMetadataRecordWithMalformedURL() throws Exception {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        WarcWriter writer = new WarcWriter(bos);
+
+        File segmentDir = new File(System.getProperty("test.build.data", "."),
+                "test-segments/20260505091103-malformed-urls");
+        assertTrue(segmentDir.isDirectory(), "Missing segment resource");
+        String segmentPath = segmentDir.getAbsolutePath();
+        String url = "https:////sites.google.com/site/lebercailgiteennormandie/robots.txt";
+
+        Content content = SegmenterRecordReader.retrieveContent(segmentPath, url);
+        assertTrue(content.getContent() != null && content.getContent().length > 0, "Content in fetched 200s records must not be null.");
+        URI targetUri = new URI(content.getUrl());
+
+        Metadata metadata = content.getMetadata();
+        URI relatedId = writer.getRecordId();
+        String blockDigest = "sha1:def456";
+
+        Date date = HttpDateFormat.toDate(metadata.get("date"));
+        URI warcinfoId = writer.getRecordId();
+
+        writer.writeWarcMetadataRecord(targetUri, date, warcinfoId, relatedId, blockDigest, content.getContent());
+
+        byte[] compressed = bos.toByteArray();
+        ByteArrayInputStream bis = new ByteArrayInputStream(compressed);
+        GZIPInputStream gis = new GZIPInputStream(bis);
+        ByteArrayOutputStream decompressed = new ByteArrayOutputStream();
+        gis.transferTo(decompressed);
+
+        String warcOutput = decompressed.toString();
+
+        assertTrue(warcOutput.contains("WARC-Target-URI: https://sites.google.com/site/lebercailgiteennormandie/robots.txt"),
+                "WARC-Target-URI should be normalized to a valid URL");
+    }
 }
diff --git a/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java b/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java
index 62057f4e17..5570ac54bf 100644
--- a/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java
+++ b/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java
@@ 
-28,7 +28,6 @@ private int run(String path, String url) throws Exception { Content c = new Content(); readers[0].get(k, c); assert (c.getUrl().equals(url)); - assert (c.getContent() == null || c.getContent().length == 0); this.content = c; return 0; diff --git a/src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/.data.crc b/src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/.data.crc new file mode 100644 index 0000000000..91d344a940 Binary files /dev/null and b/src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/.data.crc differ diff --git a/src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/.index.crc b/src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/.index.crc new file mode 100644 index 0000000000..6a28dce48b Binary files /dev/null and b/src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/.index.crc differ diff --git a/src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/data b/src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/data new file mode 100644 index 0000000000..43231dd227 Binary files /dev/null and b/src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/data differ diff --git a/src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/index b/src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/index new file mode 100644 index 0000000000..76fc53d44a Binary files /dev/null and b/src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/index differ