From fc28d7ffeccfdcaa26eef9d41ab985c29253fe29 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 4 May 2026 15:08:11 +0100 Subject: [PATCH 01/16] fix: apply URL normalisation for robots.txt redirect --- .../http/api/HttpRobotRulesParser.java | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java index 4f4bd99774..267b8f059f 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java @@ -84,6 +84,26 @@ protected static String getCacheKey(URL url) { return cacheKey; } + static String normalizeMalformedHttpSlashes(String uriString) { + String schemePrefix; + if (uriString.startsWith("http:")) { + schemePrefix = "http:"; + } else if (uriString.startsWith("https:")) { + schemePrefix = "https:"; + } else { + return uriString; + } + int slashStart = schemePrefix.length(); + int slashEnd = slashStart; + while (slashEnd < uriString.length() && uriString.charAt(slashEnd) == '/') { + slashEnd++; + } + if (slashEnd - slashStart == 2) { + return uriString; + } + return schemePrefix + "//" + uriString.substring(slashEnd); + } + /** * Get the rules from robots.txt which applies for the given {@code url}. * Robot rules are cached for a unique combination of host, protocol, and @@ -178,6 +198,19 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url, robotsUrlRedir, redirectionLocation, e.getMessage()); break; } + + if (robotsUrlRedir.getHost() == null || robotsUrlRedir.getHost().isEmpty()) { + LOG.info("Robots.txt redirect resolved to malformed URL {} (initial: {}); will normalize", + robotsUrlRedir, robotsUrl); + try { + robotsUrlRedir = new URL(normalizeMalformedHttpSlashes(robotsUrlRedir.toString())); + } catch (MalformedURLException mue) { + cacheRule = false; // TODO: validate with Sebastian + LOG.info("Failed to normalize malformed robots.txt redirect: {} ({})", + robotsUrlRedir, mue.getMessage()); + } + } + response = ((HttpBase) http).getResponse(robotsUrlRedir, new CrawlDatum(), true); code = response.getCode(); From 5ef314cd925a1818e45c773855b4b3d7f2028428 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 4 May 2026 15:38:38 +0100 Subject: [PATCH 02/16] fix: exit strategy --- .../apache/nutch/protocol/http/api/HttpRobotRulesParser.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java index 267b8f059f..72c7280947 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java @@ -196,6 +196,7 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url, LOG.info( "Failed to resolve redirect location for robots.txt: {} -> {} ({})", robotsUrlRedir, redirectionLocation, e.getMessage()); + cacheRule = false; // TODO: validate with Sebastian break; } @@ -208,6 +209,7 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url, cacheRule = false; // TODO: validate with Sebastian LOG.info("Failed to normalize malformed robots.txt redirect: {} ({})", robotsUrlRedir, mue.getMessage()); + break; } } From d18e8f5320079c19fe9afea3cda4211ca4270561 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 4 May 2026 16:07:24 +0100 Subject: [PATCH 03/16] feat: add unit tests on the normalization method --- .../http/api/TestHttpRobotRulesParser.java | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestHttpRobotRulesParser.java diff --git a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestHttpRobotRulesParser.java b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestHttpRobotRulesParser.java new file mode 100644 index 0000000000..f03ea600da --- /dev/null +++ b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestHttpRobotRulesParser.java @@ -0,0 +1,63 @@ +package org.apache.nutch.protocol.http.api; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +class TestHttpRobotRulesParser { + + @Test + void testNormalizeMalformedHttpUrlOK() { + + String url = "http://www.google.com/robots.txt"; + + assertEquals( + url, + HttpRobotRulesParser.normalizeMalformedHttpSlashes(url), + "Normalizer should not change a well-formed URL."); + } + + @Test + void testNormalizeMalformedHttpsUrlOK() { + + String url = "https://www.google.com/robots.txt"; + + assertEquals( + url, + HttpRobotRulesParser.normalizeMalformedHttpSlashes(url), + "Normalizer should not change a well-formed URL."); + } + + @Test + void testNormalizeMalformedHttps3SlashesIsFixed() { + assertEquals( + "http://www.google.com/robots.txt", + HttpRobotRulesParser.normalizeMalformedHttpSlashes("http:///www.google.com/robots.txt"), + "Normalizer should fix change a malformed URL with three slashes."); + } + + @Test + void testNormalizeMalformedHttps4SlashesIsFixed() { + assertEquals( + "http://www.google.com/robots.txt", + HttpRobotRulesParser.normalizeMalformedHttpSlashes("http:////www.google.com/robots.txt"), + "Normalizer should fix change a malformed URL with four slashes."); + } + + @Test + void testNormalizeMalformedHttps1SlashIsFixed() { + assertEquals( + "http://www.google.com/robots.txt", + HttpRobotRulesParser.normalizeMalformedHttpSlashes("http:/www.google.com/robots.txt"), + "Normalizer should fix change a malformed URL with one slash."); + } + + @Test + void testNormalizeMalformedFtpShouldIgnore() { + assertEquals( + "ftp://////ftp.google.com", + HttpRobotRulesParser.normalizeMalformedHttpSlashes("ftp://////ftp.google.com"), + "Normalizer should fix change a malformed URL with one slash."); + } + +} \ No newline at end of file From f0227a2a34b28cd1e2baeb1d7386c88fddd9196f Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 4 May 2026 20:23:32 +0100 Subject: [PATCH 04/16] Revert "fix: exit strategy" This reverts commit 02173bc1e2910e550e428701a8e291563b67436d. --- .../apache/nutch/protocol/http/api/HttpRobotRulesParser.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java index 72c7280947..267b8f059f 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java @@ -196,7 +196,6 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url, LOG.info( "Failed to resolve redirect location for robots.txt: {} -> {} ({})", robotsUrlRedir, redirectionLocation, e.getMessage()); - cacheRule = false; // TODO: validate with Sebastian break; } @@ -209,7 +208,6 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url, cacheRule = false; // TODO: validate with Sebastian LOG.info("Failed to normalize malformed robots.txt redirect: {} ({})", robotsUrlRedir, mue.getMessage()); - break; } } From 26e3a191ef675b01190718f0a8e88bcda4034c88 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 4 May 2026 20:23:38 +0100 Subject: [PATCH 05/16] Revert "fix: apply URL normalisation for robots.txt redirect" This reverts commit 7afb83504eb414a695794a24f99d24de195230f5. --- .../http/api/HttpRobotRulesParser.java | 33 ------------------- 1 file changed, 33 deletions(-) diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java index 267b8f059f..4f4bd99774 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java @@ -84,26 +84,6 @@ protected static String getCacheKey(URL url) { return cacheKey; } - static String normalizeMalformedHttpSlashes(String uriString) { - String schemePrefix; - if (uriString.startsWith("http:")) { - schemePrefix = "http:"; - } else if (uriString.startsWith("https:")) { - schemePrefix = "https:"; - } else { - return uriString; - } - int slashStart = schemePrefix.length(); - int slashEnd = slashStart; - while (slashEnd < uriString.length() && uriString.charAt(slashEnd) == '/') { - slashEnd++; - } - if (slashEnd - slashStart == 2) { - return uriString; - } - return schemePrefix + "//" + uriString.substring(slashEnd); - } - /** * Get the rules from robots.txt which applies for the given {@code url}. * Robot rules are cached for a unique combination of host, protocol, and @@ -198,19 +178,6 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url, robotsUrlRedir, redirectionLocation, e.getMessage()); break; } - - if (robotsUrlRedir.getHost() == null || robotsUrlRedir.getHost().isEmpty()) { - LOG.info("Robots.txt redirect resolved to malformed URL {} (initial: {}); will normalize", - robotsUrlRedir, robotsUrl); - try { - robotsUrlRedir = new URL(normalizeMalformedHttpSlashes(robotsUrlRedir.toString())); - } catch (MalformedURLException mue) { - cacheRule = false; // TODO: validate with Sebastian - LOG.info("Failed to normalize malformed robots.txt redirect: {} ({})", - robotsUrlRedir, mue.getMessage()); - } - } - response = ((HttpBase) http).getResponse(robotsUrlRedir, new CrawlDatum(), true); code = response.getCode(); From 2a297bf113945eda328ddddaebd30046b3c82ea6 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 5 May 2026 09:38:37 +0100 Subject: [PATCH 06/16] fix: cleanup --- .../http/api/TestHttpRobotRulesParser.java | 63 ------------------- 1 file changed, 63 deletions(-) delete mode 100644 src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestHttpRobotRulesParser.java diff --git a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestHttpRobotRulesParser.java b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestHttpRobotRulesParser.java deleted file mode 100644 index f03ea600da..0000000000 --- a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestHttpRobotRulesParser.java +++ /dev/null @@ -1,63 +0,0 @@ -package org.apache.nutch.protocol.http.api; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import org.junit.jupiter.api.Test; - -class TestHttpRobotRulesParser { - - @Test - void testNormalizeMalformedHttpUrlOK() { - - String url = "http://www.google.com/robots.txt"; - - assertEquals( - url, - HttpRobotRulesParser.normalizeMalformedHttpSlashes(url), - "Normalizer should not change a well-formed URL."); - } - - @Test - void testNormalizeMalformedHttpsUrlOK() { - - String url = "https://www.google.com/robots.txt"; - - assertEquals( - url, - HttpRobotRulesParser.normalizeMalformedHttpSlashes(url), - "Normalizer should not change a well-formed URL."); - } - - @Test - void testNormalizeMalformedHttps3SlashesIsFixed() { - assertEquals( - "http://www.google.com/robots.txt", - HttpRobotRulesParser.normalizeMalformedHttpSlashes("http:///www.google.com/robots.txt"), - "Normalizer should fix change a malformed URL with three slashes."); - } - - @Test - void testNormalizeMalformedHttps4SlashesIsFixed() { - assertEquals( - "http://www.google.com/robots.txt", - HttpRobotRulesParser.normalizeMalformedHttpSlashes("http:////www.google.com/robots.txt"), - "Normalizer should fix change a malformed URL with four slashes."); - } - - @Test - void testNormalizeMalformedHttps1SlashIsFixed() { - assertEquals( - "http://www.google.com/robots.txt", - HttpRobotRulesParser.normalizeMalformedHttpSlashes("http:/www.google.com/robots.txt"), - "Normalizer should fix change a malformed URL with one slash."); - } - - @Test - void testNormalizeMalformedFtpShouldIgnore() { - assertEquals( - "ftp://////ftp.google.com", - HttpRobotRulesParser.normalizeMalformedHttpSlashes("ftp://////ftp.google.com"), - "Normalizer should fix change a malformed URL with one slash."); - } - -} \ No newline at end of file From 85e67a7a25d3a1f7b047c9a4ec3ce5b24cd18773 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 5 May 2026 09:39:07 +0100 Subject: [PATCH 07/16] fix: remove assertion on the utility class for reading segments --- src/test/org/commoncrawl/util/test/SegmenterRecordReader.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java b/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java index 62057f4e17..5570ac54bf 100644 --- a/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java +++ b/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java @@ -28,7 +28,6 @@ private int run(String path, String url) throws Exception { Content c = new Content(); readers[0].get(k, c); assert (c.getUrl().equals(url)); - assert (c.getContent() == null || c.getContent().length == 0); this.content = c; return 0; From c08944d67c21aedb0cd5c56c1d7e369a3620130a Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 5 May 2026 09:41:14 +0100 Subject: [PATCH 08/16] feat: add a test to simulate the required behaviour --- .../org/commoncrawl/util/TestWarcWriter.java | 46 +++++++++++++++++- .../content/part-r-00000/.data.crc | Bin 0 -> 288 bytes .../content/part-r-00000/.index.crc | Bin 0 -> 12 bytes .../content/part-r-00000/data | Bin 0 -> 35703 bytes .../content/part-r-00000/index | Bin 0 -> 225 bytes 5 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/.data.crc create mode 100644 src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/.index.crc create mode 100644 src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/data create mode 100644 src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/index diff --git a/src/test/org/commoncrawl/util/TestWarcWriter.java b/src/test/org/commoncrawl/util/TestWarcWriter.java index 4f7344010d..c4bdef4646 100644 --- a/src/test/org/commoncrawl/util/TestWarcWriter.java +++ b/src/test/org/commoncrawl/util/TestWarcWriter.java @@ -39,12 +39,14 @@ public void testWriteRevisitRecordContentType() throws Exception { ByteArrayOutputStream bos = new ByteArrayOutputStream(); WarcWriter writer = new WarcWriter(bos); - File segmentDir = new File(System.getProperty("test.build.data", "."), "test-segments/20260224170658-revisit"); + File segmentDir = new File(System.getProperty("test.build.data", "."), + "test-segments/20260224170658-revisit"); assertNotNull(segmentDir, "Missing segment resource"); String segmentPath = segmentDir.getAbsolutePath(); String url = "https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"; Content content = SegmenterRecordReader.retrieveContent(segmentPath, url); + assert (content.getContent() == null || content.getContent().length == 0); URI targetUri = new URI(content.getUrl()); Metadata metadata = content.getMetadata(); @@ -80,4 +82,46 @@ public void testWriteRevisitRecordContentType() throws Exception { assertTrue(warcOutput.contains("WARC-Profile: " + warcProfile), "WARC record should have WARC-Profile header"); } + + @Test + public void testWriteResponseRecordWithMalformedURL() throws Exception { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + WarcWriter writer = new WarcWriter(bos); + + File segmentDir = new File(System.getProperty("test.build.data", "."), + "test-segments/20260505091103-malformed-urls"); + assertNotNull(segmentDir, "Missing segment resource"); + String segmentPath = segmentDir.getAbsolutePath(); + String url = "https:////sites.google.com/site/lebercailgiteennormandie/robots.txt"; + + Content content = SegmenterRecordReader.retrieveContent(segmentPath, url); + URI targetUri = new URI(content.getUrl()); + + Metadata metadata = content.getMetadata(); + String ip = content.getMetadata().get("_ip_"); + int httpStatusCode = 200; + + Date date = HttpDateFormat.toDate(metadata.get("date")); + URI warcinfoId = writer.getRecordId(); + URI relatedId = writer.getRecordId(); + String payloadDigest = "sha1:abc123"; + String blockDigest = "sha1:def456"; + + writer.writeWarcResponseRecord(targetUri, ip, httpStatusCode, date, + warcinfoId, relatedId, payloadDigest, + blockDigest, "false", + null, + null, content.getContent(), content); + + byte[] compressed = bos.toByteArray(); + ByteArrayInputStream bis = new ByteArrayInputStream(compressed); + GZIPInputStream gis = new GZIPInputStream(bis); + ByteArrayOutputStream decompressed = new ByteArrayOutputStream(); + gis.transferTo(decompressed); + + String warcOutput = decompressed.toString(); + + assertTrue(warcOutput.contains("WARC-Target-ID: https://sites.google.com/site/lebercailgiteennormandie/robots.txt"), + "WARC-Target-ID should be normalized to a valid URL"); + } } diff --git a/src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/.data.crc b/src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/.data.crc new file mode 100644 index 0000000000000000000000000000000000000000..91d344a940275f000b67746ab8d795f4609163f9 GIT binary patch literal 288 zcmV+*0pI>(a$^7h00ID>gXlvZ<^>guv*xS83D_t56@+=S^ppzT#fb0y>=mrb`7lw5 zSqPgB*+zu_0%ea6+dnQfI|x!m*h5DC`boE-f=20Y1UFur2QAHGu5j(Vzl8Y@oyw#E zMZ&yeK^S-uY8gs{xuZU^t<(Qe{2T{y&)|BX`s literal 0 HcmV?d00001 diff --git a/src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/.index.crc b/src/testresources/test-segments/20260505091103-malformed-urls/content/part-r-00000/.index.crc new file mode 100644 index 0000000000000000000000000000000000000000..6a28dce48b6b66ed80af5ceb0d915ac2ab020550 GIT binary patch literal 12 TcmYc;N@ieSU}E6vdi4 z*3O97@#M<=+e#t|5SX?OrnLHY`i5r4v}XE7wzhV(=C-uT#%@jo|HBg-XD355S~~|@ zCtE{XD_S928z*BMCtx7T|DnzQ-P_RC+Rnk)(UDfz*hJsi%1Ox9$k-6*r=;@1xUz-b z!CfQ}oGvWsl>-5RBK}kp;F~!)*>Td*nVLHp8yGtn>YH29nmEuo*c#Y6Inp}0Ik{z4 zDQn4X&|}abTj|AVuig{a#8lQLpsmLzNmkPH?<)$dHkMGzbQiDv;u@rF{0RLtHX)-y z32sg)Q(VB#xpwK^-~K6kd5L90<`@zuLPwqo(h!(aZaGo{2kKu5@{}}{&D+=6E}Pd| zJb!m_@%~*6l@tsh&}JlhS2&YzIlD##z#FW+I3AJU^DDNyDvDyHl%Q7E)QmYt-B1@^ zVd9JvZ|4S3q<}G!w;)-Vw^rfqf}ar(VG4B=$K&2pLD`|#4;uCwahuQW0h-JNM5}la zNv#{_vDPoiPGD3_&EFfpLH~v&WX5eX)tVas`_fG#*kePl<(J9o5HG3RO#?LJ=#yWy z%9K4Gho`tnA!5@!CyxaN1%qfiZ$gNkko}L3W3>f}^GMJ{ts*%y8B}=ps3AX7Vcj;Q zm}uK(_Ol6Lok>03oHSs?tPSX{Jbp3XP2&iFYh}EwX6%-$R2f#Bt~ML8+r{mw#{tTEx|?jQ%m!p?_uVpvL@D*b7(f2EksA|L-fjZjEDER zrS_s^=mM=bEHsx5=p^qEqRO}M;UiY3;1w-ty&vntf^{!ncnjyb@6_296{x!4XvOf2 za33u7&3$Hu?s2*8wyJR#`OyU>+K|hSu>a2#WFd(YUpn$L>iYF@$qQo}oz5l^}4L`E@bZko%td9hoTgeM#SzRWMy8M`?4 zEfUy%kO)KKi%)Q@4>x!$fiUYv8omsPvG3uJ@6Ul=3UlA=cmq0vDc&*B7xPpY^d}2X ztir`dB*hfRqV*mx{R5tDq0HvTEr9>G-9lSwogZbWthwEA*qhtcWTJp%4Gf81#DrYl z9%fwJtVB*)oR)y)V%P}jM(lxZVG9hUYN7jqWGtKK*>Hv@DQv~%=WmxnPvz>oPN4*RC3oK+`q8|zmLtVul z&i7sInEl33hagWnn>0x!ej2d8Pk%f{nJ|XkSx1;LOcc6E>YRG6B$umIlX;t{6YyRS zIY%S>YdtSp?znWC9QDvk6nPd}9gZsisRwto$skd(D8Zmjof`A?u0sHWVHz6^PB>nD za1h#34Mq}fYSm^HoA_=!T@c}Xq}7nlPG<(aPKOnKU{UI&(y*&o=_;}bX_V}l?4~f$ zTT%Y3nF&J0X@m%k_#N4^dL1_t%R^(O1=2g!C6!H5Q(`63L4b?W)~BH$C=k&i8k9h7 zgY~hP(83nbH-%!54feUd&S9zPw%|-NaW+mwUWx~MYmuGm7e;4PK#6W&t{3lQZuZ)_ z-cjo8StTVUU`~R(6GLDcqQA@4jFL1-a%caAk_DDyBkO!g>?)3sP;ga}kT>p9+5p~g^&g|fJN`Y0>YOe+;PZRb-OFq_rFjpO2<ls-TT&F|ktsfeGOIELKHYqiR+y!wz8hLbL?wqv(i=nMoDIQ`=Y!b1Pi2ge_QprdpAkqAdx zQ(Ie8D`VOpq4}>0I;;OnHvXd^#x^##4%YfMM&`!#oqs*-(*CW=*G^|sxd+agi8o!CtpW!e`V9y>{w^KJO_|Mj5IY6Y z8xnhpcc~xItY0mUS-zhjzouBST)NT`*BwVfUa^&JhjdDQ z10=KC1z2LTgm&sypfqL6ke7OLAg;2N5btIMcVB$oiJtOnGY;0XnjsIDu7qk)mtD`O zq&CPj8+)!xb!ub3H-uK7xDZoz+P8{iJJ6PUv=7VMQfwA)J$35O8~tB5*#??AdmoHQ zX8(@dv*F#z?-XX@F$WN@hN`VM$sT}z{iRiNOt+6saIoKH9x(w*Q`m!$ zb=W1Im64@QB53+8tUrR-6_C3T7GJA9vYA!824_faIC`Ti5$xcN!DYjwZpOM0Z@yuh zBv`fb6koeQi=x~8h)*G#KP%O(BJ6S1yo;iHl|_E6dMJrP+*O)2p}a?Ng(q8v*?2Co zKN7KH4tcMOdb(}?$)bD9(y#Z>{WQhpCUtV3 ze1}qk=#2Y7HNO-Y6~Pbbl9PDGUvCH>S7O@yxKv2bv>N#kEq~N~9lZ{1yRfmxWs zhm$P`9Yr0|g$RCJ?=yaI_t5M;-*A$9x*W&3c=Z9Qt{#wVlO#_P|7twK)6E- z?abTG(;_d8sCJ^S3pt;a;wyx3VPdu+;7XbjqM&GtaY9$bX5L|yHr!!EYZ9S zg}*nQ%}n<0xYR&%TcC35eMg9~mWbs;auV|h>l4x|F+EqkJUty&#~wQ%4ZD}oNtb1F z@?~~M#sxxWm8n*2Bc8NS_HS0#KWm{ z^7)689TliP(($CnU`394dC9CPmG%WXLW5Kc`uPV4vod0Nd!?GW>qV-hj}RuzxZK2Y z`8T&_;YO^YHy{?25+2#>Hg+i|?sSctnm}~lriikS9a7+X7R`rplIkX|v>+%&`?K9>cH52o z#^H_&wm{RA^HWh?ZaTl-w@*P84pW?_LdYeiSYq-JHd^^k*+~Be@~J6!gA6{->Y=qBiOQ*4_1p)7Q1D*mOgGCJ!;@L9VUK#+9|9 z<4Pmm3#+4xez&rs{PxDFo49#aaG;Vi90(=7k8_kbb|vYjoN%UC0Ac}Gf8)P3b&Q^z ztcrkGv!d#|D~qx`@WgFKlhbB0;#k}DYXXV8Y)YumZsUr(@m^7Qf$e*bC5yc{wo<<4 zO{h}L5QH1D&ac(l$OI&{K@}gX#uN*(>CPsGa3nzWw@R_-(66HKy(n*c6t`)pZ<>Ld zW8MLDUV|6G@tXlThE2L|I6JMoHcYROfJJH+B?*fs&81U@Mhuao$WD=U-Cu+|dDgH}gavLE+(G=e>wwH}ATC3yX_JOosH zmTj~(p_UNz`_$AjoRhIkF(5^XVSbg?sWT?Mu-rN==S@$KhB{#E9I<6dXN=P*(jASO zGBongXb^LPIFq#O>w}gk-2vZF7Fi>FJ?OND9Pbv*FQ%uS3YryY-PPUQ)eg3wAuvzQ zDgKxJXGIPw9|P3rK7WqG^>1WlcZLK{8{)~WY5i4?V}oIUM{>L&zs4~=^`N9=W7p1~-E7@K#IU+O(} zOVTpD0=nE{sa|JMSEfB;jYy%v327tAX)%}q_M*~S-KI6}`OL$UgVIvGWG+pl9l$rj0pg$W>M8b;pA&9Z|0TjHLWXw{u zqV1E^IMviv&+v--paF{*hOvLhJ%0<|a<77fMwTULqfv_@FfUW06y>PoO054zK%n#l zs9!}z{A^JIcQbH4rUo^vKkYr;S6_8%Sn5qPlR&mbUBDf8{UL1LR*{#lk4ZrR1w%~$Zll z0b>(%-WN3bkkSWjBaUhgGT2F_gMrFVo6*wmyyigJbUSWFW)J_g#go zriUe44TLohj;A-A&|3VCsQ#-L&l9%x_Zz6-F%$Hhc2)GyV*0T1MRL7P+C@hX9&IaU z)!ZHeDG*pFj&xSwzm&H{u^zJq(_lX>zS^&jOE&zdvTXkRX;ArOQ6154{Yc(h($ewf z{a%^>rqs@tiGA~-Ws!)ZMO|Q~nP{d_Q8bcav(iYeA^r?cQkYOvbEnX?rLz?1-N3Xs znsKVz6Z_{CyEW8_7!ZR0w(ZdWl;dty()Ha~8=nI?_Q_0CBapjyuDMXA8*)hxokcns zrQ|g>utS~+t0}=YKC|$YMLaLaW)W5BJP_At#9pKlvEl8l6IaCT!jmx4>=gTRz`JC)jeQ0_ zaq+kc%CeV9rV8($2WGXhMB59nA&ju1M$^DIxO1@**^e~uK{T1sgJNu9tO)LT%m!-4 zt4S-=qqP|Ha5T5bTK{n<;E0C7Q@!4oETD#;Lvu4UVZQqu#S}-})3i2H4uHk&jOKI~&0H8uVDzu-tqk+@h82vZ%4e zijaA=ZQr?v%bI*`@LKw}woNJLeEaP(7?Xp{bNg;S&B4{J@xJj_hqJDA#diC)#vf?U z_u&5f@M zG)qjSJg;wmim}o%^;Gh%5}PtOAH0;(YYA{px{50EQ<(vWHmh|6FOA5EoJ<_eQ`x+$ z63h6Fd1-ld-3;S0Q&fFrt=v!|igp~T3@Zlt1SO;%KgO(+L`xpc+AKRgDo}8>5Gf?3 zNm_7~IVmKkiAK3aMK};EundC}f;6fhM6wo;l4x#)YTG2RvqeR$p`EW`l8OqGlL*jn zKj3{Oj#(oxc2qRTQ2~u6nKhK&ewXoj2&EA}W5U@>#)oO32ZUaEW^G?a7tLn44S5cr zbeT0))FvxCAW&x1RBWvV_E~r^dRu-bNLC<`BmpaS`npJyJhA;=R^Ss+%+zUMfC(ji zDWfqky^-Xc(b_zFZCyltwO0DHU-d6~r>052;Gn2_lJj~XG~f+IW&~xcky@)0As}kd zg9c47a*+}Z6Mmt7=}gWvQ1}t!GUF(6VSbc;lz*%o-`|A^=-wtpVAQM-jQXUpl7bUH1PB4rw@ix?Qaq+A z8Ff|~L2r-06n>&n1%euLBKOPkXPrQDaUjKeb1Uo3$4DXd*m~t+t&)WN?0iuqX;i~JAi)v*Y(O~~7AV3bYFbMKi61)C>hX^>r^88nq~h&Rwi zSusH<2;P*P*qZr`lP$-BEuafA;nGTRvC|gmQZ>>J3Fn=onfI`S7GXxy1nxycqu?oX zH&8+kv5dOF^4oXsnbR=^D-mmHj8#(rhgkBH#N8P9p#|cBiVD0R2KeqCiw=5hIa0l}lc6vF zxAZa{LH-Qm=%PbmgF`Neu@rv58xRFO1{t&loZnEQl5mwT(X0e0KtENW)QX~+zkDoK zL6yZX^ep+HfnU~muT0WdJqE8pciKmAr<@U4SbqOmps&j6e|&5Fa`sroy^bNv)kTE z3!q#uK<`Yg9Hq#|qO?mp3dmy=GR`oG1!&On1L})Hn?jUmeuGeOuc(HWz}@m%4ciB+ zW3dy-podbY6#Jbeh%y*48PTti2s^;ABKCuDh%9ES2DA6?LIqF94Uvl&)nXkozWnur z3nCIiHwWy1%oC6+mLlR9n%38aTIGWS2H`Ejg|QmO;V`i=&C}zGHHZ`1^Aghr2^m8z z^&Ip#o%~SF2AEM26huPJ7z8dtO$4b|{Oz)S3_PWKF}%r?3@@c^{S!q^PpDmt1xPno z4Hzw)4JW#@#mM>{P7ymk-k12hnV+~WR|q4)p-_IJmeiMi0Y%)@d{>2rMg&cWvtcHX zS{1aiDw8m1Nrhn+AZ~1)Jn0rbN?HLK4|cSH7OGIBiDiq4bc=f+ZM+-_xJ4nw`=cLD z9RvV#FWd^WN`}ps@(>PXuBr6vl&9toEdBelJC;K#^kW~jG7KkZq$I}( z&tC<}BN>c^Anp4F<(w2-y#{hxnW-P);H%EjxL0b7tpE?z;G~LxL}f~WX3j09j9}ed zT^L$)TABuuCd?BXjnu-gj^J;vxoaQD2?doe@IVo0)kp%+;14C<0Qn~!0IQ4&Uj;5? zMg!!hm=Vm+F6KMVNNdCXGvRiY?RkZvnh<%66N@mZKuJxNX@Kf95RIQ3+yOmt{ikpO zQ9p$4WmQEZCa~tfqyUygD@LP~HFiA_$zQ;uwEOdMv3U%*iS~6iP=$(A#!gzy2mojq z(?dA=N{V`x=+R+^K}UW2{KC*CxCokbwwuY?R(Zw2`QD?SZA^qp4X06wEkAiQ&q!ca#A^kv z#u%u_V4IBDKg3J$CnFtXjT-$Z2FbJGu&jNh(B{AUtcTVZ!1B1*}d>xK{>!HEx0}D4Yz;JTi}o2#AAA zJD>}*+!eI0nv5MIM%qdii3iOLB9AhIP?@7ihDD;6fF@ui-0U`}0_w4!35_+6-NF~< zv`#lO3^fUzU`hAlhp+{gh-L8;Zvkjf&VWxXU^23>>iA;)&VW``Gs(7+dE3RFbfr^P3onavcilog~*fLA}TFB1~h1lELB^XkwUFp zsPah<&={GYlQdl}*sdZZ79s+hLqr)pvSawgAQqJf%%e|}R2X~YQ-2s8Kj>8|7ND+( zL_=zWRtg8q@Z-~$7hyCW8;s-186N4 zbK9}q1De4IYiAQ!4E{4Y9)YD$kWoe;386D{ixn_%*J%=;qt?e?64YsT0zEQ&AC~(R zi1=JDf}h6~>p#PM2l!jz!8uG^P8Q`7?|z3wTs3?H+S>OEo~_RuY3Bo|=V*h=G`!8w z`T20;a(~07vt;$|G`pkc=W`K}1+}5B-fF8liD^wtEb6nlxf5tDVxj{b@@;#~ko4V- zX>F^6M-JbUq>X~liuQ8Hq}~;;-}>Bb#P4FH$a+PacUx=JC}*qMI@oiBW>nQR#+D0R zV4pd`>u+ZVPIlIM?B`+!pWI({HRJl{oXopq@4rB6=;s~}nWbG;J0lY<6^BC`T%%W) zc-L&}#!MT}rygWh7pE{r8|wj&n;hya4g22QjtK+1VD*-;p0WH{REgxA~V*+5UeQCHLgI#xgFRZUe8B}fji>&sw9^V)4w4v7OTsqITDz}K!E?~R< zNXa6~!*Vv`a8qC9@EDt3$aY0U7DB9fo|DPR9d7^AchMWUbss4&Xyn#zR2gc6_u}CG z!=vdikR$>qwzEFG5f8Ws_DJ01X?ap(YMOaw9rzr)5PX^zG27`l+y@>H*dP8Jn?i=U z{pV-j>e^stY-t&D9_=NT|6)a*PF8iP<09C!l)HbG7TX1|arL-zyf8JU;n>-#$RV0p zlCgd@>T#SOebV-3x55^c5?^9|eS(`VHa5`j`)diS>ajVf=^DFdNo(tgagEHC>oK5L zXMn<;^TZ~5yQM*zhpF>;Wa!f{bGBBE*TyQ#`1t+8X-NXjY0}1=X28n%Ii_L%iWibg z)F0+=`=e9y4ktoGzFRu=;%^$f1|GGt*JqxB3>;VYf2aCAp1^AzhuPl0GFfq%_Us(~ zHlF8TI*;;hC`29{5ok^uT`y-}$N1yMKOJP_S#MJcuzG$A$Jpc`sws?|naUPo<5U~h zBknigm{zoeaJuMdX?1J6J2C!j<@d{^LnEAHu0}TW7`ny~>8C!WJzQ6pIZmhZXE8F< z4z0EIkf@I3y*Y&GRhz%_Pc%b3Q_D7Prc_!_2u45M^&9OLT^cOppPo9w9Wm?iTyk)6 z7Y9nn6S|(UI|L>~Dq8>UxmsRrt;;mu@@(zgs+}{Fbfn`wFE3KB#3$VkPos^$>T_&z ziTgJ+NI$(^M&EZDmTDwz3!WPIy)Yw zwjR-3a+^hPIaq$mc9>^j;JoVbsH?kkul9Gx%;Z>Fl0nLnzP~WY^Blu9BsJV;El1Zc zP8jK$Z6(cm2Rd44ddb=sCC3p{j(?aoUz>T)S|(wf6N-% z7;3lJ{Bh-gs;+=6THXsE%9W}cWuAJ*wT?LFR=a|NTlS_I#OCbA%S!glB?IH4<`7cX zp_TP0vz77j*S_j2n`9Pd&zYDBt>dHZ`1mni9f*e4+DOhLR@LleE!x#&?;WwrxA}61 znCbq|=4raf?KeSIF7@u#GP-F(oLNS2kV9pFcW3531!+HEp4xN)&wsHrJ9|}>s-XG9!p)P?8^<=*Mf*dVoiR!V>PN+|jg_0m4RV@X$LBb3 z2v@jRCGT>}BB^7%D#+huo{?7W>P-l?D%#NXUe}}@qFoXD2)5C`zY5xWGRAm0|ENaU z*(29C+1YQbnLjVrgWk_nHEcf-q;D>PE^zB=oExdkZfNj!aSXg7T*b_Ic1VS&7C-4= zYqG5~LXg@YH(ugorb?|{M2Omhv>zsp^#_A;JaOGDEP zCzYkxvlNe0yyv>)-|~m`w2RNfjrF;LfN2A@wuPCd^)1I_OG(q~hsOQ9F>Bi_*6IE! zY9+*Ewtx&XB-GZHEe)+mJhk?;i~Z$>?(SA)QJT}V^?V;ZG}SVsrk#bBQO7a(E#Tj}O6cRGtORGq*CH1xVjp z|E{cAbLXjC@LJJ5cTPnZa=7U{p1H=P6J9STEW59W=yPhW`?n6QCZca>Zt*OkcSex7 z+xncBbp&=F^mn>$ejZ2oT@*zEOYg2JS(k4k;c?h$2c{zEyVyiZ^InisrT6@Iy zDFJD)6C8c8H}^~fxiEau0k@i(FZf978_43f|QEZ68RH+g`t`wd1JH%kB)+-~z--HEE z(1drzw}0v3!2+92>XF^ghMTif@_hYTf1(WX_St5}I(Yjz-7!W*>PF0oYMa&1`|MsK z6)v_F=KyVu>O-;m>-)1uB6i}o<)tXo&$fId%Y`d+q~&pOiuDoD+S-k$)rii*M6v+< zc@_f`%6l^cQk!Fe>;8T|aMp5Z>&(DY|MHx`=U=vE z15AH=V`sS@EzO-$%)0K^LR%Y3F|N6>X(1<%j{D6>R$2e8xAFFl9#dNCt?bRKx0Yr5 z84m4@y(ivA$s? z?Sz>DA|I71Uh-fQ^8#rC%r86}qqbYVJqA7gFT@mp16;s%E`%W}>)2^9gU=)9mx80F z{(V{dC2>}d1#CL?w~DrZkN%!-08ph;c(w!s#SjVwlQ)KI#myK=nJKP#id#Gw zad$l%z%+%^o4y0xxs<)|8!cbV0zP{0-#`6cU|mP_zwp(s@777(iC1`%b?8s5EogGh z2#517r));9smoB_I1-R9}$xWUj~!dc_&kH3F;ONY+I-LLJ%; zx_?GGs>EZ8<3}t^V`o+79BKxeWO6FO7g|5sOG0wA;~1HepBpR<7_%@ZKgWZ z8in9DIV;LQ;# zN_1cdeZ@|#sD&8PyKHGdm9}v9o$b(j&p$eroM{r@@ClJuxT^MYaAOi_YdQ*e=N91@ zej1CEe&34Dtp2v=VCBP0aa?wt)C+%F6_#a)-Fh_ks$vxpIL`Z6cn)tm?|*#?cX=JO#W5AP@$5--An;$K=nmJpzblmG-3oUlye73qU|!|NB=kriC9 z{-GI{*pud8Fo(xFX;qOt@7KEKZjKOykbfy8QjdF~kd$0T$@abY8@h3J?}q#X2w5tcpaLxV& zj#aX*16s}v(5wg%uwJ<1lpZnXwu@L#X-1x*|GNr>syMNc0YZf)se!Z!b1}En#(05n zJBq=;6E(=9Cu!OxJKy87=ur5vcIs|a@jlS1f?EQRdzL1=ja<}~6rJ9*{Lgk@%N=AV zSiR7E_I|oSrRTf1k%ta%rG<<$T_A3gZ(Xx`Nm8ev+V<_fDS9QML;*1POG70@~LurAY}0S;TMu(9CcL=jZbgQ z4fFDuPme0kBHln&nP5YU;^lFjjW6QcD)ZxnjWugVGT&prsB*?I1bp&mk~_w^xk!U6 zS$hwId!hrhF80v36P47|gG2KCcOLiigf~8BLg<@JKViC7!xQ5A_Nqoa$(!z&ff4`b z7-~sYy}h+;Hrw1tDNSZ{Gs8TO_RzD4S3B!4R92~(nN>oORdfZr!o|Gzt91GDDl}$N z-zjA@(&#DFSD<(}#0ab*-F8#{Ivv!$-jGiueK;>KRb>8lNsxl_hi9NdTqLT7*kmlHu%1&L%u7TGcPiJ7-tU#dDt1A=vRSCag;Kt9JTEU zQ(t;1lT0YrpW|m?hVKg0m%N1Y#tgZ()d>IGEXQ^4>REphb{zV_ph$4omp!RdYWB@J zsp@w7XOzMND>_lcnSJdYuDaEY_o&a280(6bbGLy5X-k2fVYP2-m&9v2Bi5-MMOB%* zB4VceS>~@8%1RpE({rU$M7mcA5qLDjn5{C^mo@9W;2>iaw1TpI>uPa&W>6B^zi-Yc4;LsKvfM^NYB7K{Y-N56tzN%>6Vx z98&jhz}Qa9KQ5c_hdzSR$nm1+4PI!`;a*2R&fdGez9iiZuc8Fa`wM0@$8f*0LxM4; z(5EKk%V{he%$d)dxxuG=ch3~wfpK|@*dGKnij$H#K>0MXpi_hQHO*w{v^gh4Q^nlX zD__bHH%MwhCWCu@P#@R}`lXVhvp{y*z7TSOe&;%`vTJ8R*gz~zgz)=!7!DH-<>SpW zuk+VC1va5CVS)n4g91ouuUGKafOgV|OHq`|5H&7)&CrRm#Cgin&Y*wYl(&gq0K6}} z{y{V^8TM_#qZXAW!hHF-+m+>6p%7%{SF2cW+KnVi`tXXqF^M6A0oI$0*TG-mHuFq? z8kD1S%6jc74PxXQ(KErgm)n;D%EzdMk05DqYIYQgcM8eh64>*GVmq_`dgS|#F`R{< zFRczY%tocdneuyiDnVGX4H6HS?|yF<-WyJ&Ee}RHwKWm)IgBWt??ky`E&soOMLUDE z$B#vega!IhoCqF%IXsu|09>>Ckj$LFvC!uwRp9XXmH%R;C2~}wrIGo5(f`Zry~#)W zy>9EwmLF!>=!dL@$2ajN6Jwu*WI4LO)H1(+X=J$8D@J^GVh3@RGzBH? z+Ad{o(PoR=swvo&&6~~(In;#b8*6DV^5i|Ak$dO6%#m0;o1FIA8Vm0D6I9&8hcK9R zby#*7L*Z>?o+k=SM(9PButlweJumleUGr)6oG%UvCjJ{XY!^tPhc_WV?VBD6`{HH= z8r>A*JFHMV_CvS#8b0>UGFzDW7H=0agzKHfUu5-54vhr5{u@?&3W&mIhxj8zd`i!> zN3@oGaii`oHQvX4KyXVtGc8Z4BJ7Jt??LS0I{wClfn z9nKPX^ZQfpXZI01;8b5OI7+Ov3XU^{H?Fa}vE(Kf#hktW0l&V8P2}f2%XLOy#D`lb z;RQ=-+)YyyU#!*n$-uQNJT%0#i*0AQ>5{%g4?PIiC#+pBWuz~MjTG;YrSakAboUPi z>|Lw9zeumIT3Dg+Y0Grt^*w}JBf^iT!Lj;&Pq^XfC|+d$)0x*p@bSfPP0!2Wpu3gG z`(5=S@dT%ax3!KX=h`8}6|aO{<8ZQB5ITBq#Ny3b@<=bki!_i}_fVhxFa`Z~0~ z-T7q8`8YLAQ?>R!HLV26ZhU|@o5oc{@|5+!fbu*S+!lx_27@j=j_nai5@iI{_!z5w|BP| z#g{mO*W7;Gm&|fEkIr{I-@)||oJqn@$XpC@nKnwhhyIF_?*i zM~;t8PR5o!0U3oxb9=$Afs*Vb_n15DL5yPC!zk@|W_x+f4!491CR*0Wl|5m>(Z>xJ z+jAf-GrMc#z<>y28U@<{>z%6)O2A)a&T)Y2_o>Q_nQfgtlVpP4yn>?|n`r2-Kbl&% zt{X&c{xV`*Y;0LG+>P}k+yl6r>yM`;E^;xC3!`3`ZgEL`AMcYwGHAmHeBJLjh}vRW ztbFee9Pbx#Ew_F8*=1$F7akzxGQ`FvcsnO~vp(Ptyt+>n?nHCGzndY?4AZ}M%@)bs z4WrdQY3F{LmKew2A-sx>Zl@OW(Rh9rVEM#Y7}2=x2!Ep;$(V5~M^1;v?r?cYb3XzkFYHx1v zc)kc3*Kv2e`tUv+YwP0qBcB~T^$2R~4(hYM%?iIpZ(LzD>KL8(s(LaScF7iBopX|R zw8LJ#KEK2cH7PrG>M|6gjPRLmrjwt|_f6*0YTxMK_PO$sK=bDaeYs^?b0Zyj%SK~e z!PI$!-LBnp^G)Wv`|xk8=-G zdy>e5$*oZtypl0w?PB?WqM|{IIiuLzJYT!;% zjmw;Zcp(2fc@Ufuw8;8BaJ?3nz^6y|VFQi)Hc3#vPnZsUD z`C{07Od#C#_UcnN0Z!0d;AS1svIh2C#^7?T#Gu|qs8!M>!QOhfR89oyv~di;P6f8F zKTz++IW>fQm|nT>KVxA2UpKg&j5_uszQNk+ZeiKOetrXW0oUIp%J7KwF_qe zwDYSloHSbhi$e&ijyDNi2h~Jvhe-oy2Su`>uRB%I3$Y3n%c652=TC$w(5r}|5@u=E zQ3JTN@!KJ^`=r88*9`@oE*W$zM#Ta{m%xqc>X7p5{DzFw448yhawptGaw!2FFgg%h zD@iDI4tmzi96tf$UOOdx7kEx}1eYQvf*&Tq3HT>qo@~-^ZiH!o-VFMCjpQH@gQgrI z-WpbTMHIB3sv0`4qYj+1;ZKBi>S(M9y-~i0-5{+CBokG`A7Q67PQoZ{8o5x-xmyTL zEqqp196DoV#X$%73~h^Pz!G>)$=oEsz-yM~L^7Ow#n*uJ7QE)U=>cU8d(9#oBLq&1 z^`zZBm^%1o*vtWYQXx@|B`||{L(5oen86ni2w{C}F=6ynB1H~xU+JiV`c1L_HJ(5RVM-G) z#9-woy^JdkeH zG26wNZp5;TvS@XZp#?6aS<&KT8iD*<1!UsH4OPOGrr&AikEC3hcbfJk7>+^opKOxl8AwdJ4#~d1p9;j|D+Uj=-~vatQS~@0#5{O0dPUu;Buj~o<9-U zA5tN-+CsF@vYDTg%5*g!8($Tnid~y-3b5o$FJ*cNsB=g*h#PhkKAbfk6OELEwLDJraukeo0?u;BQmMVhpEX>q)ixJN|KT=VdJMl0Qbedv% zDX*7BA3`=j->xvmL}$aUo4D+&It`8|w7?7;FRZ90sMhla#6ti3&*(@|#Uu0>im6=& zp$fSHHANGZnDv+-TK(gFSmD*UkpBm@FkT%Ja%hT~5*-K^b(#uo1{4B9+FSBiZCEMj6_SGXN>j3^6qWBE~s2@{@UHR#tve zZFig0{};L-bxD*!5a11g=WiK0t3eeQ%3Cqni6?|c1&Pwo)+i07S%#?9mjp+kKDj`0 z?`u~l2enfCLo`hUakqzO0H%tdk;(-fsgHoA4HzB{d^!lTTi zHUPpbR+&ZyVWz=DtSMgnfEIt^(RBVotx)QuW2_4hH~!+`VOV)XL#95-@=Ds^He)~W z9sG+V^P)hg52~0)sR@PBAGzm1Na?VX32O&Z- z2yi6;T}WAp?L^?vw?A9-*b1k7;^2w;|1Zd}rStyD2+4=Kz@YMN&>cIzd9WV*+ zle1Y3+3)=b-N(0<^U;OXoPck^HT5^x3y1Ct?3q-zauor$yGQ9gVd4GxL!t14j_)y# zZ@3khD|_Yi@mjEe!L{2pQ)gbjXMg7-(YPav-5lKhVlf~f_mn%T1hw+>z4L1>qQmE@ zm*vfkzyhGX!ikf*)nj|`UG*j1`5C3HjMTW<8`GneZm7rn&#%Ejrs;^XIpn>OMeS*v zW$8s$Qb^7)L9ZH(qe`zDwRu1jh3a3EVMY@YM|Bf#Q(3u)A1Q995dCl}z1Z zdhX?D^@Vh^ZWOWxBOCT=r@}n?^1Qp;BKpFNWyj$V|7M&Qt{X>qU+go7ukTy?>tn!r z-?kOuxd$l28=Wht{VQOYnKwJO=;Ltd`ix;?$oL5ih#j6@`9C-Z^OEBvn=eSh)%%Jb ziCsxQ?bw;K8bpfmL?eN5x%(V*`%H{bcL2pMdWR2j?DY=rj_G{iX-dsaiq1{=vtrkpeU?dP z#gWdPz=nNB9|%~Q+^&tGeb|T4?$xc9AKS+V_lXX#jT3G%Xp5ShZMH4DJ<$!KfL&SS zza@}Ob_Z?|*Uow+7riE<<&XN~UdpPYW#$Ff&UEyb7q{U!9$o0aJ4@W~XfN`>Z+h47 zK{I|m$NYh;IXUawT`CSuIvKK(hi9^En^)}KyXfI<=$#wcF|CU;V=5I7j+{-kI4|-R zf5Ncu^6!#i^&0mjcaxrNGfTJyg^P8bI4)nl#(7?DF6kkh5%M=DCkre4*|rDSC~zbQ zb{`O3L8ib+Ivbl~Bxy=;J=iO4L(YFo?c^TGS1Gojv*k+Qx2vD-$WF+G48^VV_@vk% zOqb5cd~8_;V?=Ha@^N00!u6~l3wgg)*p+aFtBPaF|7~;4&)io|D6yF>JM?a6^MfQ% zK;d^%t2z?t`-t(&d~E~{Y1&m~lId32RAf6N#<6NVQ_jaf6}iL~KIM3pu|RknG77X5 z!Wi85z#+d+Jk@p}sew{bw<})?B6rAs!|CZ3#Hz_5MdP569cpu`DcAW)L-U6VUt}9| zE?=wTSep17vOcN?z9lUmgQd^Sv@jA+6qRTM!pJnhD|Wx1tC5k`*HhqLX*@TeQnVZf z6S?3p^f^G`*HuLpHGWY!VbWW%7fRU+V{0-@b?NJ^R`H?+x8^B@rEpBp@=Ih^*?SCF zM9!gCE$2CDpBRNocbj;otX-DC&kwaom1V-tphe5+$h`&krvAStaw(%1zVVmm#Q&d( zTvqU*k727dL7KE+d*LGJ#OG+F5SpEp+Fb$}DWBzPsCe;N72#UWu-yTISNtGDAifuycq71aNg$tCm9`L6ocGvO~}2QJD0yZBRJ;zy-%Q0)oVQe6U3u(*@=NFrH`y%~xwUQV zy^4Fl7s&Al{!D{RtGE~GKg*^ww6%l$XdGaBtN5(7zqFB#>@$5t>V52BCcR)qd%Q6h zp6~vFA+k%Zg&6|3=pZ9hMoyU}4b?>H+9Yh(LaHFMyjXL zkL;ce0VTwIymVOqSI|!~;B(t?qvIk`E!|3(gOPdjfY!lb1`2#+xY0UrS}-^(hJLQE zB>G(PCV_bwp_C5lpS2Z#*!6%$l<=Wnj1`IxsZ{o9&XoS%?zUdY&Rpzb+(f_}esSUw z&gyoA$H2nDQg$>j{>OSI#q>8g>v7$1#jQh;nc`h}s_xUZ?lc0UcR%Sya4b0pr9dV= zt0V08*vlLHl{^z^cnq9+GYUE*;U3;C2?LY#HV2ud*i4Buh#2PqsgftEyf1{oL3ySvIKxgML|C^U34N1#Kf=G4vRW0xIq+$&r)Y@TPIEtiHD>C35hgxX+wLj%#eW zL9uI}qBnxbmSRm_eswSzhM|D?`KOx4U}>KUD0|RyZ@MXm6N5gDKl?pDFZw{#hzNPb zF+@hsohcKgpl@EqK&WG3k^thM%E}i3(_zs^*nF=vHcBJh0~Iwo;eXHo64C3d*+(U{ z{nXPVSUji+)i<``y9fD$!!;mfi)FXMv=~&4m36pid(R@69B5cP>?ahdAk zNo}dFjxy>Z{X!6!=vPx&S9d){>$C9r<6`K!43h)^1sK;5Py*fA=e9sDygD88Pp9TT zIIRx4Qa~g0fw+}@FGJ$oU+C7V*!hm~+No?TTNja|||+rzO^ zAJQE?s{P76DnYSW0%qY@mnkz8(=LvBjZF*T=z@M7VW$E=1|pPgDKQ7|N|UI;A_AM) zv&SZdZa}c3B-qbNFKtYQIL*mR;FQwM%LSRnUq6#ABrvlA#Eg%4tU`H8v8DG(cz^cuW8$ogd;5_+2nNC_cOXLFM;GdFGG|a5w z^87jJs#{MZkS5v9${0d_$nBThsBgexxxoU=DJCfFAL8u1sNNwVM#9ZjhsmMH=7&Y> ziGXo3^ zFb%V*V`HQBmQ zS302TKn9kC&R8a}ME<@(B~BD=Uf!%K@zE8pgVG|02G;+7(%`aIwdD@xaV?Iuf$59Y+AAXJp56#DAGL9F5 z=lf5>@R+zk!h6eQyrg*z>@h4TSxaTUqO^e}8AHWWeNi5yO!u!k3{|(Eb!KPISn*5M z#;0XUl5Kr{5pGdl3deN8DlcA)2dW~ZO(?miCVZ-;-F#2iUAwzg?%FZ*P(0QP zm^RLpN+DD!e3ycI67&n$+m#SqaWE^{7r!25YX5*2@(*&|=wrBbqCJjn+2e@soNG+Q zew?GjPoQ+K$7JYo#K+0WCtJy`zBFV0skH06K5QH2=T_T@;}(Aowf`DiLWOq z`g1t%*Vn;Je6;9pJ0x({Wxn_Dah&fk3Em_e54w-z0L8*|hi`FU-m!HdoD#DIn^g?- zPsQnj4EDD7Lu{OY0kTFSr{AOG4gV!74WrIhkB$C+`$U|DmCy6KUVt)MWrz*NuyPGK zV<_y>_nM!wK*bQ_o~uhC9mnKNh9bhk?7zv@nTV&Rp@t~SU3?Ru* zRMJH*jtXdkK@RL)YLtgd)zbEEXg-t?@IJ4+%@9YwYBkZB$7TVT-FxPs^B^!<0Gr@J zjx5}+FU*frRDz4{gIisfTOQ7wHw0ykX&3}>SyHd_&Qc%0)5*%!0lL0_YxMd~(z&*x} z26D6$`oAySH}|{A>pi`TkSVwA;(tWlvOf!JE{xAZx$b@pa^Z}=LM`~7;EH~`xpT#3 zCwnDb2V=~c<`5(4ZPSA^SC-Ur6Ru%MpP-85JvLI5PYkt}0&&TaZ$1@dD__#HZmG)L z&lIk5v{Hhqp)G6U!fHCf1B2djGkhknpYWQ@2f{Z{{=QWoAU&;)%2pP)tLO9TcHZ6> z%FL*oD?T`$9#PYG9-MVY;I+VX!L$EZiweM6OEaC{gAsZ9#|*BUecPTgpac0m9mUAU zYUE_dRTeKKa9XP)4H=bVa|dp&MaSkw0>Obtnc-D>*eMt^a!0t(RnKY`r)Fox;hpI< zqTl3RRw;E(pVUP!1}ew^{(MqSzEeT=$^Pz2wB@KyKx+c+Zz(EaN znuiPoLjBvXk|Vrp&(&kPW6OP|9p5pq8poUbo?>nboztma!*LP13PvgPJ!c7C3`TAl z5Il~9Z*_)0FZ|FZ=&=9LG-vyxecK=Y$7&cD^^r^p%TZb$BEVa)xLX25K?G=@@+4?l_rX0Zz)x`{ z!XjWR;Cxn5U~M7_G>Y`IpSdL*wkI4TLjXss%lofH5pd5gdC8pwW`i)i0*mbwI}6gb#tf%OthejAmDBN|MIqW*L5?u)_Ju!*LBx# z)p4siI|?}Nbiie=I$V}BbG?3d@p+n2-x3)6WGOmu9jf$kwxy)FwsOM>w1ChWcBtFz z4H$vOVl&p%Lo7R`BZKiA;4zT-l5zvo7F6DI10eOddRVUHfKF#_3R99%4iuaXR0lyo zS1f}HI6CO8Vb$rL=XurwFzvt;``o(6%v!+zdSUa;{=#g>1er8AN0-_XnLAhdS5(vxU^w;P`}9idZ>)G5!JnSZAY>*DeHY1 zzqLZ!cMvJRe*&z$YE$Cq)i?ZFD2_Q9&Z!c7z~$77TBB31vC3|mgRJ}@SCezm{zF_G zd7Yl+eU8=tVVyOV<2`vi*7#>>RSpc+zk*jU5!AkzRe7X1wkmuHS2OvyCos%m9Y`Y@ z1+YhtwMLGbACEpk_(D6zl)rP|7$TvsYySMMYV=Xy#s|1^?1k10Kw-d3d!V{OjF@K= zkIN;V>_@e0Ty(Xb3Gwkm3;teF^HrIGXQ6|q;kG{m!FMPi)OB<$-cvD-g6DpZS3*3J zy6$6KmoaZzq~;5Tm#O1*A>1>*A@Upwr*7*Dl9x)KU9Np@Vd-(h!g!*-gBd|>9(^15 zWX_#MFTE5Oj7n@3A6@Xz2cUvW+W0xx&@h5~fzy!$0!9hCJy5hfR}}V2au=h-gh_1N zTKaM}ywUfKTPC&Wmebeq=ei*3P@-&K)+!%k$K8I@fn|uIeI6V*(zLL!z@WJFU8;xX z9jRsMYlcieGayF%@y@3Lb3`Yr53A!V1SyZ0T3tq}QdnIX0q(_}vmb)*0K|B0c?+cq z^3I+?nVepO+Op2df3~U(S-{!l|0eN%GL@X-zE*BNi&@`L}ku3?7l zeg55#?y8~FakRLHfiiSs~Pe*&``RMDl zVFpzzE(m?Nv^Via|5svT*D36S2BZR=&9LZL^HwaIPk$P1{jael;PVL_VR4=*3jv#sstOM5TYH+j80 zV+nZe-1>O34|rXbDCQw%Sg&=#qMcxWm#%j4513pRqn+pvi+xZ_i2|+Nme-|3)u&G? zV}#g;+TQZ(?BHQw^5yb%aMSQtN~>yDdJGO@+W0KK`1z&X7~F>JxkF92i^qvBK9`zi zB9FGadeQt7Yt_m%_El3Z=kjuT`qET$;XZvp^`_G4Ttyzo?oa;yrS{h)=M`7_m-jWR&A#juLuSQY@&M#jrx6i8BxCJDHIr{)Ho*A zvJ|Wm9px$Yop4xuf6zOacSJzzdS!O$3M4oj>Js>_@j#oY8#Z2j*Rk_QGpC%Sz>33i zd~LSt*=8GYj`cD9W8?RU7sEy^zdZ~9*=jt^=mJb_F|<2n2%Wt4v7q4_knMA{j@aHq zNaS3d9><4jb?+xM3)JU?NdLMov_A8{;s3>OsW;0y*m6xt-z_iQm>x8iI=#20#q-^{Z8%8F z0SjZkRD8t=9rMIZ>U*_KSL33D`|m=KaakXcA(*i={j-=oaqt)S~ z?~g`j2Cc76YQT8^MPG;p&7UJ3s(KU(`bO@nV&!6_|GpP=!vHL#;#EHfipB}2EkQq+ zv@4!|ojjJjw?7LU)7v)`g1Jeq&?i#J5=r(v-ezAP$1dg*x5Mt$R+vr)nCWGR66GxS z_qX*3Z8$2l<27w4KXzORE1dbpX{X4cFgjO`*VcT+U8Gm=j(v#?FVbow+{$&U?!0jhl5j#?#UuMm(*>KDmx#dv;lDS7mXKg zi{{wBw)T|tL8O;rMJLg*)1khrp(`!j2J^hC{6d=FU7ej^WB53X`*0q;KeurQ((Um~ z7HUW8=%`p|>l+f=I;0NGjo{hvoV*76{AO%{uNVMcXeXe=PO`0b*@g6$Zv&iTPAv_9 zitqghxOvc`>!^Jf<~G=@rLUfIC=|A^7C4Wk0x1kZh4K z(JJ_$P!#pZd8fu?>QK17@`-GuwwalQ^BWvyG3h%TW-*_x-xyvaspfbo8Ge>HL|m3! zj&0bFjzRE~D0t!c0K6PZJtNiTNfiQeal?e#vk#xHTXLd5!Z=}6XnL8bmJz-is&`a|=CQP;E>;K4QT)yetaUwMOasW1<9x_?-svbY(wj?%|u#)#i zgOI&dlEwohNn~pop(G1SSV^2st-g#r5;b2S$-*iNNo3e)f{Ra6-;s~VVmhB}^*2bC zi1E@q2^N;HG7?s7%f-fUlstiCGb^lQQ6VFVZjSx3T6wy?nh%g8SZ8GVlKrB>Vq@wAF21@Jn+z z|I1(|pz4iP{ye9eUlQ(qtgI=xhp@55=C*_P*Z-AEKw*1sO1-Ed{a?DGfZlKHtK?>J zsre`n_oHP|Apc)ktIHnzYN;s_HD4gf!Wt_{MA%p&s(ZJWkyo<%9VAIa`+u?{ob>0i z@5+y;-1xuq&D-IpI?__UB~o z?}uvUZwg=k@B69py@&4PzmZ;wuj%gntk|Z(1>ieF6W%{JdVY7DZbIzk2s#ar92I<2`oM?oh9GE3aL&oUMn!#ahfa+w|RKBD`7OQeP6K`$zSa& z*Y^k8|RDagA`0jE~*?lMZ>YMY{6$**F` zy3slJnLDQt!#oaxs`TgjV=!d5Y`nHBvHVQU4gz zK=t%o5M`fik2kyhnN{k-&p_P1;hvQ@*B^tUKsL0eh{O*Cbq;?OSb_Uh48sanxg{gY z%>N@6j=3Ip{+kQeaErQTZu%OwXD%3CEr-ga0Xc*>{f6^4GP$yyw;eppsOM4*4cges z+%&YOB5fZOhLeYS2PR1_TdpZU1`f~C%VmUnc2OCh@9jVc>Py}WP4_{;Yl4=iOX;-m z=5KGejjf}GCVb-rn_I1@v!Z0uv6M;`p^i3D9U$GFZa{d#hi>BJtKnEleRnV!1RnsM zpo1h{B~{|x zqJSB`+Yff4QJ&mce+TNZsQAU0TbE=Od00T}_ZgD5_8=8CLXc%M&hyn&G9OC7A##b| z*AzYeoPg=H9;~Paa1qO=+=*Cy=Zf{9FqPTlFW-}g2#zb#^Q2wTH~s5uf_G5+Z^LHlLAb0$=}5N;;(_?-msP69+8dyp0Sne zKHUcY5XnY$zZg*s`CTMtM@Z&4NmsbjBj^T_XE&-da9n|8^p*gPKBX@VNT>otJBVqp zV*Cu?7cd~S*oC0VsEnky4(hiNW8YVKVN5R?&-6b82i5v4FJN4GFS&0Ug>&xP^$*2E zd>ap5zGwXGwK7%WXQ2yx^YGi+@X(|nCMMYd`LYKpUsL%k9V%|Z23%+)Z&kD<>%(w4 zVGE28ZI2(VauQD&_`ZdCG!v#?a&duy!FkAzm52@~UclMbI<5-smjWt(5hxv0P!a$1 zC)G}uT!ZTLYgA~LQmF=Y!DlAF5TDBBiywV<73y7MuH0*yPh=wfhT2K(^7V2P@3{D( zBZlveUio4>*N2eUs(Wgcjw)79)1FhRSgUY!pNa2`*v;!~#XFhy6?zwoZP3dLcB!1R zip{p`)JB{xSV7vH3a}kQ_)xxcT^ezu#^R&i%altlQn?h#kAP zYIoIoF8J{C#tROY96~Y)zDr|`T7&RFpv9%y-p!c>>`&Z|g^2#D`%XaG_DU0N8_$I+ z(d%am|JfM23fQc~-EOFH-9y}yeE$97pa}&if#``D`*B>ieT8id>O%4BeGc$S`F8oQ zY5I!zUP3Il(6C~svHkQpU=kHUeom8D8c%CvJhdkP6&-C#Hk$R}Iw~nkZkA?E%;aDk zX=_$L)8#9s_0t8E%>sBCRaU%L%|U4-(ToESgIxy!-F@5&B9wI2$xlc$+Ds+>Ab~%W z$kyJ%0f}V`Css%S8l)T9c)>7u(eAQVR6jneoejViV)dS-+yYLbqhONA%}J=CP!Csw zYbgxRe7+o=kww$IX;$#6Xi@=MHaVoB{k@B0y1oCa#z5=H zPz?}3bJS7#i|Y9}bys|~M#U>XOkYhvSjY$9L_@oA<&IihHFHfQHC6HpfWg0Mr2b;r z9U(*)qa+I&cL?t-D3^{ZphHBsmtRR0moA6!%^3IPRopX-YI31Km!UiFe$-gIqx^Qf zy#)Z5XS-5b6bE=C26gq!H zr=aSZBWGEiPNY9vnBAqp0@@*rE)nVJyGRZC1-KRL;FBmQ1R2q# zbQ5nd{|48jr+RHbVKM$BP9TX;Y)I-Za1exJSb+xy#R6Tn-hnK`nej3PviW{=C}5vr zf&@Tei3Bx~=5a0^hk0G58w@-iVi$l+%AuvxBH zA#SkDmlEEw5J;YRqi+p{c0^uevwhY65YlI~wC%i`QsyfA?COGn0RUGa^buUV$VJ?RuRObB zl{(V_h3}*rq*5hM@`(p^b-l(nxNfT3U9)5=kk=t`?JTU^LSKL4%_t{g?YEt!3FW?N zP@_M`ce%Q2kM79uq@;k?BP5eGKu^c~DKnyWXQoEi{vOo@G&iRCK;ZfU$J~BQi`&}e zTAD2s-2@3Tb~ta64Q*bfFfa7SuixFjyf^=M@883pyCD2Mu=jrNi1zk^2LAy5`-7u9 zV1Mw3y!$Tzk^mG!I?zl8`P1O1howCb2_p|*X{oOve1?e9g#Ekkgyqj6{7#ZX`)*@; z$#847I2uX)ukNst!sh|T--Q-T{E~R^s>>Mo`ov1jqNPIj%!FcQ`9;8wVz2pMgw3Ri z@`lH)`WwV(Gl}AI-KDn_JoTACNILS?1nnM!at}K zbK%~(YNFf?o^VqZEzQ_q<~6yj7gIs1Nq50X;$H4?x_1!C-hKi>fIOUM&Ngn3n)(8a z^SH6y&0Vk5y~FYOI*D3^o0lH_$KcgR_HU)LG~o5y#;HCURoeK zcHgKR$ff11Txq;5*%1az+yxQ*lU|0HYIdAQ-%pSWhT$Y~a&Ar;#2>N*9Qyj5q9;Rp zL$zt>r2+Y%YY@ZeM3!5_7g31Xa=EXxF@$uHuZwPqNzRPq+Rb~NcQ|U#h0zn}0yvwy zZ(p$pxWloxXF>Wgt}JVUk=C`S4r@+vOLKc8BIN7f`-MEjWMh=Jnret<1eCZ6wegm8 zNX}JiLw2fb5z*~`5qHB9<7Z#rnJcq18ZC%EoM+30nJWqpE?Q`-EG8B=8O~<=P0o5rh-MA)LWj8#++pO2YCNdam3e|U zJFMe*7vYkuE5PJ#?G(qP`i(9F*^#B&D4Zc58wlPeEO!ui9Y#CwY;a8LUZ8bgHyPak zyBfy6KR_dthZNsCRFbd=PToM|8ZWP2f{_uNr|Z6%jn6G?A&T%*1NAE43?%=Zi3~9- z-qsi~E#B4$@7T`JLu%VvI6Uf*wJp&x7Bb2-z_8cbuIZ@Z}Co2BN zKvO{oGaK>(fBz#Rv>rG@3~g}N;IlN|_5hg!q^rL&P`IAcCio|l2M_GrPg;=H9H^lQVv(Qs-UOHRO)Vm zULMY*Db5>LKuuZEcX^{7qzLqd?0K@sb&+vfKVs7(HkumSN}|m}Y{Z#fj71H3G!Wsl zGN2U8Zk1DVffyr^Lh!PA#OgUr(566t=3e5xy|GuWtp`?2OspH=A5m$uDoz^E@+nof zhIl0a{kf_Lk{*EBK0Rbx=ya7ax9d&|KT`<{8GOfRm=GA_C?AHa?2o=8zps&4^0zYy zuZ5yb6RZpGO?$e7eMyLG;#JVfm6ZUWhE2t|e*AV_U(xImuigqUe;n@JIo?{`+i!8@ zLlM)neMQ#^MRSg@iU6FOL1`(6-wgaS>Hx~8&0QJoMB{-T7h++7xt}-f-P>&|tCCu)itS-nQ zXRx^RLbBMlVavK*h+=?61*(CLW_~$hYB8INi4zdm1{B1;ySOMaJhhB&2IaCkflG8V zj5mBuzcVxs7$f9C%Y-Sy4|~utWsU5%N{g~`fWVZK-4xfZ^8n9Plg^4Tw#)1Ww4-|s z6!!8Vp(MarZnF!fk^QY8d|U*epn&q^dOr7V{5*g%SJ2eO{(A!|y$@jgA@+00JqUuX z=GS*G0cLx!9;TgdmzO~}3;>NfJ~ZZ{VECMT05~v;z@LkU^FDQf7-*~U+asAWgqo)G zWDa0)@$kj#9mA%`#B~^2LlRTkFdS3mcr9s+zBK07QdkH(g5QsqKrvp)bpIwBz#qqqrIV{Najzl+vz{5TvEG^Sr~;6i9mZ^LgFBj= z@(F10rR|`9PsjyJkC{7?%xaOuo01u3mG8PF{-WnND^nT!NuV=rW6KEKB%m5oBIjh# zW@bV}s~y2ID+^a@Pk4;q{};yj_~iG5JdSI)@B|Wx#(5t-@Sr)G(TOdxrWyVt}5>_iZ2_jHlrX4p1ev z&&pBxL|o8C%?Vd~AE#IMskrs(CT6cM!)He25d8Bn3KJe&?n`to8jjWja5b@i;GDv+ zNQrgD2xqk(R#E+ZBP=2xD(Y$j2gB7Ey#d(ne0z=+px-SOi-MLH^}T-&`%%e%h3Jmj z2=FxFvGxwz8!^_#HPiX~zb*XDFl0@?>&_ADMeW3urnd~`m11mo?gt||7SkH}kBFRI#X;ZR21q^UASwGn&?!8NxS^m4Tdu#||HUKm@= zY)8vqPz?VWfUJ1Tte260@2klv4`J+LgiC{Bl9&|krM(JHbr2mM)U)9WrxYRY&2?5& z$_a@wltPLZ#B(38xYbw_Y-w@`^VTb4mnv>-ZKNepk(=mYrAAG;FgHtGBGI25t+6ul zuA4$vPMU8ush|qIBr2TPS67L#@K(&O;bz{ML${Pm`EMBg=)&7;?}!p4|&Ny*R^eYQuMEusT%?$C1}`)?)!C!Rk0<0a#MS>{3Z#Wy=COp=QO|!;cFQkF~*DLkEbvGyzyE8IeFr(h(pCVn zao<_CCH+ZtQ0Yh8IUw$`eb$p~$DhV_9_q(EPnCguzxHC<-;ZQ9E2ln7>W?US?C?Lc zw)MuQo7h`Nx`FleX!5&2?s^vrME~2lCqqO0 zpOukNdfsa6&MdbU{r6n2{4FUzZT1bNRL;>3iOC?RSzK1v;R>=O{;G{vUI&+w+UiG zt+Sc`qxvL%bUYZBOo}wL@@8@4CV|>GvWtI~r6(s5Eu)kB^!{KFHd}3BP0}@KObd$J z$t`}snfdP#l{M>vxqlP!lhivK4DyuL4MLULXp(t>>J)9+R?Lo+aa%3~$(F^W@NB;w zGa(sLus3e42O;!7=7`R z2SeWBeK}ejLj$M##C+d@0nPM*JZv)Kfdwbhrb8lcnls?du>SeV!NDv=7x0Kg9~Qqr zRo=hJ9^>{bQe-j?VzQ={$l>d{smJSu98J5s;w^DyF!b5)m*r0T3AkQ$^@GECF?ZLB z*QkJ75%s9L*rT0n<_rmmv{<`IG%iBoPC_PpcI9zx0awi8N|@EliE=|bRYR)6Y%7^C z&`OjjsotiHgjljK;?NF!>5hNH7N(rN$oaB5%@$J_e&lre1av74ILPUAh~R!envP7r zQV^47z8hW&dGZQ_0yr5CHiMwa6`bfXI)i^Ect*yCWH^9>10l!)vOxz7y(J8eL6P9f z2r*#7lw?weMLQI+C4!$jl60ZLkqpgXOU@cbG#HUU50T_&3g{{w1D!_*xl$`cg$6K} zA*}`p(Rb)NL69fO0|+S?#)s=^Do}X}Zt26LmZ|S6lA@QS7dZkp{vw2ZB4T~9-?Vni z6~iudUCEJF#t?M6EWX#<-1}7v_kX|ke@8BCd**f2pNS~9stU*Ez_wfoi_1CLZ8e>` zJV;`R;fIijQ}M~kWz38MNMS0=Gk;>7CfnC;NAH>!|GIOdu_s97w_Ir$`3pm{Fm>%S zUJP~<6}ninw4%o#1Cr#CWg3WOpT_XRA}uvhGONs5)>OAI@dcZ6mYOs#$W2RCrw6 zhC}acul%TrKsxm>Ee-{10p12MatVfvAyYS@6kQ79!hPinQPJdVjn7HKqeO&RTG^8J zYkR;l4pN*Pr>CspPeSOra2HN0(b6%c=WakIY8{2*1ae2_Ex+OF|W#NJ>r`Fq%4xMZqUCTV+a04iI{cXo9aGABB$giAy9p^M^>F&;;J)iiV!?eajyN(hKG^4%!lg*e{$RGSjlM zrOgyEWd|2E2mIRS@f*PhdMP79xgzd_m3!0s$Rt<7a#xQ{$lY3`uSYAj^BNXbz(XrE zCKg4$wLDRKd~%;@jIjz|4n^WRJEd&B7mqUk=DX%*ROZs!lW3qm=;yZ&g;Y#<5UDMl zN4VOQ%xdkeKf>>>PV}vMu5&w2d`f{d9&bHRHm$~(3!l#lJbI$3`8VCdaR?{T;z{Tv1OchFQUf4iy@x4Xu_2)siv~N>X7X zWol|zlBWAFU(cwRcgR=LJSN>}i@R0->YR7-`^D!Pr_CGhMs)5}MD=PcYj_{Ep`14~ z9N=2M&>R#)NW^o-hR?PQ)X6J8Rv0z;BoS`sR3fWnoj^~dFSuGl^-$$8DQu@?i70h& zA-Z#omv*bt1@^m`V?6=|{wQZ-r*2tFB3@it{^AL$EXssd?%F?N+b@Npi@YeYZ36=v_qNKz->-fNvw*9>yimN-8;aHe_sGD+)svjPC5 zsN{$JYlEnY@&RsC$rRZzXf<_DAF13_3%s~5n5cq;Z*)DkUz*M%&X5e69pFc!z)l)b z908eqRl7{{e%yH!Nq06RrFV|@HMXqC@w6N%(`iyhi&q#-u4m8E4KuMHBzIeotOf{J zC3(Ymu!&@6R!u~mtv*$kjYKXsv!(ePPMWjf^Qy8qR_p4WZeh!v5IepO{ZlWKl-oYl znqD#OD)wF$1lw5lmE{sGlWu4!hTV>s@svDqhi}nPl}!&h^*95; zVaN0N;A^x4oFU$&e*dLP%ayQn&+IscQ~vXS0_++nWieP2BzSYFU}%nso8Eq9YX}^~ zfR)az>`m^8C{7)u-1_&z{)G+iAODi(T-j_|+MGtoy+sz(!jg7X$}7RL7>0i7QseF= z>*;cM&9dGERUOF&3QeOr63goTFD@Tn6*v&nV8@ z?+^kj4m1ljYS(X~M|<=Npa4lX;^0^o)+Zj+vx5*ft-4&<$w60?qfL%whe$|6_*@NC zqV-x?(@vVe=FOQ0b44n(Q7aXf-hLWMPE>H5bz*O|8GZrz5?h zEjy?8#&xt6hT0;uqRF9SHFEoO5|veNw4z(B1^ae+D*H2lGky8hq1mB`b!4Pz)0VXH zeQmayV4nWd!q8qB+Wu8jMA;=G0@3pL6~QtwO16=xBOP0wzH#G%hUMDy;>PFdj-0In zpY2|zv=Lies9i&zPLvt$K;(t#y3vz4sqMLd2^TOj5vALPVp~le({1L25+gAQ0zWOT zEStFf@q;st%vn^%0opYE9{=ZD$`F&AV2X4eFvAp(iIY~L@Zxp5%-gebN+538(M1yx zujb`^6X05uC(ls?NPsJTcl?-31EIa@%2>gWa^r$TTTIWCdS&yhWGGUIvnEoWKc-Ew z;6ds?yo9^hX;JfE=jK(oCPwQ8rqnvysVm);xaXW&<)K|NuT)4jF%xB4Qqgeo=_|}V zQ}~G!m~HKOqhm+(oM=Y?p5I);W&(1?vX8ezJDwJn(_uzxQETDUWH`=aPv?t|WeyX| zrJ*^9bt4NKKdz?NG-1}v$Ew{|tVB9`^X^fhzVd6U*ysYYP~X^Y2}LZ|o5rLmC>~h%hVarCQM^((3dg$6A6>RIv$&d0 z{hP0(59EhLEA{GzdE(5m*1jmuE$y!OwMjfySuKdqZ#Dw=N3)D^cbaPaNBa>N12XpHpuZ{JwlqgR}pIQdC ziIGXuf?alQ0qMvcTyvvVMo?Qb@Xq+BNmRx_CF`a7DEGfbp|aeVBGzcl?BM_X{>Qtj zF?Rr-!b~D*v>7VsIQdMUs=Wk^=&ahgC^zazOxhY(8JJw_qzOx!8cZD7(RnU4 zZQGVp=>>j3)pgs-W4^F7=|$zkPv!P{AwM<*uusYKAF& zYyjXtX?d>*ff!dnZ`=-q?DX@CB0`}+ztg`#%c!X4!x|oz9*SoIG&~@VxKoooh3b!d$mqAQrl1xCy za%+KV2grSZcS^53MQZ*wUBH5MB4XB}L-Rc^^rN9qBh}l#^Th$CeBMQ?p>R_GV;+oe*}eGTYDk$ExMN7FbsbS<75Cap(qLWq3? zpmIaGLfSr9%i=^hsW)L)kBdQq4wXsb8z=-ba(BxLq&c}WHiG` zi55gi(K=#Eh0E$LO?@QLYTQ-1q)fp-!JHv2jvPf{BBxx97_A~;r>SKB?5yLwt6bTS zhA}li)~J!na&-fwu9yx zb^LK~2SJB-S}@}gPGpSx0qWC}e!~P2xKD^KAdix|TP~oVCcO{#18@!RpF@w{G%!|3 z9+O4NdTx@2sXuPa3Qv@sL!o$-u%Nr~tn~07Q5;5uwjRp~nUJx^I(;v&=tR9iU?)9#aRgTEu_CA_ziC(%kt!lZB-uLJkyQuDw(rzOA=K4em zY9=S>?a|xj6W&ec)4zssqn_LuL0ls>{O24uFHtGqZXzk=%xKE(L^C9LhI4)sH)al_ zBCY03(gj#7qA?LH4LRZjKjZt`;NJz&Q9Jwker>GM&Q2@YPxW2{_GLNIG=KwI!PRy} zi}`pSm%MQN*NRK%4;^og0x15=(twGn4Qt+|T4#KD9g)tDoVV5IM(*2PT6|e{x9>vd z$GL6cM)X_ee6~;nY4<{Oe!m$`t~?(oBA75A~62*?)-k2_aAM@IuNI{<1o4U%gOG9?A^|UT=zug)pE1N z%vowtf@l*3yDH|`*+afmS%eQh`{k(?3LSZNqO*ZluC!g*VNZBJmlJa$mOHNS$f2jE ztA_6^^?lrnM|0bj{=QC5YywhE=GH~=%)Fsu<2S6XpOzxeHuuX?5AB32^!$qpIX?x0 zPfM5I0L-VwsBh-N_Vpm`Z8JYV!SneZjxclG}KT+V)bSCAGfkSEi9tA4Ql&r#9PuVh8Abh z%jRsHBNug|%L$XwR*=}f7LHu92Hyl-nD6!q`VLT4O$ss<9X$xpYuyA9WDa_39h8q( z@VQPR@e(%}Ow+lWjn)=qayteknDHScOZI4mv8j;8U?c3p?B)g^&xMIqU*{CBLd9l( z-7b?;8gq>2ig)5%K&=(JkL#fH!^49Yp_c~C6{z82rAQOqm>XjZ*wtPf?7{_82l|61 z0f)nW?->@Uo4JlOyXLd}N>pxFxgFW=CgolkcopE+WX7Lx{v{ckH}s;kFC2c^16ParyT3%?i@g5HpZv*R{{|Tvy!`2Z?X3ZELvf}q z?3hR>l3+rYd^pU9lFU`T_vbi^6cZEvS+QIC3uU|1OvNG@>xK>)!%P);VK|inK_kSI zf0*$%8^PRRvS3${xRSDxCR{+b?>sp?p-)? zd3Tks-*wB`NIdax?$p(cBzPH4CX=beWg?MGhNJ)Kj&KyonBU=LOV|>U>cu3&LP?u& zF#2c9=n|ve7!(=MCt4<5W z_0@D=t=!DBH6_)qbY{7GaX!thx8vBY5XuTttRU}%hBV66T0){2<#J7loAyMZnia1a zkw!7Qj+ZB+M6-6&P2TsG^F*V)P$!A}K#b~EA+$?%Bqgz!D5_p<+_gu!2fchh%Ik&X zM7_N!%6nKO^b^L^8V>I-qeE_3+MA_mwHm2&$i^*t{E#2LYtfTi0j7-cj3}~BorNmvT4Rf z&CEcN0WD)(!`N_-@HtKHZO&Yz8QDNraTyO0O*3U67EOJ=`_H!U#aW89Hnr8xgJ}Qw zzsPKcXyy$pe$+Q4aqJk z+NLCU2KqZ0|&MAphMEykgRw9A|z15JH&4g6ni zIj8riqUfE6f1w+euBqyO*m7ITA~F{})x(c}8v4zhwj`6}x8YbM7>S32_=Nwz1@nXg$T8(!uXPc5 z`JxX5jmwUqN}g%pEaQMetT;$_L4)C7B*JwY*;cpQ?7e)El_X^eY29YA!g2|nAY#MC z7GvISPh+9+_nA;I#s2={_vG!;DTu2H`gl!L6%?K8cUeUz3F2;oqAnSDmj=)5trcLh zXtKs_uaCl;t2hZqHK3t`LQ4Im+ST>~6V#6=Bx z3s6>jto|$(iv33zJ!=Vgq}VqTQgZqazxhWxFbt$&UcT^Rg9qj(sPOW|36tO%%y2K~ZXPv7Sq6T4HHViF1BRYBB>5=n0)=<;s5WVvmZ)dY9YYE&qW4U5t^lVoop% z1A`KSk|vOKLPJw?!=>x~=XE@Ny>z|&&YZdEt$j)Vl>aGx&1>2#H5eIAdhrAR^*jce V&&j~aQ!ytwfuY%rfuWP32>^J7Qa%6x literal 0 HcmV?d00001 From 85698d480f6f0c36c167489ae8ba1911214c1532 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 5 May 2026 09:50:46 +0100 Subject: [PATCH 09/16] tests: cover also request and metadata --- .../org/commoncrawl/util/TestWarcWriter.java | 75 ++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/src/test/org/commoncrawl/util/TestWarcWriter.java b/src/test/org/commoncrawl/util/TestWarcWriter.java index c4bdef4646..56d13bd4c7 100644 --- a/src/test/org/commoncrawl/util/TestWarcWriter.java +++ b/src/test/org/commoncrawl/util/TestWarcWriter.java @@ -46,7 +46,7 @@ public void testWriteRevisitRecordContentType() throws Exception { String url = "https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"; Content content = SegmenterRecordReader.retrieveContent(segmentPath, url); - assert (content.getContent() == null || content.getContent().length == 0); + assert (content.getContent() == null || content.getContent().length == 0) : "Content in revisit records must be null or empty."; URI targetUri = new URI(content.getUrl()); Metadata metadata = content.getMetadata(); @@ -95,6 +95,7 @@ public void testWriteResponseRecordWithMalformedURL() throws Exception { String url = "https:////sites.google.com/site/lebercailgiteennormandie/robots.txt"; Content content = SegmenterRecordReader.retrieveContent(segmentPath, url); + assert (content.getContent() != null && content.getContent().length > 0) : "Content in fetched 200s records must not be null."; URI targetUri = new URI(content.getUrl()); Metadata metadata = content.getMetadata(); @@ -124,4 +125,76 @@ public void testWriteResponseRecordWithMalformedURL() throws Exception { assertTrue(warcOutput.contains("WARC-Target-ID: https://sites.google.com/site/lebercailgiteennormandie/robots.txt"), "WARC-Target-ID should be normalized to a valid URL"); } + + @Test + public void testWriteRequestRecordWithMalformedURL() throws Exception { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + WarcWriter writer = new WarcWriter(bos); + + File segmentDir = new File(System.getProperty("test.build.data", "."), + "test-segments/20260505091103-malformed-urls"); + assertNotNull(segmentDir, "Missing segment resource"); + String segmentPath = segmentDir.getAbsolutePath(); + String url = "https:////sites.google.com/site/lebercailgiteennormandie/robots.txt"; + + Content content = SegmenterRecordReader.retrieveContent(segmentPath, url); + assert (content.getContent() != null && content.getContent().length > 0) : "Content in fetched 200s records must not be null."; + URI targetUri = new URI(content.getUrl()); + + Metadata metadata = content.getMetadata(); + String ip = content.getMetadata().get("_ip_"); + + Date date = HttpDateFormat.toDate(metadata.get("date")); + URI warcinfoId = writer.getRecordId(); + + writer.writeWarcRequestRecord(targetUri, ip, date, + warcinfoId, null, null, content.getContent()); + + byte[] compressed = bos.toByteArray(); + ByteArrayInputStream bis = new ByteArrayInputStream(compressed); + GZIPInputStream gis = new GZIPInputStream(bis); + ByteArrayOutputStream decompressed = new ByteArrayOutputStream(); + gis.transferTo(decompressed); + + String warcOutput = decompressed.toString(); + + assertTrue(warcOutput.contains("WARC-Target-ID: https://sites.google.com/site/lebercailgiteennormandie/robots.txt"), + "WARC-Target-ID should be normalized to a valid URL"); + } + + @Test + public void testWriteMetadataRecordWithMalformedURL() throws Exception { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + WarcWriter writer = new WarcWriter(bos); + + File segmentDir = new File(System.getProperty("test.build.data", "."), + "test-segments/20260505091103-malformed-urls"); + assertNotNull(segmentDir, "Missing segment resource"); + String segmentPath = segmentDir.getAbsolutePath(); + String url = "https:////sites.google.com/site/lebercailgiteennormandie/robots.txt"; + + Content content = SegmenterRecordReader.retrieveContent(segmentPath, url); + assert (content.getContent() != null && content.getContent().length > 0) : "Content in fetched 200s records must not be null."; + URI targetUri = new URI(content.getUrl()); + + Metadata metadata = content.getMetadata(); + URI relatedId = writer.getRecordId(); + String blockDigest = "sha1:def456"; + + Date date = HttpDateFormat.toDate(metadata.get("date")); + URI warcinfoId = writer.getRecordId(); + + writer.writeWarcMetadataRecord(targetUri, date, warcinfoId, relatedId, blockDigest, content.getContent()); + + byte[] compressed = bos.toByteArray(); + ByteArrayInputStream bis = new ByteArrayInputStream(compressed); + GZIPInputStream gis = new GZIPInputStream(bis); + ByteArrayOutputStream decompressed = new ByteArrayOutputStream(); + gis.transferTo(decompressed); + + String warcOutput = decompressed.toString(); + + assertTrue(warcOutput.contains("WARC-Target-ID: https://sites.google.com/site/lebercailgiteennormandie/robots.txt"), + "WARC-Target-ID should be normalized to a valid URL"); + } } From caa24b27f7fac640a2c605fdaf682c53121706d7 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 5 May 2026 10:00:06 +0100 Subject: [PATCH 10/16] fix: target the right WARC field --- src/test/org/commoncrawl/util/TestWarcWriter.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/test/org/commoncrawl/util/TestWarcWriter.java b/src/test/org/commoncrawl/util/TestWarcWriter.java index 56d13bd4c7..7028e9acef 100644 --- a/src/test/org/commoncrawl/util/TestWarcWriter.java +++ b/src/test/org/commoncrawl/util/TestWarcWriter.java @@ -122,8 +122,8 @@ public void testWriteResponseRecordWithMalformedURL() throws Exception { String warcOutput = decompressed.toString(); - assertTrue(warcOutput.contains("WARC-Target-ID: https://sites.google.com/site/lebercailgiteennormandie/robots.txt"), - "WARC-Target-ID should be normalized to a valid URL"); + assertTrue(warcOutput.contains("WARC-Target-URI: https://sites.google.com/site/lebercailgiteennormandie/robots.txt"), + "WARC-Target-URI should be normalized to a valid URL"); } @Test @@ -158,8 +158,8 @@ public void testWriteRequestRecordWithMalformedURL() throws Exception { String warcOutput = decompressed.toString(); - assertTrue(warcOutput.contains("WARC-Target-ID: https://sites.google.com/site/lebercailgiteennormandie/robots.txt"), - "WARC-Target-ID should be normalized to a valid URL"); + assertTrue(warcOutput.contains("WARC-Target-URI: https://sites.google.com/site/lebercailgiteennormandie/robots.txt"), + "WARC-Target-URI should be normalized to a valid URL"); } @Test @@ -194,7 +194,7 @@ public void testWriteMetadataRecordWithMalformedURL() throws Exception { String warcOutput = decompressed.toString(); - assertTrue(warcOutput.contains("WARC-Target-ID: https://sites.google.com/site/lebercailgiteennormandie/robots.txt"), - "WARC-Target-ID should be normalized to a valid URL"); + assertTrue(warcOutput.contains("WARC-Target-URI: https://sites.google.com/site/lebercailgiteennormandie/robots.txt"), + "WARC-Target-URI should be normalized to a valid URL"); } } From b41a29ca42f6c334bc7a47ce69b285c02894dd3c Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 12 May 2026 14:09:35 +0200 Subject: [PATCH 11/16] feat: add tests with normalization using HttpUrl --- .../TestOkHttpPunyCodeNormalization.java | 171 ++++++++++++++++++ ...pRobotsTxtInvalidSlashesNormalization.java | 42 +++++ 2 files changed, 213 insertions(+) create mode 100644 src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestOkHttpPunyCodeNormalization.java create mode 100644 src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestOkHttpRobotsTxtInvalidSlashesNormalization.java diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestOkHttpPunyCodeNormalization.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestOkHttpPunyCodeNormalization.java new file mode 100644 index 0000000000..9dde13170e --- /dev/null +++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestOkHttpPunyCodeNormalization.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.okhttp; + +import okhttp3.HttpUrl; +import okhttp3.Interceptor; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import okhttp3.Response; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.net.IDN; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Tests for how OkHttp parses and normalizes hosts in three forms: + * - Unicode (e.g. "https://🧠.s.country/...") + * - Percent-encoded UTF-8 (e.g. "https://%F0%9F%A7%A0.s.country/...") + * - Punycode / ACE (e.g. "https://xn--nv8h.s.country/...") + */ +public class TestOkHttpPunyCodeNormalization { + + // U+1F9E0 BRAIN + private static final String BRAIN_UNICODE = "🧠"; + private static final String BRAIN_PCT_UTF8 = "%F0%9F%A7%A0"; + private static final String BRAIN_PUNYCODE = "xn--qv9h"; + + private static final String PARENT = ".s.country"; + private static final String PATH = "/p/human-protocol-aligning-hearts-bots"; + + + @Test + public void testOkHttpVersion() { + // Just for mental sanity, will be removed + assertEquals("5.3.2", okhttp3.OkHttp.VERSION); + } + + @Test + public void unicodeHostNormalizesToPunycode() { + HttpUrl url = HttpUrl.parse("https://" + BRAIN_UNICODE + PARENT + PATH); + assertNotNull(url, "HttpUrl.parse must accept Unicode host"); + assertEquals(BRAIN_PUNYCODE + PARENT, url.host()); + } + + @Test + public void percentEncodedHostNormalizesToPunycode() { + // This is the CC WARC-Target-URI form. The question: does OkHttp + // decode the percent-escapes in the host and IDN-normalize, or + // does it leave them as literal characters / mis-normalize? + HttpUrl url = HttpUrl.parse("https://" + BRAIN_PCT_UTF8 + PARENT + PATH); + assertNotNull(url, "HttpUrl.parse must accept percent-encoded host"); + assertEquals( + BRAIN_PUNYCODE + PARENT, url.host(), "Percent-encoded UTF-8 host must normalize to Punycode for the SAME emoji"); + } + + @Test + public void punycodeHostPassesThrough() { + HttpUrl url = HttpUrl.parse("https://" + BRAIN_PUNYCODE + PARENT + PATH); + assertNotNull(url); + assertEquals(BRAIN_PUNYCODE + PARENT, url.host()); + } + + @Test + public void allThreeFormsProduceEquivalentHost() { + HttpUrl uni = HttpUrl.parse("https://" + BRAIN_UNICODE + PARENT + PATH); + HttpUrl pct = HttpUrl.parse("https://" + BRAIN_PCT_UTF8 + PARENT + PATH); + HttpUrl ace = HttpUrl.parse("https://" + BRAIN_PUNYCODE + PARENT + PATH); + assertNotNull(uni); + assertNotNull(pct); + assertNotNull(ace); + assertEquals(uni.host(), pct.host()); + assertEquals(pct.host(), ace.host()); + } + + @Test + public void pathIsNotMangledByHostNormalization() { + // Sanity: percent-decoding the host must not bleed into the path. + HttpUrl url = HttpUrl.parse("https://" + BRAIN_PCT_UTF8 + PARENT + PATH); + assertNotNull(url); + assertEquals(PATH, url.encodedPath()); + } + + @Test + public void javaIdnAgreesWithOkHttp() { + // Cross-check OkHttp's host() output against the JDK's IDN.toASCII() + // so we know which spec OkHttp is following. + String jdk = IDN.toASCII(BRAIN_UNICODE + PARENT, IDN.ALLOW_UNASSIGNED); + HttpUrl url = HttpUrl.parse("https://" + BRAIN_UNICODE + PARENT + PATH); + assertNotNull(url); + assertEquals(jdk, url.host()); + } + + + @Test + public void hostHeaderMatchesNormalizedHost() throws IOException { + // Build a request and intercept it BEFORE it hits the network, so + // we can read the exact Host header OkHttp would send. We use an + // application interceptor that short-circuits with a synthetic + // response — no actual DNS / TCP needed. + AtomicReference seenHost = new AtomicReference<>(); + AtomicReference seenUrl = new AtomicReference<>(); + + Interceptor capture = chain -> { + Request req = chain.request(); + seenHost.set(req.header("Host") != null + ? req.header("Host") + : req.url().host()); // OkHttp adds Host at the network layer + seenUrl.set(req.url().toString()); + return new Response.Builder() + .request(req) + .protocol(okhttp3.Protocol.HTTP_1_1) + .code(204) + .message("No Content (synthetic)") + .build(); + }; + + OkHttpClient client = new OkHttpClient.Builder() + .addInterceptor(capture) + .callTimeout(2, TimeUnit.SECONDS) + .build(); + + String input = "https://" + BRAIN_PCT_UTF8 + PARENT + PATH; + Request req = new Request.Builder().url(input).head().build(); + try (Response r = client.newCall(req).execute()) { + assertEquals(204, r.code()); + } + + assertEquals( + BRAIN_PUNYCODE + PARENT, seenHost.get(), + "Effective host derived from a percent-encoded UTF-8 input must be the matching Punycode"); + } + + // -- Mismatch detector (the CC bug, reproduced if it triggers) ----------- + + @Test + public void parsedHostMustMatchOriginalEmoji() { + // If this ever fails, OkHttp itself is producing a host that + // disagrees with the input — which would be the CC WARC bug + // happening inside OkHttp. Currently expected to pass. + String[] inputs = { + "https://" + BRAIN_UNICODE + PARENT + PATH, + "https://" + BRAIN_PCT_UTF8 + PARENT + PATH, + "https://" + BRAIN_PUNYCODE + PARENT + PATH, + }; + for (String s : inputs) { + HttpUrl u = HttpUrl.parse(s); + assertNotNull(u, "parse failed for " + s); + assertTrue( + u.host().startsWith(BRAIN_PUNYCODE + "."), + "Host for " + s + " was " + u.host() + ", expected to contain " + BRAIN_PUNYCODE); + } + } +} diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestOkHttpRobotsTxtInvalidSlashesNormalization.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestOkHttpRobotsTxtInvalidSlashesNormalization.java new file mode 100644 index 0000000000..10824f9c36 --- /dev/null +++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestOkHttpRobotsTxtInvalidSlashesNormalization.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.okhttp; + +import okhttp3.*; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.net.IDN; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Tests for how OkHttp parses and normalizes hosts in three forms: +*/ +public class TestOkHttpRobotsTxtInvalidSlashesNormalization { + + @Test + public void unicodeHostNormalizesToPunycode() { + HttpUrl url = HttpUrl.parse("https:////sites.google.com/bao"); + assertNotNull(url, "HttpUrl.parse must accept Unicode host"); + assertEquals("sites.google.com", url.host()); + } + + +} From bbc1bb5624acb817178af1fc01eb8f8cf7fcbf60 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 12 May 2026 14:10:38 +0200 Subject: [PATCH 12/16] feat: store stringified URL --- src/java/org/apache/nutch/net/protocols/Response.java | 10 ++++++++++ .../apache/nutch/protocol/okhttp/OkHttpResponse.java | 3 +++ 2 files changed, 13 insertions(+) diff --git a/src/java/org/apache/nutch/net/protocols/Response.java b/src/java/org/apache/nutch/net/protocols/Response.java index 3fbe932667..0e5cbd0b17 100644 --- a/src/java/org/apache/nutch/net/protocols/Response.java +++ b/src/java/org/apache/nutch/net/protocols/Response.java @@ -41,6 +41,16 @@ public interface Response extends HttpHeaders { */ public static final String IP_ADDRESS = "_ip_"; + /** + * Key to hold the raw URL (it might not be fully compliant with encoding standards) + */ + public static final String RAW_URL = "_raw_url_"; + + /** + * Key to hold the stringified URL after passing through @see HttpUrl + */ + public static final String STRINGIFIED_URL = "_stringified_url_"; + /** * Key to hold the HTTP and SSL/TLS protocol versions if * store.protocol.versions is true. diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java index 9aa1526157..f65b83a7db 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java @@ -111,6 +111,9 @@ public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum) Metadata responsemetadata = new CaseInsensitiveMetadata(); okhttp3.Headers httpHeaders = response.headers(); + HttpUrl requestURL = request.url(); + responsemetadata.add(RAW_URL, url.toString()); + responsemetadata.add(STRINGIFIED_URL, requestURL.toString()); for (int i = 0, size = httpHeaders.size(); i < size; i++) { String key = httpHeaders.name(i); String value = httpHeaders.value(i); From f9fa3f17b9d7f22bb79c9afa9375633bfef8cf7d Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 18 May 2026 12:33:40 +0200 Subject: [PATCH 13/16] fix: missing import --- .../java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java index f65b83a7db..db8be9c29f 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java @@ -23,6 +23,7 @@ import java.util.Base64; import java.util.Locale; +import okhttp3.HttpUrl; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.metadata.CaseInsensitiveMetadata; From a5a4394ea7c08a14ce8e5f88126b20c8c06a627a Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 18 May 2026 13:08:19 +0200 Subject: [PATCH 14/16] fix: disable tests related to a part that may be out of scope in this PR --- src/test/org/commoncrawl/util/TestWarcWriter.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/test/org/commoncrawl/util/TestWarcWriter.java b/src/test/org/commoncrawl/util/TestWarcWriter.java index 7028e9acef..af448245eb 100644 --- a/src/test/org/commoncrawl/util/TestWarcWriter.java +++ b/src/test/org/commoncrawl/util/TestWarcWriter.java @@ -20,6 +20,7 @@ import org.apache.nutch.net.protocols.HttpDateFormat; import org.apache.nutch.protocol.Content; import org.commoncrawl.util.test.SegmenterRecordReader; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import java.io.ByteArrayInputStream; @@ -84,6 +85,7 @@ public void testWriteRevisitRecordContentType() throws Exception { } @Test + @Disabled("This test is testing a behaviour we are not sure we will implement - fixing the issue downstream instead of upstream. ") public void testWriteResponseRecordWithMalformedURL() throws Exception { ByteArrayOutputStream bos = new ByteArrayOutputStream(); WarcWriter writer = new WarcWriter(bos); @@ -127,6 +129,7 @@ public void testWriteResponseRecordWithMalformedURL() throws Exception { } @Test + @Disabled("This test is testing a behaviour we are not sure we will implement - fixing the issue downstream instead of upstream. ") public void testWriteRequestRecordWithMalformedURL() throws Exception { ByteArrayOutputStream bos = new ByteArrayOutputStream(); WarcWriter writer = new WarcWriter(bos); @@ -163,6 +166,7 @@ public void testWriteRequestRecordWithMalformedURL() throws Exception { } @Test + @Disabled("This test is testing a behaviour we are not sure we will implement - fixing the issue downstream instead of upstream. ") public void testWriteMetadataRecordWithMalformedURL() throws Exception { ByteArrayOutputStream bos = new ByteArrayOutputStream(); WarcWriter writer = new WarcWriter(bos); From d706ed685aa84c94ab820adc46703466a155ceb5 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 22 May 2026 12:31:34 +0100 Subject: [PATCH 15/16] Revert "feat: store stringified URL" This reverts commit bbc1bb5624acb817178af1fc01eb8f8cf7fcbf60. --- src/java/org/apache/nutch/net/protocols/Response.java | 10 ---------- .../apache/nutch/protocol/okhttp/OkHttpResponse.java | 3 --- 2 files changed, 13 deletions(-) diff --git a/src/java/org/apache/nutch/net/protocols/Response.java b/src/java/org/apache/nutch/net/protocols/Response.java index 0e5cbd0b17..3fbe932667 100644 --- a/src/java/org/apache/nutch/net/protocols/Response.java +++ b/src/java/org/apache/nutch/net/protocols/Response.java @@ -41,16 +41,6 @@ public interface Response extends HttpHeaders { */ public static final String IP_ADDRESS = "_ip_"; - /** - * Key to hold the raw URL (it might not be fully compliant with encoding standards) - */ - public static final String RAW_URL = "_raw_url_"; - - /** - * Key to hold the stringified URL after passing through @see HttpUrl - */ - public static final String STRINGIFIED_URL = "_stringified_url_"; - /** * Key to hold the HTTP and SSL/TLS protocol versions if * store.protocol.versions is true. diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java index db8be9c29f..65343c8bff 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java @@ -112,9 +112,6 @@ public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum) Metadata responsemetadata = new CaseInsensitiveMetadata(); okhttp3.Headers httpHeaders = response.headers(); - HttpUrl requestURL = request.url(); - responsemetadata.add(RAW_URL, url.toString()); - responsemetadata.add(STRINGIFIED_URL, requestURL.toString()); for (int i = 0, size = httpHeaders.size(); i < size; i++) { String key = httpHeaders.name(i); String value = httpHeaders.value(i); From 31a232485bb635b556eb75389582739cca7ebdc8 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 22 May 2026 12:31:41 +0100 Subject: [PATCH 16/16] Revert "fix: missing import" This reverts commit f9fa3f17b9d7f22bb79c9afa9375633bfef8cf7d. --- .../java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java index 65343c8bff..9aa1526157 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java @@ -23,7 +23,6 @@ import java.util.Base64; import java.util.Locale; -import okhttp3.HttpUrl; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.metadata.CaseInsensitiveMetadata;