From 8d19dadd7e226942b665fdad689d53848c5a7b62 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 23 Feb 2026 17:30:11 +0100 Subject: [PATCH 1/8] fix: update revisit content-type, move metadata content type in a constant too #40 --- src/java/org/commoncrawl/util/WarcWriter.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/java/org/commoncrawl/util/WarcWriter.java b/src/java/org/commoncrawl/util/WarcWriter.java index 57f7825354..aa9b20ba3b 100644 --- a/src/java/org/commoncrawl/util/WarcWriter.java +++ b/src/java/org/commoncrawl/util/WarcWriter.java @@ -95,6 +95,9 @@ public class WarcWriter { protected static final String DETECTED_CHARSET = "Detected-Charset"; protected static final String DETECTED_LANGUAGE = "Detected-Language"; + public static final String CONTENT_TYPE_RESPONSE = "application/http; msgtype=response"; + public static final String CONTENT_TYPE_METADATA = "application/warc-fields"; + private SimpleDateFormat isoDate; public static class CompressedOutputStream extends GZIPOutputStream { @@ -196,7 +199,7 @@ public URI writeWarcinfoRecord(String filename, String hostname, byte[] ba = sb.toString().getBytes(StandardCharsets.UTF_8); URI recordId = getRecordId(); - writeRecord(WARC_INFO, date, "application/warc-fields", recordId, extra, + writeRecord(WARC_INFO, date, CONTENT_TYPE_METADATA, recordId, extra, new ByteArrayInputStream(ba), ba.length); return recordId; } @@ -263,8 +266,7 @@ public URI writeWarcResponseRecord(final URI targetUri, final String ip, extra.put(WARC_IDENTIFIED_PAYLOAD_TYPE, content.getContentType()); URI recordId = getRecordId(); - writeRecord(WARC_RESPONSE, date, "application/http; msgtype=response", - recordId, extra, block); + writeRecord(WARC_RESPONSE, date, CONTENT_TYPE_RESPONSE, recordId, extra, block); return recordId; } @@ -304,7 +306,7 @@ public URI writeWarcRevisitRecord(final URI targetUri, final String ip, } URI recordId = getRecordId(); - writeRecord(WARC_REVISIT, date, "message/http", recordId, extra, block); + writeRecord(WARC_REVISIT, date, CONTENT_TYPE_RESPONSE, recordId, extra, block); return recordId; } @@ -321,8 +323,7 @@ public URI writeWarcMetadataRecord(final URI targetUri, final Date date, } URI recordId = getRecordId(); - writeRecord(WARC_METADATA, date, "application/warc-fields", recordId, extra, - block); + writeRecord(WARC_METADATA, date, CONTENT_TYPE_METADATA, recordId, extra, block); return recordId; } From 5c005d14eaea0a79899af6417c4b70ec203404f6 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 24 Feb 2026 07:39:22 +0100 Subject: [PATCH 2/8] feat: add unit test to verify the correct content-type of a revisit record #40 --- .../org/commoncrawl/util/TestWarcWriter.java | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 src/test/org/commoncrawl/util/TestWarcWriter.java diff --git a/src/test/org/commoncrawl/util/TestWarcWriter.java b/src/test/org/commoncrawl/util/TestWarcWriter.java new file mode 100644 index 0000000000..90a504879e --- /dev/null +++ b/src/test/org/commoncrawl/util/TestWarcWriter.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.commoncrawl.util; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.zip.GZIPInputStream; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.Test; + +public class TestWarcWriter { + + @Test + public void testWriteRevisitRecordContentType() throws IOException, URISyntaxException { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + WarcWriter writer = new WarcWriter(bos); + + byte[] block = "HTTP/1.1 304\r\nContent-Type: text/html\r\n\r\n".getBytes(); + + Configuration conf = NutchConfiguration.create(); + Metadata metadata = new Metadata(); + metadata.add("Content-Type", "text/html"); + Content content = new Content("https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025", "https://de.wikipedia.org", + block, "text/html", metadata, conf); + + URI targetUri = new URI("https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"); + String ip = "208.80.154.224"; + int httpStatusCode = 304; + java.util.Date date = new java.util.Date(); + URI warcinfoId = writer.getRecordId(); + URI relatedId = writer.getRecordId(); + String warcProfile = WarcWriter.PROFILE_REVISIT_IDENTICAL_DIGEST; + java.util.Date refersToDate = new java.util.Date(System.currentTimeMillis() - 3600000); + String payloadDigest = "sha1:abc123"; + String blockDigest = "sha1:def456"; + + writer.writeWarcRevisitRecord(targetUri, ip, httpStatusCode, date, + warcinfoId, relatedId, warcProfile, refersToDate, payloadDigest, + blockDigest, null, null, block, content); + + byte[] compressed = bos.toByteArray(); + ByteArrayInputStream bis = new ByteArrayInputStream(compressed); + GZIPInputStream gis = new GZIPInputStream(bis); + ByteArrayOutputStream decompressed = new ByteArrayOutputStream(); + gis.transferTo(decompressed); + + String warcOutput = decompressed.toString(); + System.out.println(warcOutput); + + assertTrue(warcOutput.contains("WARC-Type: revisit"), + "WARC record should have WARC-Type: revisit"); + assertTrue(warcOutput.contains("Content-Type: application/http; msgtype=response"), + "WARC revisit record should have Content-Type: application/http; msgtype=response"); + assertTrue(warcOutput.contains("WARC-Refers-To-Target-URI: http://example.com/page"), + "WARC record should have WARC-Refers-To-Target-URI header"); + assertTrue(warcOutput.contains("WARC-Profile: " + warcProfile), + "WARC record should have WARC-Profile header"); + } +} From ee3da841c2c999fd6da8e8afafc69b68827ddc61 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 24 Feb 2026 08:34:22 +0100 Subject: [PATCH 3/8] chore: add corrected assert --- src/test/org/commoncrawl/util/TestWarcWriter.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/test/org/commoncrawl/util/TestWarcWriter.java b/src/test/org/commoncrawl/util/TestWarcWriter.java index 90a504879e..28bbb59cd5 100644 --- a/src/test/org/commoncrawl/util/TestWarcWriter.java +++ b/src/test/org/commoncrawl/util/TestWarcWriter.java @@ -74,6 +74,12 @@ public void testWriteRevisitRecordContentType() throws IOException, URISyntaxExc "WARC record should have WARC-Type: revisit"); assertTrue(warcOutput.contains("Content-Type: application/http; msgtype=response"), "WARC revisit record should have Content-Type: application/http; msgtype=response"); + // This line is the correct assert, is left for testing whether this test is running automatically. + // Uncomment when those tests are running + // assertTrue(warcOutput.contains("WARC-Refers-To-Target-URI: https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"), + // "WARC record should have WARC-Refers-To-Target-URI header"); + + //This line will fail. Remove after the test work assertTrue(warcOutput.contains("WARC-Refers-To-Target-URI: http://example.com/page"), "WARC record should have WARC-Refers-To-Target-URI header"); assertTrue(warcOutput.contains("WARC-Profile: " + warcProfile), From 409c52a3bd91cddff0b8d052586d3e11c836e837 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 24 Feb 2026 21:26:34 +0100 Subject: [PATCH 4/8] fix: revisit records do not have a payload, use more realistic data --- src/test/org/commoncrawl/util/TestWarcWriter.java | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/test/org/commoncrawl/util/TestWarcWriter.java b/src/test/org/commoncrawl/util/TestWarcWriter.java index 28bbb59cd5..3237d0c797 100644 --- a/src/test/org/commoncrawl/util/TestWarcWriter.java +++ b/src/test/org/commoncrawl/util/TestWarcWriter.java @@ -38,7 +38,7 @@ public void testWriteRevisitRecordContentType() throws IOException, URISyntaxExc ByteArrayOutputStream bos = new ByteArrayOutputStream(); WarcWriter writer = new WarcWriter(bos); - byte[] block = "HTTP/1.1 304\r\nContent-Type: text/html\r\n\r\n".getBytes(); + byte[] block = "HTTP/1.1 304\r\ndate: Fri, 06 Feb 2026 10:55:35 GMT\r\n\r\n".getBytes(); Configuration conf = NutchConfiguration.create(); Metadata metadata = new Metadata(); @@ -74,11 +74,8 @@ public void testWriteRevisitRecordContentType() throws IOException, URISyntaxExc "WARC record should have WARC-Type: revisit"); assertTrue(warcOutput.contains("Content-Type: application/http; msgtype=response"), "WARC revisit record should have Content-Type: application/http; msgtype=response"); - // This line is the correct assert, is left for testing whether this test is running automatically. - // Uncomment when those tests are running - // assertTrue(warcOutput.contains("WARC-Refers-To-Target-URI: https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"), - // "WARC record should have WARC-Refers-To-Target-URI header"); - + assertTrue(warcOutput.contains("WARC-Refers-To-Target-URI: https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"), + "WARC record should have WARC-Refers-To-Target-URI header"); //This line will fail. Remove after the test work assertTrue(warcOutput.contains("WARC-Refers-To-Target-URI: http://example.com/page"), "WARC record should have WARC-Refers-To-Target-URI header"); From b0110c9415d574596b1e6e139dbc3774770fd7ef Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 24 Feb 2026 23:04:54 +0100 Subject: [PATCH 5/8] fix: make the test working again --- src/test/org/commoncrawl/util/TestWarcWriter.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/test/org/commoncrawl/util/TestWarcWriter.java b/src/test/org/commoncrawl/util/TestWarcWriter.java index 3237d0c797..70f4ea240f 100644 --- a/src/test/org/commoncrawl/util/TestWarcWriter.java +++ b/src/test/org/commoncrawl/util/TestWarcWriter.java @@ -76,9 +76,6 @@ public void testWriteRevisitRecordContentType() throws IOException, URISyntaxExc "WARC revisit record should have Content-Type: application/http; msgtype=response"); assertTrue(warcOutput.contains("WARC-Refers-To-Target-URI: https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"), "WARC record should have WARC-Refers-To-Target-URI header"); - //This line will fail. Remove after the test work - assertTrue(warcOutput.contains("WARC-Refers-To-Target-URI: http://example.com/page"), - "WARC record should have WARC-Refers-To-Target-URI header"); assertTrue(warcOutput.contains("WARC-Profile: " + warcProfile), "WARC record should have WARC-Profile header"); } From 838d197c6f198750cda77210659ae200d6b20edf Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 24 Mar 2026 15:14:17 +0100 Subject: [PATCH 6/8] fix: set empty content of the 304 response --- .../org/commoncrawl/util/TestWarcWriter.java | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/test/org/commoncrawl/util/TestWarcWriter.java b/src/test/org/commoncrawl/util/TestWarcWriter.java index 70f4ea240f..57c346d31e 100644 --- a/src/test/org/commoncrawl/util/TestWarcWriter.java +++ b/src/test/org/commoncrawl/util/TestWarcWriter.java @@ -44,11 +44,20 @@ public void testWriteRevisitRecordContentType() throws IOException, URISyntaxExc Metadata metadata = new Metadata(); metadata.add("Content-Type", "text/html"); Content content = new Content("https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025", "https://de.wikipedia.org", - block, "text/html", metadata, conf); + new byte[]{}, "text/html", metadata, conf); + + + + URL resource = getClass().getResource("/test-segments/20260224170658-revisit"); + assertNotNull(resource, "Missing test resource"); + String segmentPath = Paths.get(resource.toURI()).toAbsolutePath().toString(); + String url = "https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"; + + Content content = SegmenterRecordReader.retrieveContent(segmentPath, url); + String targetUri = content.getUrl(); + + Metadata metadataFromContent = content.getMetadata(); - URI targetUri = new URI("https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"); - String ip = "208.80.154.224"; - int httpStatusCode = 304; java.util.Date date = new java.util.Date(); URI warcinfoId = writer.getRecordId(); URI relatedId = writer.getRecordId(); @@ -68,7 +77,6 @@ public void testWriteRevisitRecordContentType() throws IOException, URISyntaxExc gis.transferTo(decompressed); String warcOutput = decompressed.toString(); - System.out.println(warcOutput); assertTrue(warcOutput.contains("WARC-Type: revisit"), "WARC record should have WARC-Type: revisit"); From ca5c8e895e912697996e570db89713f0ad91ef46 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 24 Mar 2026 15:35:37 +0100 Subject: [PATCH 7/8] fix: deserialize real data from the segments to partially populate the test --- .../org/commoncrawl/util/TestWarcWriter.java | 42 ++++++-------- .../util/test/SegmenterRecordReader.java | 52 ++++++++++++++++++ .../content/part-r-00000/.data.crc | Bin 0 -> 72 bytes .../content/part-r-00000/.index.crc | Bin 0 -> 12 bytes .../content/part-r-00000/data | Bin 0 -> 7964 bytes .../content/part-r-00000/index | Bin 0 -> 228 bytes 6 files changed, 70 insertions(+), 24 deletions(-) create mode 100644 src/test/org/commoncrawl/util/test/SegmenterRecordReader.java create mode 100644 src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/.data.crc create mode 100644 src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/.index.crc create mode 100644 src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/data create mode 100644 src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/index diff --git a/src/test/org/commoncrawl/util/TestWarcWriter.java b/src/test/org/commoncrawl/util/TestWarcWriter.java index 57c346d31e..72eb88f18c 100644 --- a/src/test/org/commoncrawl/util/TestWarcWriter.java +++ b/src/test/org/commoncrawl/util/TestWarcWriter.java @@ -16,59 +16,53 @@ */ package org.commoncrawl.util; -import static org.junit.jupiter.api.Assertions.assertTrue; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.protocol.Content; +import org.commoncrawl.util.test.SegmenterRecordReader; +import org.junit.jupiter.api.Test; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; -import java.io.IOException; import java.net.URI; -import java.net.URISyntaxException; +import java.net.URL; +import java.nio.file.Paths; +import java.util.Date; import java.util.zip.GZIPInputStream; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.jupiter.api.Test; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; public class TestWarcWriter { @Test - public void testWriteRevisitRecordContentType() throws IOException, URISyntaxException { + public void testWriteRevisitRecordContentType() throws Exception { ByteArrayOutputStream bos = new ByteArrayOutputStream(); WarcWriter writer = new WarcWriter(bos); - byte[] block = "HTTP/1.1 304\r\ndate: Fri, 06 Feb 2026 10:55:35 GMT\r\n\r\n".getBytes(); - - Configuration conf = NutchConfiguration.create(); - Metadata metadata = new Metadata(); - metadata.add("Content-Type", "text/html"); - Content content = new Content("https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025", "https://de.wikipedia.org", - new byte[]{}, "text/html", metadata, conf); - - - URL resource = getClass().getResource("/test-segments/20260224170658-revisit"); assertNotNull(resource, "Missing test resource"); String segmentPath = Paths.get(resource.toURI()).toAbsolutePath().toString(); String url = "https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"; Content content = SegmenterRecordReader.retrieveContent(segmentPath, url); - String targetUri = content.getUrl(); + URI targetUri = new URI(content.getUrl()); - Metadata metadataFromContent = content.getMetadata(); + Metadata metadata = content.getMetadata(); + String ip = content.getMetadata().get("_ip_"); + int httpStatusCode = 304; - java.util.Date date = new java.util.Date(); + Date date = HttpDateFormat.toDate(metadata.get("date")); URI warcinfoId = writer.getRecordId(); URI relatedId = writer.getRecordId(); String warcProfile = WarcWriter.PROFILE_REVISIT_IDENTICAL_DIGEST; - java.util.Date refersToDate = new java.util.Date(System.currentTimeMillis() - 3600000); + Date refersToDate = new Date(System.currentTimeMillis() - 3600000); String payloadDigest = "sha1:abc123"; String blockDigest = "sha1:def456"; writer.writeWarcRevisitRecord(targetUri, ip, httpStatusCode, date, warcinfoId, relatedId, warcProfile, refersToDate, payloadDigest, - blockDigest, null, null, block, content); + blockDigest, null, null, content.getContent(), content); byte[] compressed = bos.toByteArray(); ByteArrayInputStream bis = new ByteArrayInputStream(compressed); diff --git a/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java b/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java new file mode 100644 index 0000000000..62057f4e17 --- /dev/null +++ b/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java @@ -0,0 +1,52 @@ +package org.commoncrawl.util.test; + +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.MapFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; + +import java.util.Arrays; + +public class SegmenterRecordReader extends Configured implements Tool { + + private Content content; + + @Override + public int run(String[] args) throws Exception { + return run(args[0], args[1]); + } + + private int run(String path, String url) throws Exception { + Path p = new Path(path, Content.DIR_NAME); + Text k = new Text(url); + MapFile.Reader[] readers = MapFileOutputFormat.getReaders(p, getConf()); + Content c = new Content(); + readers[0].get(k, c); + assert (c.getUrl().equals(url)); + assert (c.getContent() == null || c.getContent().length == 0); + this.content = c; + + return 0; + } + + public static Content retrieveContent(String segmentPath, String url) throws Exception { + SegmenterRecordReader reader = new SegmenterRecordReader(); + ToolRunner.run(NutchConfiguration.create(), + reader, Arrays.asList(segmentPath, url).toArray(new String[0])); + + return reader.getContent(); + } + + public Content getContent() { + return content; + } + + public void setContent(Content content) { + this.content = content; + } +} diff --git a/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/.data.crc b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/.data.crc new file mode 100644 index 0000000000000000000000000000000000000000..4685c926b86720fdf22638aca93068a962c4a60c GIT binary patch literal 72 zcmV-O0Jr~Ra$^7h00IF1TcUSFct5tJtd~Z36ze?C-v~WZoIcWg-KP>c9e^HuASgDn ee>*8j0GAI3nMRDXx!cI+L*bonB~?1!t(Qv>&>n37 literal 0 HcmV?d00001 diff --git a/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/.index.crc b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/.index.crc new file mode 100644 index 0000000000000000000000000000000000000000..9f5864f594e6cfb3a174c53b57dbd27cd12de2d4 GIT binary patch literal 12 TcmYc;N@ieSU}6Z4)>8lg5Lp5@ literal 0 HcmV?d00001 diff --git a/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/data b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/data new file mode 100644 index 0000000000000000000000000000000000000000..526d1fd8717c1e6dfe462d60cbbea558bfadd7a9 GIT binary patch literal 7964 zcmZ{oRa6uV*sbYKX{5Uc25FE+N*d`Nx`rNLXarHZq=$|{I-~@oyFF(`r*kG&#Pz>+{@=BYyDQk!*2{}e!Pd^o$HiOT9b{{R^k0S4v+bK> zjrvkJN6r~%{P~51M1l5Sn}ylI+Z*g94gi2``TQK69l^FBM=QSnegiz+t=+x7_`D(B zkem}q=LF_K4&%dz}MP*$6(Eqau?tTtawhtQ7)Pe^@xqA*R zNr?F-ro7w!xL8>UuY?D-J?8qY``fqgKUSC7H@K!#oxk@p%ASo)T>C-FyA??FI<|qf zmuX*Z@o+}}?47M{o)n*mrj6{yPdQfb=CD!agXdo3uQ^|*Y#Ni;fLPHUdvUP?fnluv z*DtJ4Ch@)!vpwQ1qOL)yZ4cN92lWdUsRN|Pu5Zu$+YN%#nPhU;l?|EuoLRK%L(mLa zALUemER<0>Iv_a8^enLei;tVjRCTYl=K?Z{!gg-{#wly;3JLju#K?AK52=I=$vQl+|O6fCYFmaOpodn*abc|&5Oq8C6hf*VEE_^(%bm5FX zLsCvg*7+$szTwT9nb)nDP@!R-&1E*mVJZ;mYRQbBO#iYFV;gxpQ1dWNn{ZfW=;aO9 zqLj+pS*BvUEO)go_O`}Zt68FzzcMpnpG);I6h9nzX;FA7Oq~<}6~5v7DdXZvBRm>D zU>tkCCBrfgeXlen8>+EsI8rSojJA_PfNCVWLRa)AF=KX@;0h?nwB~y8<3%u{~uod_zJ-5Ki=hmzZMFDxX~18I)*ln4tM zl}{e?23REUxh;b_qhU=}t6^{~I7;Gu3c;1s0Lz1_Iw55f`IFs%NY-M)0#^lgPBy`b z?@p+>RaGgV^{5(IoQN+0wRHsA@U!jSx6CH2Vt6{!8Jp?@BGPD#3|M-_{KbaTS~2+M z;o{IYv6MdIWqUZL*iL!?Wi_>jgjU+?b|P*p4BVacV}286r9s4um&SYAhrJSxMZW{+ z9XOF#w2s2PYiD)HUfrIM5|NU%^cM4+%IUDb2Px&v_V4^r zXb){qSSX0dT`CwS$+VFwtXPhbknNtt@>Rdsu156{9HPc$^f;cC-^F4t z-f&V{B@fnq1(%P@tPnNyKrw*eGG?IEMzbB|hPWG#U9&i4_YG=A_z8jIhv5ArzMyVi z+#i2GI>jO11~TA!Fd})8%+4E2O*`A7S%14fpRB+4#1z`9BVYG_2%<&DvsrmG6W zLYAY&Tu?a+us+cchudOqC%`3miTQihCx$gOC%wu$8EyAff@9XAfCR8UNL7!{VHnqR z>`gvlEQs{TIFYX#uI6}hsiz4b!NMf%i$sKzaiL*QWt5lZD!;Rq6~&g+mx#2^i)AL3 z4cVS!I>y5!8(_i%-0U0?Js7WAms4Le@0Ku2A;r{^z5ByNFMq^A3pdtPbWY$T&i@xW z^7}%NQ8z)+G9QXZkqD0$N|B43{xmXhj@gfA_`Uom8W$4SKelgypDS<$MPMoOo2`am zsyQxi*tja8R=c7e`;*6)XydFy?sBB|!2NRn!iQCIyU>XEsE3FLGpejYLQutM14Ep9 z&RzomiRDf1zdr7YfOn~wSlJ$dr?`3*Rw0F>2_KOQN=fJL)+E1iT0-Nxjc7ED90p%k z=5h%bBaW{(68~Ip1gWfLQiu)?DRIqx7m_rb->h)GC6B+7_{D;~N=S&AZyFIACoiwc0YTnM6Bbv44=K!UJB303thJ0w1i zp1D(ARgt_QgvbnLzqiezU|a!1Ow?IukCuHen(azXb872T?L4uK=hC@ch+$xP8IV{b{ zKGDOqqlB86qY~3Gq{`{dPiZvzIT9?F$Xnp*V3c-rVjj&lCv*%oe04voI7}fn@-Zro zd`h%lRa=pXgPxutL8LWG^byKH*^uNy1VDZ_FH)8X@=R&thjJLR>5~ziHCAsqksPY% zS&U83h63Qs{^)Uyx<$Zx_7BRObrevCmmGRDxP=zigb#b%q;vjJi&PaAQ1>a$Dy$3X zf{uY&6uxc5Bgg!0&D2)Fz1;R=Bw@6_^E*QW)cAoLjQl9gs+WOCvTG(E(qD#O@FV>S z_22AzwLl^=K~*GXNY3dbV-2;n8r`;11Ev=I0Ekp)@qpP{3N`_}TC03O^pitDv4<U2N~$_YmJxmE?4jT*yO7-kzrx=@Dg&d&bc z)VI`~Tq$fwc{N&3ENL1kt2X;5;BeG#k_cCV20Ts26x$1_U)%DY^rK^AWNIVt0*5;% zhUdw-z#`P`C%H(kC^^yh%(GeD!?$$O_~a1}YFn$ehz}_lfm^TS>{)1ky2K43@iVA-CRn+0)cq*N#HwswNPO%k*f{cx<z{W<@-WmrLT0~*idh(Shg?4tiO>aMz69POD$k#?UJ&gGFI2%# zYei>N=aA>=pIb4!8~k)Zw81Y;D6h-bW&7C3N8ggO4xz~# zz)b_XAl%osq!U=^YX7lCMtS3cVM#%bfF{`ZqJeVPpLtG>yw!24Y!A1KVH@z!Z?Ltm zd-WBUq&-nHvZip)g3bNz9=95(TPrD8QR0?%a>;*ToEBqOU}sR+DqYRUwX5TTE$dMk zGhG@L1A`W@q+Ouf0@0rNf*H4ph9%UG)algT6F-4s(%!CyspMWO@Z2bz2dw28S~!5g3>y*lyz5(RH;x_Xd`jD)4?l2ww4cGtzEKq2k%8LyHsH+` z;-2D!k6W!;X~3P-Z(8rFLA1QPeSmm2#k>=PmMw+h17It*@myCNvFx)Mkru+Qg;(wu zH)!>-8y`R#GL5d}(4v}%C9v+M`lk+_5td!imTb%;y9mP=ulqz0iT&E=6N28i1<5tS ze>HR(wPa5fKv^%sYRRCF=Q?oWT`Mn17&&kfGP8KDb4T{%>XS3%C3O~%mtmZrnGbj< zkCLU=8P(=o6AhnAaA%-;N3hNQMN?UpKE|v!UZyZYXTLnFdT(al^PxPWntZk?;JU~o zKbUkT4*rldSd;U%)&lU_J{FQKDwk=*`!W6NXY?3qAfkblKB)J0w39NL*Y)P+ZytDi zrgv-*NCl%8{yaaBBT^>)TctdR#K7- zMvj=eU*>49%0sT`mt5=cUT!v(gI*d0HY~345ZkK?+SXPXY%niamWO}yBJ=!Suj?)2 z(;;godx@k3)s+T28#gJAfGz6wMRTGO`zj-2ru}ra{9uR(i>n~jTdSKgrLCuE5|_JQ z)D5ROdD4+F^T{y`?>>(Hk;k{n(?8WhIID{#==(*UTB29yo%avSUn&d}``aiN*&z>AKQ|zE3$U)2JNmBd z=Nj14ipj=Z5x(l+s@_S{w-Upu*nS$lo*O4&uEFl50{pM_POBo7jxX-PyMW+-5g{Q* z7BA=6tTGwK$bdE_l=haXX(c>X~@$o{qr%V zMevhI$kT$w^YG4D2)9NmPE>Z(Hm2vhLpZ#;06l)LM~}A#Bre0KMc;0y#Jr%dfG4D< z>z;pH&E>OzR9ZOocB+J5Z|jyNW)KHF8ZOeLGGv7cP`xi%-9FY(h8aK9ffDNtSI)s? zprw~@L|VMoV-Xq)Rf*rJqwb*|_v~Jnw<~d917~FL%qhJD;mh z$1xz40Zp8T)SV>U&5;OzCL!E51`9&^ZXVCi?Fh7cur$B&t3k}Cnz5~7LmnL)96 zm_1l+R;eswSuS@fag4l-M{@K~CnX7y6TPonp|Jv&Z-y|rVJ3)dQGM?eXT{FYFbo0> zU5jO$^ApkW_2vC;LBL7G`ikTMps){#GAgkOCc!xV~ltTpxjWOpf~)h4EL>*2d<@>zJ6;|^cD|K`@{cIE(e(HT3r96A@_c4^bk_U z5?KPb(+RKg(^wR`X!xMePOlLVvicMK%0WIcayfAtjcV`sFv^hahSL>P)7jCT0FQY_J8;$G z-cV;{V!y8?2>|)G()u2_t!Qm?I_hd*1 z2B~;v?7w$CU&MX)qAUf=42A}CWs=(=daE{xRXO`Mj(ie4sN!m|Bxi0{A6ynZmBB3R z9)x*!p;Od{xb;PuO3`89*&O+D)ku$ENmTdERcUg%UL9bp7(r251jw|s?%Z$@p@GX& zwo2!!^3=ms46i))FuQT1d3Y#!pK)_NVf?w$F-xa7GP-uYVhr87$ma97rO^~su9E2z z9BhFvzE8;e%`bDhPniUh5~m&G&uS{XHTm!rCTwEvD!Js?fFE@Y*z|9t3{r*NW(rx> zC;XSJUys=9MSGVnmoi~evpq{Qf|HV0ZzrwsqUq1>$5W8@j6c6=9WgFWpd4|R@vfYLSL42xuWk*dwqON!hQc{pq$m*r!^pOFyq?)yc9SI zqyW+d@6BYm$)zEsiqV@w>jVD?Q@!rjOzW<#djImp`u5_Uqfnu)y!Z4ha-2)ZQtH$a zs-|6imjzhB-CeU7JbUis|7=^1V=BrBfgAimj-7>;-c=G|jGLE6h}3}2(=lB(Mhv*_ zjVnHJkba%NOjne$Xplt^w4}`Ra{u@x@2TiHq&_E~`4=akwRGR9JUVp{Iv(jL^1=G- zag1A}#z`bm!E9}^(SpMMwy1E3*U9pHgx^Y%Fm0p*YoOxS=0ATw(4x;|j z8$SMYZOmpduc;QuVy(v11LAz$_|*^X0An|xwuc!go$Lc!O@mpy1r^-n())eBKAZ60 zthB34HU@k@q44u+1q2Zaffu)KZ#AV)hDdfg*RI9=EMAT)b17O!k6@ktW$_`^dCi)A zMg=bw&VJ*%x4~zE?RJjJxs{7f?9CRHs{!}2!Jnf(W!QfzPFhaYRmb@7nIl)xDX^xO z$2)i9G00N|p6!QcVLCHh66Pv1TSDtOTxyTe#|~-D=GONTgUlDKqC{b!@%cdMmt_=f z9WGF8uUgXqQ2sVMnXBk(&)o$0^s(_&tCaP>b5+z3>2nF=n^rBnaSE*4l2^3fZ8~VD z#s&FqUg*){H0HqswJ45^6C5&fxF>?+Z_FfF1t^#`uZ(=Z;&KCISnTK4=}E`cIk1p+ z%+NIuq!=tTjM7o8ogd?5E}wZq?aE{u1oM}m7^L7B=+@O=ToWsn8R!$`a{sBB_D}xf zCab*Kkcz>eDrQ;jso#0$Pl%=ia<8E@;HZ$>axpbklF+a_rB7a(guf|8!wqb@d>wazR@G&-iLu3y{ zlkW-rFmvYJIVQPBtvYYYfdvK@XebT&%b|8Vc#wjXc0pM8VC8!$}DVlGqemkww)$ z+7q({w+)l-B|VabB)dy3p+AdGAp*TNq4sxseiM5e%M0s4Sb(Nn?j`O+10Mw}x$;?j z8GCW7c;BC`y)@`rbf~ewbPZ$Avd7SNlM0I8uHL{d(ZZDPlwM73{hCXk zoVaholx>`}qN<|?^ZkbY`ZcF+;M9=Lbhl_7Gao-1%MW>OG9`&`m<;U(k5jHyeLoO* z8lWg`0(8FWlpA45i`A#e?a!&WOsuPpL)tZjeQ&DtM2EbA!#5@sy96aEL)k=@$@4yM zO1z=8{%v$-v2|%#!)*{+@F2$ zynPo~^%Arr#Eo2M=Z-C>hattWsg#Obxhz<^6=`8I>@kJipPX{Ay)Q#Jaf@XF=8OCU zw%k7I2W}Rh=~PgR?(XZ}asue)kHj;17wmTfoyfL%me_eIDAMjmwZ%28)BAI(H)d5T zNC%cxs*o|3@~65bug%XP>g1EwKOX6aKJMfpAz?=U$HIl){eM~b|B!Kj!T-hM;{T)m z1MwC>exT5QaNO|OZFPV+fuuDoV+vn~LHWg@u7&*5(fZ{PgnoM~k*l8NZ5<7}U;HX5 zZ{3GeT7x|gPEUV7&n*Mx^`e>?%tP4eaq=zIlFAN?j0tvlUvBPF?(_KZ*l`EsV*B%3 z?4-Rs$EdlCx-y;;MrMr3fl``Gd)#F9E&~5;iS*egSvmLa$?AMBD<$hUy_+HujKrkx z-JSxpro=r>z15?!ASOd@0o5l$S%|;h=ihSdbhA_jmO)YW-VRf^N^&-GhT$e|WlL_K zWr;SgAmXr7broMu|8`7S8QrjiCRG}5vulURGR>ek_-{w1$uFKh2T)Oi=(X+exGQ-u18ni5>&C`EacZtK5 zH99WLb-pkFmGoZ7TncMvPkdL_{cdRo9gS(F*jl?uru_RH%4<(ZkpC=myHv&RH+DpE z*Ufw4jB@jb-tVO}%-TH3@G<@#y~^9MPVCjGe*E&K+}`&r>_l2^d(=%Mf~jb&$bSTw zbvqIyjz|Ti;nPmmZyKEHy70zoTEAzx&_CMHCqHx*Um#q$dBNVjrN23)^M~d5!%(`z zStccax;8GacUt*%RIDh@Hd=yU{jmS#Ut(a&JI^e9XMc}C4S)&QR$L2{3l zckA+bQ>UDnYJP`8#_2cX(kv(mB)-$xA@_d;H%?(=g`IyiF{bygqAj3e63^Q*FEpCH zZ0{wW4s+rNUnGedzFl~1Q03fC(nEce&y;+<|8}4ie9y4gEkSp&e`neSb}i{*eV z_Ax_YcNG6Q-Ef4bhhi+TjC2qx86{DDp2H)*G$(vOIyWjkp{O)mEaHr&RDs2xl1im= zK=OVZn71|&@kQf{1?Y!8N@eSWlUr?8GxB~-!G4J1R_&MJUQ zX3_K~?e4!zLSd`YvFg@j9T=d(`pyp#88v#SrWT6hZmUB$^z^(N!RAS4&N2!PwRuW7lB0m?VbZ^2?Q5#9bh{Iaf-uLqJnmWRh`s(g-3UUd9Kq}sTi?tfFdOps})*Nt=SMbE3cWbW-R|U>T>jZ zyeHw&CyOJhw0ezxYZjh97}P@CeR)!*Gun`4uB-1h%1V(ycfR<8Z-ckZmv97=(6#H2 zH@=KYdHAwCOebqQg3?rZ(fj0tixa;5gGY>rrC;1QD7)JHmo~$2LWOFMNc8vf>(+w= zu858$Lx_#1}d3S#Xy;SuSj0s9sutXjREN!7z&|H4VCr7(P|crqlf*ED6#+S};P zwe;~2%f7#vk!PBA8%E{1(+yZc2+j7Nk2*qfZ)>hznhaiI{Vf)S+sxsVoN-d?N`F`Sh>N2TS*+rinY2oh0 z73rh|;7a)XVbaCS#MfBzx@*pMS|-q^5LOE-sS(nZy4Lm4v1lEgoc*M4-r*1)=+J;l zuxS5_bGKmeQ28LsWj_!h6mXv%`s_fvHEwJ1Q|igJ^2M+5YYF!c;|RKk_d&AvjgU$w PzSAM_gZo#X7sCGoAahDR literal 0 HcmV?d00001 diff --git a/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/index b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/index new file mode 100644 index 0000000000000000000000000000000000000000..6aca7f950611c2d5bbe49668117243ecf178d904 GIT binary patch literal 228 zcmWG`4P=wdFG|--EJ#ewNY%?oOv%qL(96u%3rVdgQN$|alb@F!UX)pqn3R*s$f$)? zO>%y2K~ZXPv7Sq6T4HHViF1BRYBB>5JQFdlJXbpX_<`eE_muSx>;4AQ`c Zw#Ps#I2ky3D&{07Ff_X{Fmy6B0RWDmR=NNH literal 0 HcmV?d00001 From f1c35e3575aed3c6cd8d8beaf193ce4c8fac79c7 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 24 Mar 2026 16:01:43 +0100 Subject: [PATCH 8/8] fix: tests to run with ant --- src/test/org/commoncrawl/util/TestWarcWriter.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/test/org/commoncrawl/util/TestWarcWriter.java b/src/test/org/commoncrawl/util/TestWarcWriter.java index 72eb88f18c..4f7344010d 100644 --- a/src/test/org/commoncrawl/util/TestWarcWriter.java +++ b/src/test/org/commoncrawl/util/TestWarcWriter.java @@ -24,9 +24,8 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; +import java.io.File; import java.net.URI; -import java.net.URL; -import java.nio.file.Paths; import java.util.Date; import java.util.zip.GZIPInputStream; @@ -40,9 +39,9 @@ public void testWriteRevisitRecordContentType() throws Exception { ByteArrayOutputStream bos = new ByteArrayOutputStream(); WarcWriter writer = new WarcWriter(bos); - URL resource = getClass().getResource("/test-segments/20260224170658-revisit"); - assertNotNull(resource, "Missing test resource"); - String segmentPath = Paths.get(resource.toURI()).toAbsolutePath().toString(); + File segmentDir = new File(System.getProperty("test.build.data", "."), "test-segments/20260224170658-revisit"); + assertNotNull(segmentDir, "Missing segment resource"); + String segmentPath = segmentDir.getAbsolutePath(); String url = "https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"; Content content = SegmenterRecordReader.retrieveContent(segmentPath, url);