diff --git a/src/java/org/commoncrawl/util/WarcWriter.java b/src/java/org/commoncrawl/util/WarcWriter.java index 57f7825354..aa9b20ba3b 100644 --- a/src/java/org/commoncrawl/util/WarcWriter.java +++ b/src/java/org/commoncrawl/util/WarcWriter.java @@ -95,6 +95,9 @@ public class WarcWriter { protected static final String DETECTED_CHARSET = "Detected-Charset"; protected static final String DETECTED_LANGUAGE = "Detected-Language"; + public static final String CONTENT_TYPE_RESPONSE = "application/http; msgtype=response"; + public static final String CONTENT_TYPE_METADATA = "application/warc-fields"; + private SimpleDateFormat isoDate; public static class CompressedOutputStream extends GZIPOutputStream { @@ -196,7 +199,7 @@ public URI writeWarcinfoRecord(String filename, String hostname, byte[] ba = sb.toString().getBytes(StandardCharsets.UTF_8); URI recordId = getRecordId(); - writeRecord(WARC_INFO, date, "application/warc-fields", recordId, extra, + writeRecord(WARC_INFO, date, CONTENT_TYPE_METADATA, recordId, extra, new ByteArrayInputStream(ba), ba.length); return recordId; } @@ -263,8 +266,7 @@ public URI writeWarcResponseRecord(final URI targetUri, final String ip, extra.put(WARC_IDENTIFIED_PAYLOAD_TYPE, content.getContentType()); URI recordId = getRecordId(); - writeRecord(WARC_RESPONSE, date, "application/http; msgtype=response", - recordId, extra, block); + writeRecord(WARC_RESPONSE, date, CONTENT_TYPE_RESPONSE, recordId, extra, block); return recordId; } @@ -304,7 +306,7 @@ public URI writeWarcRevisitRecord(final URI targetUri, final String ip, } URI recordId = getRecordId(); - writeRecord(WARC_REVISIT, date, "message/http", recordId, extra, block); + writeRecord(WARC_REVISIT, date, CONTENT_TYPE_RESPONSE, recordId, extra, block); return recordId; } @@ -321,8 +323,7 @@ public URI writeWarcMetadataRecord(final URI targetUri, final Date date, } URI recordId = getRecordId(); - writeRecord(WARC_METADATA, date, "application/warc-fields", recordId, extra, - block); + 
writeRecord(WARC_METADATA, date, CONTENT_TYPE_METADATA, recordId, extra, block); return recordId; }
diff --git a/src/test/org/commoncrawl/util/TestWarcWriter.java b/src/test/org/commoncrawl/util/TestWarcWriter.java
new file mode 100644
index 0000000000..4f7344010d
--- /dev/null
+++ b/src/test/org/commoncrawl/util/TestWarcWriter.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.commoncrawl.util;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.protocol.Content;
+import org.commoncrawl.util.test.SegmenterRecordReader;
+import org.junit.jupiter.api.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.net.URI;
+import java.nio.charset.StandardCharsets;
+import java.util.Date;
+import java.util.zip.GZIPInputStream;
+
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Tests for the WARC records written by {@link WarcWriter}.
+ */
+public class TestWarcWriter {
+
+  /**
+   * Writes a revisit record for a page read from the revisit test segment
+   * and checks the WARC headers of the written record, in particular that
+   * the Content-Type is "application/http; msgtype=response".
+   */
+  @Test
+  public void testWriteRevisitRecordContentType() throws Exception {
+    ByteArrayOutputStream bos = new ByteArrayOutputStream();
+    WarcWriter writer = new WarcWriter(bos);
+
+    File segmentDir = new File(System.getProperty("test.build.data", "."),
+        "test-segments/20260224170658-revisit");
+    // new File(...) is never null, so check that the fixture really exists
+    assertTrue(segmentDir.isDirectory(),
+        "Missing segment resource: " + segmentDir.getAbsolutePath());
+    String segmentPath = segmentDir.getAbsolutePath();
+    String url = "https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025";
+
+    Content content = SegmenterRecordReader.retrieveContent(segmentPath, url);
+    assertNotNull(content, "No content found in segment for " + url);
+    URI targetUri = new URI(content.getUrl());
+
+    Metadata metadata = content.getMetadata();
+    String ip = metadata.get("_ip_");
+    int httpStatusCode = 304;
+
+    Date date = HttpDateFormat.toDate(metadata.get("date"));
+    URI warcinfoId = writer.getRecordId();
+    URI relatedId = writer.getRecordId();
+    String warcProfile = WarcWriter.PROFILE_REVISIT_IDENTICAL_DIGEST;
+    Date refersToDate = new Date(System.currentTimeMillis() - 3600000);
+    String payloadDigest = "sha1:abc123";
+    String blockDigest = "sha1:def456";
+
+    writer.writeWarcRevisitRecord(targetUri, ip, httpStatusCode, date,
+        warcinfoId, relatedId, warcProfile, refersToDate, payloadDigest,
+        blockDigest, null, null, content.getContent(), content);
+
+    // decode as UTF-8 explicitly, not with the platform default charset
+    String warcOutput;
+    try (GZIPInputStream gis = new GZIPInputStream(
+        new ByteArrayInputStream(bos.toByteArray()))) {
+      warcOutput = new String(gis.readAllBytes(), StandardCharsets.UTF_8);
+    }
+
+    assertTrue(warcOutput.contains("WARC-Type: revisit"),
+        "WARC record should have WARC-Type: revisit");
+    assertTrue(warcOutput.contains("Content-Type: application/http; msgtype=response"),
+        "WARC revisit record should have Content-Type: application/http; msgtype=response");
+    assertTrue(warcOutput.contains("WARC-Refers-To-Target-URI: https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"),
+        "WARC record should have WARC-Refers-To-Target-URI header");
+    assertTrue(warcOutput.contains("WARC-Profile: " + warcProfile),
+        "WARC record should have WARC-Profile header");
+  }
+}
diff --git a/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java b/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java
new file mode 100644
index 0000000000..62057f4e17
--- /dev/null
+++ b/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.commoncrawl.util.test;
+
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.util.Arrays;
+
+/**
+ * Test helper reading the {@link Content} record stored for a URL in the
+ * content directory of a segment.
+ */
+public class SegmenterRecordReader extends Configured implements Tool {
+
+  private Content content;
+
+  /**
+   * Looks up the content of a URL in a segment and keeps it for
+   * {@link #getContent()}.
+   *
+   * @param args
+   *          the segment path followed by the URL to look up
+   * @return 0 on success, -1 if an argument is missing
+   * @throws IllegalStateException
+   *           if the segment holds no content for the URL
+   */
+  @Override
+  public int run(String[] args) throws Exception {
+    if (args == null || args.length < 2) {
+      System.err.println("Usage: SegmenterRecordReader SEGMENT URL");
+      return -1;
+    }
+    return run(args[0], args[1]);
+  }
+
+  private int run(String path, String url) throws Exception {
+    Path p = new Path(path, Content.DIR_NAME);
+    Text k = new Text(url);
+    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(p, getConf());
+    Content found = null;
+    try {
+      // the partitioning of the segment is not known: probe every part
+      for (MapFile.Reader reader : readers) {
+        Content c = new Content();
+        if (reader.get(k, c) != null) {
+          found = c;
+          break;
+        }
+      }
+    } finally {
+      for (MapFile.Reader reader : readers) {
+        reader.close();
+      }
+    }
+    if (found == null) {
+      throw new IllegalStateException(
+          "No content for " + url + " in segment " + path);
+    }
+    assert (found.getUrl().equals(url));
+    // NOTE(review): restricts the reader to records without payload, as in
+    // the revisit fixture - confirm before reusing it for other segments
+    assert (found.getContent() == null || found.getContent().length == 0);
+    this.content = found;
+
+    return 0;
+  }
+
+  /**
+   * Reads the content stored for a URL in a segment.
+   *
+   * @param segmentPath
+   *          path of the segment directory
+   * @param url
+   *          URL to look up
+   * @return the content record of the URL
+   * @throws IllegalStateException
+   *           if the lookup fails
+   */
+  public static Content retrieveContent(String segmentPath, String url) throws Exception {
+    SegmenterRecordReader reader = new SegmenterRecordReader();
+    int status = ToolRunner.run(NutchConfiguration.create(),
+        reader, Arrays.asList(segmentPath, url).toArray(new String[0]));
+    if (status != 0) {
+      throw new IllegalStateException(
+          "Reading " + url + " from " + segmentPath + " failed: " + status);
+    }
+
+    return reader.getContent();
+  }
+
+  public Content getContent() {
+    return content;
+  }
+
+  public void setContent(Content content) {
+    this.content = content;
+  }
+}
diff --git a/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/.data.crc b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/.data.crc
new file mode 100644
index 0000000000..4685c926b8
Binary files /dev/null and b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/.data.crc differ
diff --git a/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/.index.crc b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/.index.crc
new file mode 100644
index 0000000000..9f5864f594
Binary files /dev/null and b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/.index.crc differ
diff --git a/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/data b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/data
new file mode 100644
index 0000000000..526d1fd871
Binary files /dev/null and b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/data differ
diff --git a/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/index b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/index
new file mode 100644
index 0000000000..6aca7f9506
Binary files /dev/null and b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/index differ