Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions src/java/org/commoncrawl/util/WarcWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ public class WarcWriter {
protected static final String DETECTED_CHARSET = "Detected-Charset";
protected static final String DETECTED_LANGUAGE = "Detected-Language";

public static final String CONTENT_TYPE_RESPONSE = "application/http; msgtype=response";
public static final String CONTENT_TYPE_METADATA = "application/warc-fields";

private SimpleDateFormat isoDate;

public static class CompressedOutputStream extends GZIPOutputStream {
Expand Down Expand Up @@ -196,7 +199,7 @@ public URI writeWarcinfoRecord(String filename, String hostname,
byte[] ba = sb.toString().getBytes(StandardCharsets.UTF_8);
URI recordId = getRecordId();

writeRecord(WARC_INFO, date, "application/warc-fields", recordId, extra,
writeRecord(WARC_INFO, date, CONTENT_TYPE_METADATA, recordId, extra,
new ByteArrayInputStream(ba), ba.length);
return recordId;
}
Expand Down Expand Up @@ -263,8 +266,7 @@ public URI writeWarcResponseRecord(final URI targetUri, final String ip,
extra.put(WARC_IDENTIFIED_PAYLOAD_TYPE, content.getContentType());

URI recordId = getRecordId();
writeRecord(WARC_RESPONSE, date, "application/http; msgtype=response",
recordId, extra, block);
writeRecord(WARC_RESPONSE, date, CONTENT_TYPE_RESPONSE, recordId, extra, block);
return recordId;
}

Expand Down Expand Up @@ -304,7 +306,7 @@ public URI writeWarcRevisitRecord(final URI targetUri, final String ip,
}

URI recordId = getRecordId();
writeRecord(WARC_REVISIT, date, "message/http", recordId, extra, block);
writeRecord(WARC_REVISIT, date, CONTENT_TYPE_RESPONSE, recordId, extra, block);
return recordId;
}

Expand All @@ -321,8 +323,7 @@ public URI writeWarcMetadataRecord(final URI targetUri, final Date date,
}

URI recordId = getRecordId();
writeRecord(WARC_METADATA, date, "application/warc-fields", recordId, extra,
block);
writeRecord(WARC_METADATA, date, CONTENT_TYPE_METADATA, recordId, extra, block);
return recordId;
}

Expand Down
83 changes: 83 additions & 0 deletions src/test/org/commoncrawl/util/TestWarcWriter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.commoncrawl.util;

import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.protocol.Content;
import org.commoncrawl.util.test.SegmenterRecordReader;
import org.junit.jupiter.api.Test;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.net.URI;
import java.util.Date;
import java.util.zip.GZIPInputStream;

import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

public class TestWarcWriter {

@Test
public void testWriteRevisitRecordContentType() throws Exception {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
WarcWriter writer = new WarcWriter(bos);

File segmentDir = new File(System.getProperty("test.build.data", "."), "test-segments/20260224170658-revisit");
assertNotNull(segmentDir, "Missing segment resource");
String segmentPath = segmentDir.getAbsolutePath();
String url = "https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025";

Content content = SegmenterRecordReader.retrieveContent(segmentPath, url);
URI targetUri = new URI(content.getUrl());

Metadata metadata = content.getMetadata();
String ip = content.getMetadata().get("_ip_");
int httpStatusCode = 304;

Date date = HttpDateFormat.toDate(metadata.get("date"));
URI warcinfoId = writer.getRecordId();
URI relatedId = writer.getRecordId();
String warcProfile = WarcWriter.PROFILE_REVISIT_IDENTICAL_DIGEST;
Date refersToDate = new Date(System.currentTimeMillis() - 3600000);
String payloadDigest = "sha1:abc123";
String blockDigest = "sha1:def456";

writer.writeWarcRevisitRecord(targetUri, ip, httpStatusCode, date,
warcinfoId, relatedId, warcProfile, refersToDate, payloadDigest,
blockDigest, null, null, content.getContent(), content);

byte[] compressed = bos.toByteArray();
ByteArrayInputStream bis = new ByteArrayInputStream(compressed);
GZIPInputStream gis = new GZIPInputStream(bis);
ByteArrayOutputStream decompressed = new ByteArrayOutputStream();
gis.transferTo(decompressed);

String warcOutput = decompressed.toString();

assertTrue(warcOutput.contains("WARC-Type: revisit"),
"WARC record should have WARC-Type: revisit");
assertTrue(warcOutput.contains("Content-Type: application/http; msgtype=response"),
"WARC revisit record should have Content-Type: application/http; msgtype=response");
assertTrue(warcOutput.contains("WARC-Refers-To-Target-URI: https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"),
"WARC record should have WARC-Refers-To-Target-URI header");
assertTrue(warcOutput.contains("WARC-Profile: " + warcProfile),
"WARC record should have WARC-Profile header");
}
}
52 changes: 52 additions & 0 deletions src/test/org/commoncrawl/util/test/SegmenterRecordReader.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package org.commoncrawl.util.test;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;

import java.util.Arrays;

public class SegmenterRecordReader extends Configured implements Tool {

private Content content;

@Override
public int run(String[] args) throws Exception {
return run(args[0], args[1]);
}

private int run(String path, String url) throws Exception {
Path p = new Path(path, Content.DIR_NAME);
Text k = new Text(url);
MapFile.Reader[] readers = MapFileOutputFormat.getReaders(p, getConf());
Content c = new Content();
readers[0].get(k, c);
assert (c.getUrl().equals(url));
assert (c.getContent() == null || c.getContent().length == 0);
this.content = c;

return 0;
}

public static Content retrieveContent(String segmentPath, String url) throws Exception {
SegmenterRecordReader reader = new SegmenterRecordReader();
ToolRunner.run(NutchConfiguration.create(),
reader, Arrays.asList(segmentPath, url).toArray(new String[0]));

return reader.getContent();
}

public Content getContent() {
return content;
}

public void setContent(Content content) {
this.content = content;
}
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading