Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 40 additions & 14 deletions src/java/org/commoncrawl/util/WarcCdxWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@

import java.io.IOException;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
Expand Down Expand Up @@ -92,36 +94,50 @@ public WarcCdxWriter(OutputStream warcOut, OutputStream cdxOut,
}

/**
 * Writes a WARC revisit record through the superclass writer and then emits
 * the matching CDX line for it.
 *
 * The CDX line is written with the revisit flag set to {@code true} and with
 * no redirect location and no truncation reason (both passed as
 * {@code null}).
 *
 * @param httpStatusCode
 *          HTTP status code of the capture; forwarded to the superclass
 *          writer, not used for the CDX line in this method
 * @param payloadDigest
 *          payload digest, passed to the superclass writer and to the CDX
 *          line
 * @param content
 *          fetched content, passed to the superclass writer and to the CDX
 *          line
 * @return the record ID returned by the superclass writer
 * @throws IOException
 *           declared by both the superclass writer and
 *           {@code writeCdxLine}
 */
public URI writeWarcRevisitRecord(final URI targetUri, final String ip,
final Date date, final URI warcinfoId, final URI relatedId,
final String warcProfile, final Date refersToDate,
final int httpStatusCode, final Date date, final URI warcinfoId,
final URI relatedId, final String warcProfile, final Date refersToDate,
final String payloadDigest, final String blockDigest, byte[] block,
Content content) throws IOException {
// Byte count before the record is written: used as the record offset in
// the CDX line.
long offset = countingOut.getByteCount();
URI recordId = super.writeWarcRevisitRecord(targetUri, ip, date, warcinfoId,
relatedId, warcProfile, refersToDate, payloadDigest, blockDigest, block,
content);
URI recordId = super.writeWarcRevisitRecord(targetUri, ip, httpStatusCode,
date, warcinfoId, relatedId, warcProfile, refersToDate, payloadDigest,
blockDigest, block, content);
// Difference of the byte counts after and before writing: the record length
// reported in the CDX line.
long length = (countingOut.getByteCount() - offset);
writeCdxLine(targetUri, date, offset, length, payloadDigest, content, true);
writeCdxLine(targetUri, date, offset, length, payloadDigest, content, true,
null, null);
return recordId;
}

/**
 * Writes a WARC response record through the superclass writer and then emits
 * the matching CDX line for it.
 *
 * If {@code httpStatusCode} is a redirect according to
 * {@link #isRedirect(int)}, the {@code Location} value from the content
 * metadata is resolved against {@code targetUri} and passed to the CDX line
 * as redirect location. A location which cannot be converted into a valid
 * URL/URI is dropped (the redirect location is passed as {@code null}).
 *
 * @param httpStatusCode
 *          HTTP status code of the capture; forwarded to the superclass
 *          writer and used to decide whether a redirect location is added
 *          to the CDX line
 * @param truncated
 *          truncation value, passed to the superclass writer and to the CDX
 *          line (may be {@code null}, see the null check in
 *          {@code writeCdxLine})
 * @param content
 *          fetched content; its metadata is read for the {@code Location}
 *          header
 * @return the record ID returned by the superclass writer
 * @throws IOException
 *           declared by both the superclass writer and
 *           {@code writeCdxLine}
 */
public URI writeWarcResponseRecord(final URI targetUri, final String ip,
final Date date, final URI warcinfoId, final URI relatedId,
final String payloadDigest, final String blockDigest,
final int httpStatusCode, final Date date, final URI warcinfoId,
final URI relatedId, final String payloadDigest, final String blockDigest,
final String truncated, final byte[] block, Content content)
throws IOException {
// Byte count before the record is written: used as the record offset in
// the CDX line.
long offset = countingOut.getByteCount();
URI recordId = super.writeWarcResponseRecord(targetUri, ip, date,
warcinfoId, relatedId, payloadDigest, blockDigest, truncated, block,
content);
URI recordId = super.writeWarcResponseRecord(targetUri, ip, httpStatusCode,
date, warcinfoId, relatedId, payloadDigest, blockDigest, truncated,
block, content);
long length = (countingOut.getByteCount() - offset);
writeCdxLine(targetUri, date, offset, length, payloadDigest, content, false);
// The redirect location stays null unless this is a redirect response with
// a usable Location value.
String redirectLocation = null;
if (isRedirect(httpStatusCode)) {
redirectLocation = content.getMetadata().get("Location");
if (redirectLocation != null) {
try {
// Resolve a possibly relative Location against the target URI and
// round-trip through URI to validate the result.
// NOTE(review): URI.toURL() throws IllegalArgumentException if the URI
// is not absolute; that exception is not caught here - confirm that
// targetUri is always absolute.
redirectLocation = new URL(targetUri.toURL(), redirectLocation)
.toURI().toString();
} catch (URISyntaxException | MalformedURLException e) {
// Invalid redirect target: write the CDX line without it.
redirectLocation = null;
}
}
}
writeCdxLine(targetUri, date, offset, length, payloadDigest, content, false,
redirectLocation, truncated);
return recordId;
}

public void writeCdxLine(final URI targetUri, final Date date, long offset,
long length, String payloadDigest, Content content, boolean revisit)
throws IOException {
long length, String payloadDigest, Content content, boolean revisit,
String redirectLocation, String truncated) throws IOException {
String url = targetUri.toString();
String surt = url;
Metadata meta = content.getMetadata();
Expand Down Expand Up @@ -164,6 +180,12 @@ public void writeCdxLine(final URI targetUri, final Date date, long offset,
if (val != null) {
data.put("languages", val);
}
if (truncated != null) {
data.put("truncated", truncated);
}
if (redirectLocation != null) {
data.put("redirect", redirectLocation);
}
cdxOut.write(jsonWriter.writeValueAsBytes(data));
cdxOut.write('\n');
}
Expand All @@ -182,4 +204,8 @@ protected static String cleanMimeType(String mime) {
return mime;
}

/**
 * Checks whether an HTTP status code is treated as a redirect.
 *
 * @param httpStatusCode
 *          HTTP status code to test
 * @return {@code true} for every 3xx code (300 to 399) except 304,
 *         {@code false} otherwise
 */
protected static boolean isRedirect(int httpStatusCode) {
  if (httpStatusCode == 304) {
    // 304 lies in the 3xx range but is explicitly not counted as a redirect
    return false;
  }
  return 300 <= httpStatusCode && httpStatusCode <= 399;
}
}
20 changes: 14 additions & 6 deletions src/java/org/commoncrawl/util/WarcRecordWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,14 @@ public synchronized void write(Text key, WarcCapture value)
return;
}
}
// Read the HTTP status code stored in the datum metadata. The metadata entry
// may be absent, so the null check must happen before toString() is called
// on the looked-up value (calling it first would throw a
// NullPointerException and make the null check below pointless).
Object httpStatusCodeMeta = value.datum.getMetaData()
    .get(Nutch.PROTOCOL_STATUS_CODE_KEY);
String httpStatusCodeVal = (httpStatusCodeMeta != null)
    ? httpStatusCodeMeta.toString()
    : null;
if (httpStatusCodeVal != null) {
  try {
    httpStatusCode = Integer.parseInt(httpStatusCodeVal);
  } catch (NumberFormatException e) {
    // Best effort: httpStatusCode keeps its previous value, but the invalid
    // metadata value is reported instead of being silently ignored.
    LOG.warn("Invalid HTTP status code '{}' in datum metadata of {}",
        httpStatusCodeVal, value.url.toString());
  }
}
if (value.datum.getMetaData().get(FETCH_DURATION) != null) {
fetchDuration = value.datum.getMetaData().get(FETCH_DURATION)
.toString();
Expand All @@ -507,7 +515,7 @@ public synchronized void write(Text key, WarcCapture value)
String fetchTime = value.content.getMetadata().get(Nutch.FETCH_TIME_KEY);
if (fetchTime != null) {
try {
date = new Date(new Long(fetchTime));
date = new Date(Long.parseLong(fetchTime));
} catch (NumberFormatException e) {
LOG.error("Invalid fetch time '{}' in content metadata of {}",
fetchTime, value.url.toString());
Expand Down Expand Up @@ -687,8 +695,8 @@ public synchronized void write(Text key, WarcCapture value)
* a well-defined payload."
*/
String payloadDigest = null;
writer.writeWarcRevisitRecord(targetUri, ip, date, infoId, requestId,
WarcWriter.PROFILE_REVISIT_NOT_MODIFIED, lastModifiedDate,
writer.writeWarcRevisitRecord(targetUri, ip, httpStatusCode, date, infoId,
requestId, WarcWriter.PROFILE_REVISIT_NOT_MODIFIED, lastModifiedDate,
payloadDigest, blockDigest, responseHeaderBytes, value.content);
} else {
StringBuilder responsesb = new StringBuilder(4096);
Expand All @@ -705,9 +713,9 @@ public synchronized void write(Text key, WarcCapture value)

String payloadDigest = getSha1DigestWithAlg(value.content.getContent());
String blockDigest = getSha1DigestWithAlg(responseBytes);
URI responseId = writer.writeWarcResponseRecord(targetUri, ip, date,
infoId, requestId, payloadDigest, blockDigest, truncatedReason,
responseBytes, value.content);
URI responseId = writer.writeWarcResponseRecord(targetUri, ip,
httpStatusCode, date, infoId, requestId, payloadDigest, blockDigest,
truncatedReason, responseBytes, value.content);

// Write metadata record
StringBuilder metadatasb = new StringBuilder(4096);
Expand Down
8 changes: 4 additions & 4 deletions src/java/org/commoncrawl/util/WarcWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,8 @@ public URI writeWarcRequestRecord(final URI targetUri, final String ip,
}

public URI writeWarcResponseRecord(final URI targetUri, final String ip,
final Date date, final URI warcinfoId, final URI relatedId,
final String payloadDigest, final String blockDigest,
final int httpStatusCode, final Date date, final URI warcinfoId,
final URI relatedId, final String payloadDigest, final String blockDigest,
final String truncated, final byte[] block, Content content)
throws IOException {
Map<String, String> extra = new LinkedHashMap<String, String>();
Expand Down Expand Up @@ -198,8 +198,8 @@ public URI writeWarcResponseRecord(final URI targetUri, final String ip,
}

public URI writeWarcRevisitRecord(final URI targetUri, final String ip,
final Date date, final URI warcinfoId, final URI relatedId,
final String warcProfile, final Date refersToDate,
final int httpStatusCode, final Date date, final URI warcinfoId,
final URI relatedId, final String warcProfile, final Date refersToDate,
final String payloadDigest, final String blockDigest, byte[] block,
Content content) throws IOException {
Map<String, String> extra = new LinkedHashMap<String, String>();
Expand Down