diff --git a/src/java/org/commoncrawl/util/WarcCdxWriter.java b/src/java/org/commoncrawl/util/WarcCdxWriter.java
index 9225516c78..b3225c1bda 100644
--- a/src/java/org/commoncrawl/util/WarcCdxWriter.java
+++ b/src/java/org/commoncrawl/util/WarcCdxWriter.java
@@ -18,8 +18,10 @@
 
 import java.io.IOException;
 import java.io.OutputStream;
+import java.net.MalformedURLException;
 import java.net.URI;
 import java.net.URISyntaxException;
+import java.net.URL;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.text.SimpleDateFormat;
@@ -92,36 +94,52 @@ public WarcCdxWriter(OutputStream warcOut, OutputStream cdxOut,
   }
 
   public URI writeWarcRevisitRecord(final URI targetUri, final String ip,
-      final Date date, final URI warcinfoId, final URI relatedId,
-      final String warcProfile, final Date refersToDate,
+      final int httpStatusCode, final Date date, final URI warcinfoId,
+      final URI relatedId, final String warcProfile, final Date refersToDate,
       final String payloadDigest, final String blockDigest, byte[] block,
       Content content) throws IOException {
     long offset = countingOut.getByteCount();
-    URI recordId = super.writeWarcRevisitRecord(targetUri, ip, date, warcinfoId,
-        relatedId, warcProfile, refersToDate, payloadDigest, blockDigest, block,
-        content);
+    URI recordId = super.writeWarcRevisitRecord(targetUri, ip, httpStatusCode,
+        date, warcinfoId, relatedId, warcProfile, refersToDate, payloadDigest,
+        blockDigest, block, content);
     long length = (countingOut.getByteCount() - offset);
-    writeCdxLine(targetUri, date, offset, length, payloadDigest, content, true);
+    writeCdxLine(targetUri, date, offset, length, payloadDigest, content, true,
+        null, null);
     return recordId;
   }
 
   public URI writeWarcResponseRecord(final URI targetUri, final String ip,
-      final Date date, final URI warcinfoId, final URI relatedId,
-      final String payloadDigest, final String blockDigest,
+      final int httpStatusCode, final Date date, final URI warcinfoId,
+      final URI relatedId, final String payloadDigest, final String blockDigest,
       final String truncated, final byte[] block, Content content)
       throws IOException {
     long offset = countingOut.getByteCount();
-    URI recordId = super.writeWarcResponseRecord(targetUri, ip, date,
-        warcinfoId, relatedId, payloadDigest, blockDigest, truncated, block,
-        content);
+    URI recordId = super.writeWarcResponseRecord(targetUri, ip, httpStatusCode,
+        date, warcinfoId, relatedId, payloadDigest, blockDigest, truncated,
+        block, content);
     long length = (countingOut.getByteCount() - offset);
-    writeCdxLine(targetUri, date, offset, length, payloadDigest, content, false);
+    // resolve the (possibly relative) Location header against the target
+    // URI, drop the redirect target if it is not a valid URL/URI
+    String redirectLocation = null;
+    if (isRedirect(httpStatusCode)) {
+      redirectLocation = content.getMetadata().get("Location");
+      if (redirectLocation != null) {
+        try {
+          redirectLocation = new URL(targetUri.toURL(), redirectLocation)
+              .toURI().toString();
+        } catch (URISyntaxException | MalformedURLException e) {
+          redirectLocation = null;
+        }
+      }
+    }
+    writeCdxLine(targetUri, date, offset, length, payloadDigest, content, false,
+        redirectLocation, truncated);
     return recordId;
   }
 
   public void writeCdxLine(final URI targetUri, final Date date, long offset,
-      long length, String payloadDigest, Content content, boolean revisit)
-      throws IOException {
+      long length, String payloadDigest, Content content, boolean revisit,
+      String redirectLocation, String truncated) throws IOException {
     String url = targetUri.toString();
     String surt = url;
     Metadata meta = content.getMetadata();
@@ -164,6 +182,12 @@ public void writeCdxLine(final URI targetUri, final Date date, long offset,
     if (val != null) {
       data.put("languages", val);
     }
+    if (truncated != null) {
+      data.put("truncated", truncated);
+    }
+    if (redirectLocation != null) {
+      data.put("redirect", redirectLocation);
+    }
     cdxOut.write(jsonWriter.writeValueAsBytes(data));
     cdxOut.write('\n');
   }
@@ -182,4 +206,15 @@ protected static String cleanMimeType(String mime) {
     return mime;
   }
 
+  /**
+   * Check whether a HTTP status code signals a redirect.
+   *
+   * @param httpStatusCode
+   *          HTTP response status code
+   * @return true for 3xx status codes except 304 (Not Modified)
+   */
+  protected static boolean isRedirect(int httpStatusCode) {
+    return httpStatusCode >= 300 && httpStatusCode < 400
+        && httpStatusCode != 304;
+  }
 }
diff --git a/src/java/org/commoncrawl/util/WarcRecordWriter.java b/src/java/org/commoncrawl/util/WarcRecordWriter.java
index b840bc8cd6..c6369f51ec 100644
--- a/src/java/org/commoncrawl/util/WarcRecordWriter.java
+++ b/src/java/org/commoncrawl/util/WarcRecordWriter.java
@@ -498,6 +498,19 @@ public synchronized void write(Text key, WarcCapture value)
         return;
       }
     }
+    // look up the status code only if it is present in the datum's
+    // metadata: calling toString() on a missing value would throw a NPE
+    if (value.datum.getMetaData()
+        .get(Nutch.PROTOCOL_STATUS_CODE_KEY) != null) {
+      String httpStatusCodeVal = value.datum.getMetaData()
+          .get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString();
+      try {
+        httpStatusCode = Integer.parseInt(httpStatusCodeVal);
+      } catch (NumberFormatException e) {
+        LOG.warn("Invalid HTTP status code '{}' in metadata of {}",
+            httpStatusCodeVal, value.url.toString());
+      }
+    }
     if (value.datum.getMetaData().get(FETCH_DURATION) != null) {
       fetchDuration = value.datum.getMetaData().get(FETCH_DURATION)
           .toString();
@@ -507,7 +520,7 @@ public synchronized void write(Text key, WarcCapture value)
     String fetchTime = value.content.getMetadata().get(Nutch.FETCH_TIME_KEY);
     if (fetchTime != null) {
       try {
-        date = new Date(new Long(fetchTime));
+        date = new Date(Long.parseLong(fetchTime));
       } catch (NumberFormatException e) {
         LOG.error("Invalid fetch time '{}' in content metadata of {}",
             fetchTime, value.url.toString());
@@ -687,8 +700,8 @@ public synchronized void write(Text key, WarcCapture value)
        * a well-defined payload."
        */
       String payloadDigest = null;
-      writer.writeWarcRevisitRecord(targetUri, ip, date, infoId, requestId,
-          WarcWriter.PROFILE_REVISIT_NOT_MODIFIED, lastModifiedDate,
+      writer.writeWarcRevisitRecord(targetUri, ip, httpStatusCode, date, infoId,
+          requestId, WarcWriter.PROFILE_REVISIT_NOT_MODIFIED, lastModifiedDate,
           payloadDigest, blockDigest, responseHeaderBytes, value.content);
     } else {
       StringBuilder responsesb = new StringBuilder(4096);
@@ -705,9 +718,9 @@ public synchronized void write(Text key, WarcCapture value)
 
       String payloadDigest = getSha1DigestWithAlg(value.content.getContent());
       String blockDigest = getSha1DigestWithAlg(responseBytes);
-      URI responseId = writer.writeWarcResponseRecord(targetUri, ip, date,
-          infoId, requestId, payloadDigest, blockDigest, truncatedReason,
-          responseBytes, value.content);
+      URI responseId = writer.writeWarcResponseRecord(targetUri, ip,
+          httpStatusCode, date, infoId, requestId, payloadDigest, blockDigest,
+          truncatedReason, responseBytes, value.content);
 
       // Write metadata record
       StringBuilder metadatasb = new StringBuilder(4096);
diff --git a/src/java/org/commoncrawl/util/WarcWriter.java b/src/java/org/commoncrawl/util/WarcWriter.java
index acd559954a..a5fd512571 100644
--- a/src/java/org/commoncrawl/util/WarcWriter.java
+++ b/src/java/org/commoncrawl/util/WarcWriter.java
@@ -165,8 +165,8 @@ public URI writeWarcRequestRecord(final URI targetUri, final String ip,
   }
 
   public URI writeWarcResponseRecord(final URI targetUri, final String ip,
-      final Date date, final URI warcinfoId, final URI relatedId,
-      final String payloadDigest, final String blockDigest,
+      final int httpStatusCode, final Date date, final URI warcinfoId,
+      final URI relatedId, final String payloadDigest, final String blockDigest,
       final String truncated, final byte[] block, Content content)
       throws IOException {
     Map extra = new LinkedHashMap();
@@ -198,8 +198,8 @@ public URI writeWarcResponseRecord(final URI targetUri, final String ip,
   }
 
   public URI writeWarcRevisitRecord(final URI targetUri, final String ip,
-      final Date date, final URI warcinfoId, final URI relatedId,
-      final String warcProfile, final Date refersToDate,
+      final int httpStatusCode, final Date date, final URI warcinfoId,
+      final URI relatedId, final String warcProfile, final Date refersToDate,
       final String payloadDigest, final String blockDigest, byte[] block,
       Content content) throws IOException {
     Map extra = new LinkedHashMap();