Skip to content

Commit b3b78bb

Browse files
WarcCdxWriter: extraction of redirect targets for CDX should not be case-sensitive (#18)
- make extraction of HTTP headers not depend on correct casing for:
  - "Location" and "Content-Type" (WarcCdxWriter: "redirect" and "mime")
  - "Content-Type" (support for language detector)
- refactor: header names as constants
1 parent 5b73e16 commit b3b78bb

4 files changed

Lines changed: 34 additions & 20 deletions

File tree

src/java/org/commoncrawl/util/LanguageDetector.java

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -97,7 +97,8 @@ protected Result detectLanguage(URI uri, Content content) {
9797

9898
String httpContentLanguage = content.getMetadata()
9999
.get(Response.CONTENT_LANGUAGE);
100-
String httpContentType = content.getMetadata().get(Response.CONTENT_TYPE);
100+
String httpContentType = WarcWriter.getMeta(content.getMetadata(),
101+
Response.CONTENT_TYPE);
101102

102103
Metadata metadata = new Metadata();
103104
if (httpContentType != null) {
@@ -107,7 +108,7 @@ protected Result detectLanguage(URI uri, Content content) {
107108
String text;
108109
byte[] bytes = content.getContent();
109110
try (AutoDetectReader charsetDetectReader = new AutoDetectReader(
110-
new ByteArrayInputStream(bytes), metadata);) {
111+
new ByteArrayInputStream(bytes), metadata)) {
111112
result.charset = charsetDetectReader.getCharset();
112113
boolean isValidUtf8 = false;
113114
if (result.charset.equals(StandardCharsets.UTF_8)) {

src/java/org/commoncrawl/util/WarcCdxWriter.java

Lines changed: 6 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -18,10 +18,8 @@
1818

1919
import java.io.IOException;
2020
import java.io.OutputStream;
21-
import java.net.MalformedURLException;
2221
import java.net.URI;
2322
import java.net.URISyntaxException;
24-
import java.net.URL;
2523
import java.nio.charset.Charset;
2624
import java.nio.charset.StandardCharsets;
2725
import java.text.SimpleDateFormat;
@@ -34,6 +32,7 @@
3432
import org.apache.hadoop.fs.Path;
3533
import org.apache.hadoop.util.StringUtils;
3634
import org.apache.nutch.metadata.Metadata;
35+
import org.apache.nutch.net.protocols.Response;
3736
import org.apache.nutch.protocol.Content;
3837
import org.archive.url.WaybackURLKeyMaker;
3938
import org.slf4j.Logger;
@@ -120,15 +119,7 @@ public URI writeWarcResponseRecord(final URI targetUri, final String ip,
120119
long length = (countingOut.getByteCount() - offset);
121120
String redirectLocation = null;
122121
if (isRedirect(httpStatusCode)) {
123-
redirectLocation = content.getMetadata().get("Location");
124-
if (redirectLocation != null) {
125-
try {
126-
redirectLocation = new URL(targetUri.toURL(), redirectLocation)
127-
.toURI().toString();
128-
} catch (URISyntaxException | MalformedURLException e) {
129-
redirectLocation = null;
130-
}
131-
}
122+
redirectLocation = getMeta(content.getMetadata(), "Location");
132123
}
133124
writeCdxLine(targetUri, date, offset, length, payloadDigest, content, false,
134125
redirectLocation, truncated);
@@ -162,21 +153,21 @@ public void writeCdxLine(final URI targetUri, final Date date, long offset,
162153
if (revisit) {
163154
data.put("mime", "warc/revisit");
164155
} else {
165-
data.put("mime", cleanMimeType(meta.get("Content-Type")));
156+
data.put("mime", cleanMimeType(getMeta(meta, Response.CONTENT_TYPE)));
166157
data.put("mime-detected", content.getContentType());
167158
}
168-
data.put("status", meta.get("HTTP-Status-Code"));
159+
data.put("status", meta.get(WarcWriter.HTTP_STATUS_CODE));
169160
if (payloadDigest != null) {
170161
data.put("digest", payloadDigest);
171162
}
172163
data.put("length", String.format("%d", length));
173164
data.put("offset", String.format("%d", offset));
174165
data.put("filename", warcFilename);
175-
String val = meta.get("Detected-Charset");
166+
String val = meta.get(WarcWriter.DETECTED_CHARSET);
176167
if (val != null) {
177168
data.put("charset", val);
178169
}
179-
val = meta.get("Detected-Language");
170+
val = meta.get(WarcWriter.DETECTED_LANGUAGE);
180171
if (val != null) {
181172
data.put("languages", val);
182173
}

src/java/org/commoncrawl/util/WarcRecordWriter.java

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -637,7 +637,7 @@ public synchronized void write(Text key, WarcCapture value)
637637
}
638638

639639
if (generateCdx) {
640-
value.content.getMetadata().add("HTTP-Status-Code",
640+
value.content.getMetadata().add(WarcWriter.HTTP_STATUS_CODE,
641641
String.format("%d", httpStatusCode));
642642
}
643643

@@ -660,14 +660,15 @@ public synchronized void write(Text key, WarcCapture value)
660660
}
661661
if (generateCdx) {
662662
if (ldres.charset != null) {
663-
value.content.getMetadata().add("Detected-Charset",
663+
value.content.getMetadata().add(WarcWriter.DETECTED_CHARSET,
664664
ldres.charset.name());
665665
}
666666
org.commoncrawl.langdetect.cld2.Result lr = ldres.languages;
667667
if (lr != null) {
668668
String codes = lr.getLanguageCodesISO639_3(",", true);
669669
if (codes != null && !codes.isEmpty()) {
670-
value.content.getMetadata().add("Detected-Language", codes);
670+
value.content.getMetadata().add(WarcWriter.DETECTED_LANGUAGE,
671+
codes);
671672
}
672673
}
673674
}

src/java/org/commoncrawl/util/WarcWriter.java

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@
3232
import java.util.UUID;
3333
import java.util.zip.GZIPOutputStream;
3434

35+
import org.apache.nutch.metadata.Metadata;
3536
import org.apache.nutch.protocol.Content;
3637

3738
public class WarcWriter {
@@ -74,6 +75,11 @@ public class WarcWriter {
7475
private static final String CRLF = "\r\n";
7576
private static final String COLONSP = ": ";
7677

78+
/* Metadata names to pass from WARC to CDX */
79+
protected static final String HTTP_STATUS_CODE = "HTTP-Status-Code";
80+
protected static final String DETECTED_CHARSET = "Detected-Charset";
81+
protected static final String DETECTED_LANGUAGE = "Detected-Language";
82+
7783
private SimpleDateFormat isoDate;
7884

7985
public static class CompressedOutputStream extends GZIPOutputStream {
@@ -374,4 +380,19 @@ public URI getRecordId() {
374380
throw new RuntimeException(e);
375381
}
376382
}
383+
384+
protected static String getMeta(Metadata metadata, String name) {
385+
String value = metadata.get(name);
386+
if (value == null) {
387+
// check for case variants
388+
for (String n : metadata.names()) {
389+
if (n.equalsIgnoreCase(name)) {
390+
value = metadata.get(n);
391+
break;
392+
}
393+
}
394+
}
395+
return value;
396+
}
397+
377398
}

0 commit comments

Comments
 (0)