diff --git a/CHANGES.md b/CHANGES.md
index b872846d..e00300f5 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,12 @@
+1.1.6
+-----
+* [WAT extractor: adding information in WAT's warcinfo](https://github.com/iipc/webarchive-commons/issues/47)
+* [WAT extractor: missing WARC format version](https://github.com/iipc/webarchive-commons/issues/45)
+* [WAT extractor: envelope structure does not conform to the WAT specification](https://github.com/iipc/webarchive-commons/issues/44)
+* [WAT extractor: WARC-Date in all records should be the WAT record generation date](https://github.com/iipc/webarchive-commons/issues/43)
+* [WAT extractor: WARC-Filename in the WAT warcinfo record should be the WAT filename itself](https://github.com/iipc/webarchive-commons/issues/42)
+* [WAT extractor: Entity-Trailing-Slop-Bytes should be called Entity-Trailing-Slop-Length](https://github.com/iipc/webarchive-commons/issues/48)
+
1.1.5
-----
* [Escape redirect URLs in RealCDXExtractorOutput](https://github.com/iipc/webarchive-commons/pull/36)
diff --git a/pom.xml b/pom.xml
index 7a32de08..a94b12f7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commons
webarchive-commons
- 1.1.6-SNAPSHOT
+ 1.1.7-SNAPSHOT
jar
webarchive-commons
diff --git a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java
index 8ca3ff82..e6f6e82f 100644
--- a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java
+++ b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java
@@ -104,7 +104,7 @@ public void output(Resource resource) throws IOException {
String meta = "TBD";
String redir = "TBD";
- if(format.equals("WARC")) {
+ if(format.startsWith("WARC")) {
origUrl = getWARCURL(m);
date = getWARCDate(m);
String type = getWARCType(m);
diff --git a/src/main/java/org/archive/extract/ResourceExtractor.java b/src/main/java/org/archive/extract/ResourceExtractor.java
index 7f4d6e7a..2812aa5b 100644
--- a/src/main/java/org/archive/extract/ResourceExtractor.java
+++ b/src/main/java/org/archive/extract/ResourceExtractor.java
@@ -1,6 +1,7 @@
package org.archive.extract;
import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
@@ -74,7 +75,7 @@ public int run(String[] args)
if(args.length < 1) {
return USAGE(1);
}
- if(args.length > 3) {
+ if(args.length > 4) {
return USAGE(1);
}
int max = Integer.MAX_VALUE;
@@ -89,7 +90,14 @@ public int run(String[] args)
}
}
String path = args[arg];
- if(args.length == arg + 2) {
+ String outputFile = null;
+ if(args.length >= arg + 2) {
+ // if an output file is specified on the command line
+ if(args.length == arg + 3) {
+ outputFile = args[arg+2];
+ os.close();
+ os = new FileOutputStream(outputFile);
+ }
if(args[arg].equals("-cdx")) {
path = args[arg+1];
out = new RealCDXExtractorOutput(makePrintWriter(os));
@@ -100,7 +108,7 @@ public int run(String[] args)
} else if(args[arg].equals("-wat")) {
path = args[arg+1];
- out = new WATExtractorOutput(os);
+ out = new WATExtractorOutput(os, outputFile);
} else {
String filter = args[arg+1];
out = new JSONViewExtractorOutput(os, filter);
diff --git a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java
index ff46a914..68f9d1c8 100644
--- a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java
+++ b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java
@@ -68,7 +68,7 @@ public void output(Resource resource) throws IOException {
String date = "TBD";
String canUrl = "TBD";
- if(format.equals("WARC")) {
+ if(format.startsWith("WARC")) {
origUrl = getWARCURL(m);
date = getWARCDate(m);
String type = getWARCType(m);
diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java
index f4d27147..3bcfa924 100644
--- a/src/main/java/org/archive/extract/WATExtractorOutput.java
+++ b/src/main/java/org/archive/extract/WATExtractorOutput.java
@@ -2,11 +2,13 @@
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
+import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.text.ParseException;
+import java.net.UnknownHostException;
import java.util.Date;
import org.archive.format.gzip.GZIPMemberWriter;
@@ -22,6 +24,12 @@
import org.archive.util.io.CommitedOutputStream;
import org.json.JSONException;
+import java.net.InetAddress;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+
+import java.util.logging.Logger;
+
public class WATExtractorOutput implements ExtractorOutput {
WARCRecordWriter recW;
private boolean wroteFirst;
@@ -29,11 +37,15 @@ public class WATExtractorOutput implements ExtractorOutput {
private static int DEFAULT_BUFFER_RAM = 1024 * 1024;
private int bufferRAM = DEFAULT_BUFFER_RAM;
private final static Charset UTF8 = Charset.forName("UTF-8");
+ private String outputFile;
+
+ private static final Logger LOG = Logger.getLogger(WATExtractorOutput.class.getName());
- public WATExtractorOutput(OutputStream out) {
+ public WATExtractorOutput(OutputStream out, String outputFile) {
gzW = new GZIPMemberWriter(out);
recW = new WARCRecordWriter();
wroteFirst = false;
+ this.outputFile = outputFile;
}
private CommitedOutputStream getOutput() {
@@ -56,9 +68,9 @@ public void output(Resource resource) throws IOException {
throw new IOException("Missing Envelope.Format");
}
cos = getOutput();
- if(envelopeFormat.equals("ARC")) {
+ if(envelopeFormat.startsWith("ARC")) {
writeARC(cos,top);
- } else if(envelopeFormat.equals("WARC")) {
+ } else if(envelopeFormat.startsWith("WARC")) {
writeWARC(cos,top);
} else {
// hrm...
@@ -68,13 +80,51 @@ public void output(Resource resource) throws IOException {
}
private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException {
- String filename = JSONUtils.extractSingle(md, "Container.Filename");
- if(filename == null) {
- throw new IOException("No Container.Filename...");
+ // prefer the output filename given on the command line
+ String filename = outputFile;
+ if (filename == null || filename.length() == 0) {
+ // if no filename was given on the command line, construct a default one based on the container filename
+ filename = JSONUtils.extractSingle(md, "Container.Filename");
+ if (filename == null) {
+ throw new IOException("No Container.Filename...");
+ }
+ if (filename.endsWith(".warc") || filename.endsWith(".warc.gz")) {
+ filename = filename.replaceFirst("\\.warc$", ".warc.wat.gz");
+ filename = filename.replaceFirst("\\.warc\\.gz$", ".warc.wat.gz");
+ } else if (filename.endsWith(".arc") || filename.endsWith(".arc.gz")) {
+ filename = filename.replaceFirst("\\.arc$", ".arc.wat.gz");
+ filename = filename.replaceFirst("\\.arc\\.gz$", ".arc.wat.gz");
+ }
}
+ // strip any directory path, keeping only the file name
+ File tmpFile = new File(filename);
+ filename = tmpFile.getName();
HttpHeaders headers = new HttpHeaders();
- headers.add("Software-Info", IAUtils.COMMONS_VERSION);
- headers.addDateHeader("Extracted-Date", new Date());
+ headers.add("software", IAUtils.COMMONS_VERSION);
+ headers.addDateHeader("extractedDate", new Date());
+
+ // add ip, hostname
+ try {
+ InetAddress host = InetAddress.getLocalHost();
+ headers.add("ip", host.getHostAddress());
+ headers.add("hostname", host.getCanonicalHostName());
+ } catch (UnknownHostException e) {
+ LOG.warning("unable to obtain local crawl engine host :\n"+e.getMessage());
+ }
+
+ headers.add("format", IAUtils.WARC_FORMAT);
+ headers.add("conformsTo", IAUtils.WARC_FORMAT_CONFORMS_TO);
+ // optional arguments
+ if(IAUtils.OPERATOR != null && IAUtils.OPERATOR.length() > 0) {
+ headers.add("operator", IAUtils.OPERATOR);
+ }
+ if(IAUtils.PUBLISHER != null && IAUtils.PUBLISHER.length() > 0) {
+ headers.add("publisher", IAUtils.PUBLISHER);
+ }
+ if(IAUtils.WAT_WARCINFO_DESCRIPTION != null && IAUtils.WAT_WARCINFO_DESCRIPTION.length() > 0) {
+ headers.add("description", IAUtils.WAT_WARCINFO_DESCRIPTION);
+ }
+
ByteArrayOutputStream baos = new ByteArrayOutputStream();
headers.write(baos);
recW.writeWARCInfoRecord(recOut,filename,baos.toByteArray());
@@ -105,8 +155,9 @@ private void writeWARC(OutputStream recOut, MetaData md) throws IOException {
} else {
targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI");
}
- String capDateString = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Date");
- capDateString = transformWARCDate(capDateString);
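+ // use the WAT generation time (now) as a 14-digit timestamp instead of the source record's WARC-Date; NOTE(review): formatted in the JVM default time zone, confirm UTC is not required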
+ DateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss");
+ String capDateString = dateFormat.format(new Date());
String recId = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Record-ID");
writeWARCMDRecord(recOut,md,targetURI,capDateString,recId);
}
diff --git a/src/main/java/org/archive/resource/ResourceConstants.java b/src/main/java/org/archive/resource/ResourceConstants.java
index dd04fcfe..3b8bea1c 100644
--- a/src/main/java/org/archive/resource/ResourceConstants.java
+++ b/src/main/java/org/archive/resource/ResourceConstants.java
@@ -31,6 +31,7 @@ public interface ResourceConstants {
public static final String ENVELOPE_FORMAT = "Format";
public static final String ENVELOPE_FORMAT_ARC = "ARC";
public static final String ENVELOPE_FORMAT_WARC = "WARC";
+ public static final String ENVELOPE_FORMAT_WARC_1_0 = "WARC/1.0";
public static final String WARC_HEADER_LENGTH = "WARC-Header-Length";
public static final String WARC_HEADER_METADATA = "WARC-Header-Metadata";
@@ -104,7 +105,7 @@ public interface ResourceConstants {
public static final String HTTP_ENTITY_LENGTH = "Entity-Length";
public static final String HTTP_ENTITY_DIGEST = "Entity-Digest";
- public static final String HTTP_ENTITY_TRAILING_SLOP = "Entity-Trailing-Slop-Bytes";
+ public static final String HTTP_ENTITY_TRAILING_SLOP = "Entity-Trailing-Slop-Length";
public static final String HTML_METADATA = "HTML-Metadata";
public static final String HTML_HEAD = "Head";
diff --git a/src/main/java/org/archive/resource/warc/WARCResource.java b/src/main/java/org/archive/resource/warc/WARCResource.java
index 80929206..d538a25d 100644
--- a/src/main/java/org/archive/resource/warc/WARCResource.java
+++ b/src/main/java/org/archive/resource/warc/WARCResource.java
@@ -36,7 +36,7 @@ public WARCResource(MetaData metaData, ResourceContainer container,
this.response = response;
long length = -1;
- metaData.putString(ENVELOPE_FORMAT, ENVELOPE_FORMAT_WARC);
+ metaData.putString(ENVELOPE_FORMAT, ENVELOPE_FORMAT_WARC_1_0);
metaData.putLong(WARC_HEADER_LENGTH, response.getHeaderBytes());
MetaData fields = metaData.createChild(WARC_HEADER_METADATA);
for(HttpHeader h : response.getHeaders()) {
@@ -68,11 +68,11 @@ public InputStream getInputStream() {
}
public void notifyEOF() throws IOException {
- envelope.putLong(PAYLOAD_LENGTH, countingIS.getCount());
String digString = Base32.encode(digIS.getMessageDigest().digest());
- envelope.putString(PAYLOAD_DIGEST, "sha1:"+digString);
if(container.isCompressed()) {
+ metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
metaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(response));
+ metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString);
} else {
// consume trailing bytes if we can...
InputStream raw = response.getInner();
@@ -81,7 +81,9 @@ public void notifyEOF() throws IOException {
(PushBackOneByteInputStream) raw;
long numNewlines = StreamCopy.skipChars(pb1bis, CR_NL_CHARS);
if(numNewlines > 0) {
+ metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
metaData.putLong(PAYLOAD_SLOP_BYTES, numNewlines);
+ metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString);
}
}
}
diff --git a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java
index 3f502665..0dfb2834 100644
--- a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java
+++ b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java
@@ -33,8 +33,8 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
if(headers.isCorrupt()) {
md.putBoolean(WARC_META_FIELDS_CORRUPT, true);
}
- md.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is));
- md.putLong(PAYLOAD_LENGTH, bytes);
+ parentMetaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is));
+ parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
return new WARCMetaDataResource(md,container, headers);
} catch (HttpParseException e) {
diff --git a/src/main/java/org/archive/util/IAUtils.java b/src/main/java/org/archive/util/IAUtils.java
index ed563d02..d3cf5cf9 100644
--- a/src/main/java/org/archive/util/IAUtils.java
+++ b/src/main/java/org/archive/util/IAUtils.java
@@ -24,7 +24,10 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
+import java.util.Properties;
/**
* Miscellaneous useful methods.
@@ -35,6 +38,11 @@ public class IAUtils {
public final static Charset UTF8 = Charset.forName("utf-8");
final public static String COMMONS_VERSION = loadCommonsVersion();
+ final public static String PUBLISHER = loadCommons("publisher");
+ final public static String OPERATOR = loadCommons("operator");
+ final public static String WAT_WARCINFO_DESCRIPTION = loadCommons("wat.warcinfo.description");
+ final public static String WARC_FORMAT = loadCommons("warc.format");
+ final public static String WARC_FORMAT_CONFORMS_TO = loadCommons("warc.format.conforms.to");
public static String loadCommonsVersion() {
InputStream input = IAUtils.class.getResourceAsStream(
@@ -57,6 +65,31 @@ public static String loadCommonsVersion() {
return version.trim();
}
+    /**
+     * Loads one value from /org/archive/commons.properties on the classpath.
+     *
+     * @param id property key to look up
+     * @return the property value, or "UNKNOWN" if the resource or key is missing or unreadable
+     */
+    public static String loadCommons(String id) {
+        InputStream input = IAUtils.class.getResourceAsStream("/org/archive/commons.properties");
+        if (input == null) {
+            return "UNKNOWN";
+        }
+        try {
+            Properties prop = new Properties();
+            // decode with the class-level UTF8 Charset; no checked encoding exception to handle
+            prop.load(new InputStreamReader(input, UTF8));
+            String value = prop.getProperty(id);
+            return value != null ? value : "UNKNOWN";
+        } catch (IOException e) {
+            return "UNKNOWN";
+        } finally {
+            // always release the classpath stream, including on the error path
+            closeQuietly(input);
+        }
+    }
+
public static void closeQuietly(Object input) {
if(input == null || ! (input instanceof Closeable)) {
return;
diff --git a/src/main/resources/org/archive/commons.properties b/src/main/resources/org/archive/commons.properties
new file mode 100644
index 00000000..f115ff43
--- /dev/null
+++ b/src/main/resources/org/archive/commons.properties
@@ -0,0 +1,5 @@
+operator=
+publisher=
+wat.warcinfo.description=
+warc.format=WARC File Format 1.0
+warc.format.conforms.to=http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf