diff --git a/CHANGES.md b/CHANGES.md index b872846d..e00300f5 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,12 @@ +1.1.6 +----- +* [WAT extractor: adding information in WAT's warcinfo](https://github.com/iipc/webarchive-commons/issues/47) +* [WAT extractor: missing WARC format version](https://github.com/iipc/webarchive-commons/issues/45) +* [WAT extractor: envelope structure does not conform to the WAT specification](https://github.com/iipc/webarchive-commons/issues/44) +* [WAT extractor: WARC-Date in all records should be the WAT record generation date](https://github.com/iipc/webarchive-commons/issues/43) +* [WAT extractor: WARC-Filename in the WAT warcinfo record should be the WAT filename itself](https://github.com/iipc/webarchive-commons/issues/42) +* [WAT extractor: Entity-Trailing-Slop-Bytes should be called Entity-Trailing-Slop-Length](https://github.com/iipc/webarchive-commons/issues/48) + 1.1.5 ----- * [Escape redirect URLs in RealCDXExtractorOutput](https://github.com/iipc/webarchive-commons/pull/36) diff --git a/pom.xml b/pom.xml index 7a32de08..a94b12f7 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.6-SNAPSHOT + 1.1.7-SNAPSHOT jar webarchive-commons diff --git a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java index 8ca3ff82..e6f6e82f 100644 --- a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java +++ b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java @@ -104,7 +104,7 @@ public void output(Resource resource) throws IOException { String meta = "TBD"; String redir = "TBD"; - if(format.equals("WARC")) { + if(format.startsWith("WARC")) { origUrl = getWARCURL(m); date = getWARCDate(m); String type = getWARCType(m); diff --git a/src/main/java/org/archive/extract/ResourceExtractor.java b/src/main/java/org/archive/extract/ResourceExtractor.java index 7f4d6e7a..2812aa5b 100644 --- 
a/src/main/java/org/archive/extract/ResourceExtractor.java +++ b/src/main/java/org/archive/extract/ResourceExtractor.java @@ -1,6 +1,7 @@ package org.archive.extract; import java.io.FileNotFoundException; +import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; @@ -74,7 +75,7 @@ public int run(String[] args) if(args.length < 1) { return USAGE(1); } - if(args.length > 3) { + if(args.length > 4) { return USAGE(1); } int max = Integer.MAX_VALUE; @@ -89,7 +90,14 @@ public int run(String[] args) } } String path = args[arg]; - if(args.length == arg + 2) { + String outputFile = null; + if(args.length >= arg + 2) { + //if a output file is specified in the command line + if(args.length == arg + 3) { + outputFile = args[arg+2]; + os.close(); + os = new FileOutputStream(outputFile); + } if(args[arg].equals("-cdx")) { path = args[arg+1]; out = new RealCDXExtractorOutput(makePrintWriter(os)); @@ -100,7 +108,7 @@ public int run(String[] args) } else if(args[arg].equals("-wat")) { path = args[arg+1]; - out = new WATExtractorOutput(os); + out = new WATExtractorOutput(os, outputFile); } else { String filter = args[arg+1]; out = new JSONViewExtractorOutput(os, filter); diff --git a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java index ff46a914..68f9d1c8 100644 --- a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java +++ b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java @@ -68,7 +68,7 @@ public void output(Resource resource) throws IOException { String date = "TBD"; String canUrl = "TBD"; - if(format.equals("WARC")) { + if(format.startsWith("WARC")) { origUrl = getWARCURL(m); date = getWARCDate(m); String type = getWARCType(m); diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index 
f4d27147..3bcfa924 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -2,11 +2,13 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.nio.charset.Charset; import java.text.ParseException; +import java.net.UnknownHostException; import java.util.Date; import org.archive.format.gzip.GZIPMemberWriter; @@ -22,6 +24,12 @@ import org.archive.util.io.CommitedOutputStream; import org.json.JSONException; +import java.net.InetAddress; +import java.text.DateFormat; +import java.text.SimpleDateFormat; + +import java.util.logging.Logger; + public class WATExtractorOutput implements ExtractorOutput { WARCRecordWriter recW; private boolean wroteFirst; @@ -29,11 +37,15 @@ public class WATExtractorOutput implements ExtractorOutput { private static int DEFAULT_BUFFER_RAM = 1024 * 1024; private int bufferRAM = DEFAULT_BUFFER_RAM; private final static Charset UTF8 = Charset.forName("UTF-8"); + private String outputFile; + + private static final Logger LOG = Logger.getLogger(WATExtractorOutput.class.getName()); - public WATExtractorOutput(OutputStream out) { + public WATExtractorOutput(OutputStream out, String outputFile) { gzW = new GZIPMemberWriter(out); recW = new WARCRecordWriter(); wroteFirst = false; + this.outputFile = outputFile; } private CommitedOutputStream getOutput() { @@ -56,9 +68,9 @@ public void output(Resource resource) throws IOException { throw new IOException("Missing Envelope.Format"); } cos = getOutput(); - if(envelopeFormat.equals("ARC")) { + if(envelopeFormat.startsWith("ARC")) { writeARC(cos,top); - } else if(envelopeFormat.equals("WARC")) { + } else if(envelopeFormat.startsWith("WARC")) { writeWARC(cos,top); } else { // hrm... 
@@ -68,13 +80,51 @@ public void output(Resource resource) throws IOException { } private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException { - String filename = JSONUtils.extractSingle(md, "Container.Filename"); - if(filename == null) { - throw new IOException("No Container.Filename..."); + // filename is given in the command line + String filename = outputFile; + if (filename == null || filename.length() == 0) { + // if no filename by command line, we construct a default filename base on container filename + filename = JSONUtils.extractSingle(md, "Container.Filename"); + if (filename == null) { + throw new IOException("No Container.Filename..."); + } + if (filename.endsWith(".warc") || filename.endsWith(".warc.gz")) { + filename = filename.replaceFirst("\\.warc$", ".warc.wat.gz"); + filename = filename.replaceFirst("\\.warc\\.gz$", ".warc.wat.gz"); + } else if (filename.endsWith(".arc") || filename.endsWith(".arc.gz")) { + filename = filename.replaceFirst("\\.arc$", ".arc.wat.gz"); + filename = filename.replaceFirst("\\.arc\\.gz$", ".arc.wat.gz"); + } } + // removing path from filename + File tmpFile = new File(filename); + filename = tmpFile.getName(); HttpHeaders headers = new HttpHeaders(); - headers.add("Software-Info", IAUtils.COMMONS_VERSION); - headers.addDateHeader("Extracted-Date", new Date()); + headers.add("software", IAUtils.COMMONS_VERSION); + headers.addDateHeader("extractedDate", new Date()); + + // add ip, hostname + try { + InetAddress host = InetAddress.getLocalHost(); + headers.add("ip", host.getHostAddress()); + headers.add("hostname", host.getCanonicalHostName()); + } catch (UnknownHostException e) { + LOG.warning("unable to obtain local crawl engine host :\n"+e.getMessage()); + } + + headers.add("format", IAUtils.WARC_FORMAT); + headers.add("conformsTo", IAUtils.WARC_FORMAT_CONFORMS_TO); + // optional arguments + if(IAUtils.OPERATOR != null && IAUtils.OPERATOR.length() > 0) { + headers.add("operator", IAUtils.OPERATOR); 
+ } + if(IAUtils.PUBLISHER != null && IAUtils.PUBLISHER.length() > 0) { + headers.add("publisher", IAUtils.PUBLISHER); + } + if(IAUtils.WAT_WARCINFO_DESCRIPTION != null && IAUtils.WAT_WARCINFO_DESCRIPTION.length() > 0) { + headers.add("description", IAUtils.WAT_WARCINFO_DESCRIPTION); + } + ByteArrayOutputStream baos = new ByteArrayOutputStream(); headers.write(baos); recW.writeWARCInfoRecord(recOut,filename,baos.toByteArray()); @@ -105,8 +155,9 @@ private void writeWARC(OutputStream recOut, MetaData md) throws IOException { } else { targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI"); } - String capDateString = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Date"); - capDateString = transformWARCDate(capDateString); + // handle date of generation in WARC format + DateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss"); + String capDateString = dateFormat.format(new Date()); String recId = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Record-ID"); writeWARCMDRecord(recOut,md,targetURI,capDateString,recId); } diff --git a/src/main/java/org/archive/resource/ResourceConstants.java b/src/main/java/org/archive/resource/ResourceConstants.java index dd04fcfe..3b8bea1c 100644 --- a/src/main/java/org/archive/resource/ResourceConstants.java +++ b/src/main/java/org/archive/resource/ResourceConstants.java @@ -31,6 +31,7 @@ public interface ResourceConstants { public static final String ENVELOPE_FORMAT = "Format"; public static final String ENVELOPE_FORMAT_ARC = "ARC"; public static final String ENVELOPE_FORMAT_WARC = "WARC"; + public static final String ENVELOPE_FORMAT_WARC_1_0 = "WARC/1.0"; public static final String WARC_HEADER_LENGTH = "WARC-Header-Length"; public static final String WARC_HEADER_METADATA = "WARC-Header-Metadata"; @@ -104,7 +105,7 @@ public interface ResourceConstants { public static final String HTTP_ENTITY_LENGTH = "Entity-Length"; public static final String HTTP_ENTITY_DIGEST = "Entity-Digest"; - public static 
final String HTTP_ENTITY_TRAILING_SLOP = "Entity-Trailing-Slop-Bytes"; + public static final String HTTP_ENTITY_TRAILING_SLOP = "Entity-Trailing-Slop-Length"; public static final String HTML_METADATA = "HTML-Metadata"; public static final String HTML_HEAD = "Head"; diff --git a/src/main/java/org/archive/resource/warc/WARCResource.java b/src/main/java/org/archive/resource/warc/WARCResource.java index 80929206..d538a25d 100644 --- a/src/main/java/org/archive/resource/warc/WARCResource.java +++ b/src/main/java/org/archive/resource/warc/WARCResource.java @@ -36,7 +36,7 @@ public WARCResource(MetaData metaData, ResourceContainer container, this.response = response; long length = -1; - metaData.putString(ENVELOPE_FORMAT, ENVELOPE_FORMAT_WARC); + metaData.putString(ENVELOPE_FORMAT, ENVELOPE_FORMAT_WARC_1_0); metaData.putLong(WARC_HEADER_LENGTH, response.getHeaderBytes()); MetaData fields = metaData.createChild(WARC_HEADER_METADATA); for(HttpHeader h : response.getHeaders()) { @@ -68,11 +68,11 @@ public InputStream getInputStream() { } public void notifyEOF() throws IOException { - envelope.putLong(PAYLOAD_LENGTH, countingIS.getCount()); String digString = Base32.encode(digIS.getMessageDigest().digest()); - envelope.putString(PAYLOAD_DIGEST, "sha1:"+digString); if(container.isCompressed()) { + metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount()); metaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(response)); + metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString); } else { // consume trailing bytes if we can... 
InputStream raw = response.getInner(); @@ -81,7 +81,9 @@ public void notifyEOF() throws IOException { (PushBackOneByteInputStream) raw; long numNewlines = StreamCopy.skipChars(pb1bis, CR_NL_CHARS); if(numNewlines > 0) { + metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount()); metaData.putLong(PAYLOAD_SLOP_BYTES, numNewlines); + metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString); } } } diff --git a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java index 3f502665..0dfb2834 100644 --- a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java +++ b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java @@ -33,8 +33,8 @@ public Resource getResource(InputStream is, MetaData parentMetaData, if(headers.isCorrupt()) { md.putBoolean(WARC_META_FIELDS_CORRUPT, true); } - md.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is)); - md.putLong(PAYLOAD_LENGTH, bytes); + parentMetaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is)); + parentMetaData.putLong(PAYLOAD_LENGTH, bytes); return new WARCMetaDataResource(md,container, headers); } catch (HttpParseException e) { diff --git a/src/main/java/org/archive/util/IAUtils.java b/src/main/java/org/archive/util/IAUtils.java index ed563d02..d3cf5cf9 100644 --- a/src/main/java/org/archive/util/IAUtils.java +++ b/src/main/java/org/archive/util/IAUtils.java @@ -24,7 +24,10 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.io.Reader; +import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; +import java.util.Properties; /** * Miscellaneous useful methods. 
@@ -35,6 +38,11 @@ public class IAUtils {
 	public final static Charset UTF8 = Charset.forName("utf-8");
 
 	final public static String COMMONS_VERSION = loadCommonsVersion();
+	final public static String PUBLISHER = loadCommons("publisher");
+	final public static String OPERATOR = loadCommons("operator");
+	final public static String WAT_WARCINFO_DESCRIPTION = loadCommons("wat.warcinfo.description");
+	final public static String WARC_FORMAT = loadCommons("warc.format");
+	final public static String WARC_FORMAT_CONFORMS_TO = loadCommons("warc.format.conforms.to");
 
 	public static String loadCommonsVersion() {
 		InputStream input = IAUtils.class.getResourceAsStream(
@@ -57,6 +65,34 @@ public static String loadCommonsVersion() {
 		return version.trim();
 	}
 
+	/**
+	 * Looks up a single entry in the bundled
+	 * <code>/org/archive/commons.properties</code> classpath resource.
+	 *
+	 * @param id key of the property to read
+	 * @return the value stored under <code>id</code> (possibly an empty
+	 *         string), or <code>"UNKNOWN"</code> when the resource is
+	 *         missing, cannot be read, or has no such key
+	 */
+	public static String loadCommons(String id) {
+		InputStream input = IAUtils.class.getResourceAsStream("/org/archive/commons.properties");
+		if (input == null) {
+			return "UNKNOWN";
+		}
+		try {
+			Properties prop = new Properties();
+			// UnsupportedEncodingException is an IOException, so one handler covers both
+			prop.load(new InputStreamReader(input, "UTF-8"));
+			String value = prop.getProperty(id);
+			return (value != null) ? value : "UNKNOWN";
+		} catch (IOException e) {
+			return "UNKNOWN";
+		} finally {
+			// release the classpath stream on success and on failure alike
+			closeQuietly(input);
+		}
+	}
+
 	public static void closeQuietly(Object input) {
 		if(input == null || ! (input instanceof Closeable)) {
 			return;
diff --git a/src/main/resources/org/archive/commons.properties b/src/main/resources/org/archive/commons.properties
new file mode 100644
index 00000000..f115ff43
--- /dev/null
+++ b/src/main/resources/org/archive/commons.properties
@@ -0,0 +1,5 @@
+operator=
+publisher=
+wat.warcinfo.description=
+warc.format=WARC File Format 1.0
+warc.format.conforms.to=http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf