From 280a97547cf0f107ebcd78264e0f9fba98335c38 Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Wed, 1 Apr 2015 17:54:32 +0200 Subject: [PATCH 01/22] Fix issue #43 --- src/main/java/org/archive/extract/WATExtractorOutput.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index f4d27147..05ab1146 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -105,8 +105,9 @@ private void writeWARC(OutputStream recOut, MetaData md) throws IOException { } else { targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI"); } - String capDateString = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Date"); - capDateString = transformWARCDate(capDateString); + // handle date of generation in WARC format + DateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss"); + String capDateString = dateFormat.format(new Date()); String recId = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Record-ID"); writeWARCMDRecord(recOut,md,targetURI,capDateString,recId); } From c06aff540a483f7fba1b89791ccb6c1b698cee28 Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Wed, 1 Apr 2015 17:59:22 +0200 Subject: [PATCH 02/22] Fix issue #45 --- src/main/java/org/archive/resource/ResourceConstants.java | 1 + src/main/java/org/archive/resource/warc/WARCResource.java | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/archive/resource/ResourceConstants.java b/src/main/java/org/archive/resource/ResourceConstants.java index dd04fcfe..2ab86de2 100644 --- a/src/main/java/org/archive/resource/ResourceConstants.java +++ b/src/main/java/org/archive/resource/ResourceConstants.java @@ -31,6 +31,7 @@ public interface ResourceConstants { public static final String ENVELOPE_FORMAT = "Format"; public static final String ENVELOPE_FORMAT_ARC = "ARC"; public static final String ENVELOPE_FORMAT_WARC = "WARC"; + public static final String ENVELOPE_FORMAT_WARC_1_0 = "WARC/1.0"; public static final String WARC_HEADER_LENGTH = "WARC-Header-Length"; public static final String WARC_HEADER_METADATA = "WARC-Header-Metadata"; diff --git a/src/main/java/org/archive/resource/warc/WARCResource.java b/src/main/java/org/archive/resource/warc/WARCResource.java index 80929206..91883637 100644 --- a/src/main/java/org/archive/resource/warc/WARCResource.java +++ b/src/main/java/org/archive/resource/warc/WARCResource.java @@ -36,7 +36,7 @@ public WARCResource(MetaData metaData, ResourceContainer container, this.response = response; long length = -1; - metaData.putString(ENVELOPE_FORMAT, ENVELOPE_FORMAT_WARC); + metaData.putString(ENVELOPE_FORMAT, ENVELOPE_FORMAT_WARC_1_0); metaData.putLong(WARC_HEADER_LENGTH, response.getHeaderBytes()); MetaData fields = metaData.createChild(WARC_HEADER_METADATA); for(HttpHeader h : response.getHeaders()) { From 4359e51c0cf4d0d998ab989d9ffb54f6558473ca Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Wed, 1 Apr 2015 18:14:18 +0200 Subject: [PATCH 03/22] Fix issue #42 - adding a parameter --- .../archive/extract/ResourceExtractor.java | 13 ++++++-- .../archive/extract/WATExtractorOutput.java | 30 +++++++++++++++---- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/archive/extract/ResourceExtractor.java b/src/main/java/org/archive/extract/ResourceExtractor.java index 7f4d6e7a..11c61f4d 100644 --- a/src/main/java/org/archive/extract/ResourceExtractor.java +++ b/src/main/java/org/archive/extract/ResourceExtractor.java @@ -74,7 +74,7 @@ public int run(String[] args) if(args.length < 1) { return USAGE(1); } - if(args.length > 3) { + if(args.length > 4) { return USAGE(1); } int max = Integer.MAX_VALUE; @@ -89,7 +89,14 @@ public int run(String[] args) } } String path = args[arg]; - if(args.length == arg + 2) { + String outputFile = null; + if(args.length >= arg + 2) { + //if a output file is specified in the command line + if(args.length == arg + 3) { + outputFile=args[arg+2]; + os.close(); + os = new FileOutputStream(outputFile); + } if(args[arg].equals("-cdx")) { path = args[arg+1]; out = new RealCDXExtractorOutput(makePrintWriter(os)); @@ -100,7 +107,7 @@ public int run(String[] args) } else if(args[arg].equals("-wat")) { path = args[arg+1]; - out = new WATExtractorOutput(os); + out = new WATExtractorOutput(os, outputFile); } else { String filter = args[arg+1]; out = new JSONViewExtractorOutput(os, filter); diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index 05ab1146..624e591b 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -22,6 +22,10 @@ import org.archive.util.io.CommitedOutputStream; import org.json.JSONException; +import java.net.InetAddress; +import java.text.DateFormat; +import java.text.SimpleDateFormat; + public class WATExtractorOutput implements ExtractorOutput { WARCRecordWriter recW; private boolean wroteFirst; @@ -29,11 +33,13 @@ public class WATExtractorOutput implements ExtractorOutput { private static int DEFAULT_BUFFER_RAM = 1024 * 1024; private int bufferRAM = DEFAULT_BUFFER_RAM; private final static Charset UTF8 = Charset.forName("UTF-8"); + private String outputFile; - public WATExtractorOutput(OutputStream out) { + public WATExtractorOutput(OutputStream out, String outputFile) { gzW = new GZIPMemberWriter(out); recW = new WARCRecordWriter(); wroteFirst = false; + this.outputFile = outputFile; } private CommitedOutputStream getOutput() { @@ -56,9 +62,9 @@ public void output(Resource resource) throws IOException { throw new IOException("Missing Envelope.Format"); } cos = getOutput(); - if(envelopeFormat.equals("ARC")) { + if(envelopeFormat.startsWith("ARC")) { writeARC(cos,top); - } else if(envelopeFormat.equals("WARC")) { + } else if(envelopeFormat.startsWith("WARC")) { writeWARC(cos,top); } else { // hrm... @@ -68,9 +74,21 @@ public void output(Resource resource) throws IOException { } private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException { - String filename = JSONUtils.extractSingle(md, "Container.Filename"); - if(filename == null) { - throw new IOException("No Container.Filename..."); + // filename is given in the command line + String filename = outputFile; + if (filename == null || filename.length() == 0) { + // if no filename by command line, we construct a default filename base on container filename + filename = JSONUtils.extractSingle(md, "Container.Filename"); + if (filename == null) { + throw new IOException("No Container.Filename..."); + } + if (filename.endsWith(".warc") || filename.endsWith(".warc.gz")) { + filename = filename.replaceFirst("\\.warc$", ".warc.wat.gz"); + filename = filename.replaceFirst("\\.warc\\.gz$", ".warc.wat.gz"); + } else if (filename.endsWith(".arc") || filename.endsWith(".arc.gz")) { + filename = filename.replaceFirst("\\.arc$", ".arc.wat.gz"); + filename = filename.replaceFirst("\\.arc\\.gz$", ".arc.wat.gz"); + } } HttpHeaders headers = new HttpHeaders(); headers.add("Software-Info", IAUtils.COMMONS_VERSION); From 85902f79ded3e92f2c1dac3a4b7e4db954f893d3 Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Wed, 1 Apr 2015 18:19:32 +0200 Subject: [PATCH 04/22] Fix issue #44 --- src/main/java/org/archive/resource/warc/WARCResource.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/archive/resource/warc/WARCResource.java b/src/main/java/org/archive/resource/warc/WARCResource.java index 91883637..ce8019fb 100644 --- a/src/main/java/org/archive/resource/warc/WARCResource.java +++ b/src/main/java/org/archive/resource/warc/WARCResource.java @@ -70,9 +70,9 @@ public InputStream getInputStream() { public void notifyEOF() throws IOException { envelope.putLong(PAYLOAD_LENGTH, countingIS.getCount()); String digString = Base32.encode(digIS.getMessageDigest().digest()); - envelope.putString(PAYLOAD_DIGEST, "sha1:"+digString); if(container.isCompressed()) { metaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(response)); + metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString); } else { // consume trailing bytes if we can... InputStream raw = response.getInner(); @@ -82,6 +82,7 @@ public void notifyEOF() throws IOException { long numNewlines = StreamCopy.skipChars(pb1bis, CR_NL_CHARS); if(numNewlines > 0) { metaData.putLong(PAYLOAD_SLOP_BYTES, numNewlines); + metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString); } } } From 18acd5e2ff7511ef76f0ae4a07e680e2961da2e4 Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Wed, 1 Apr 2015 18:26:58 +0200 Subject: [PATCH 05/22] Fix issue #47 --- .../archive/extract/WATExtractorOutput.java | 14 ++++++-- src/main/java/org/archive/util/IAUtils.java | 33 +++++++++++++++++++ .../resources/org/archive/commons.properties | 5 +++ 3 files changed, 50 insertions(+), 2 deletions(-) create mode 100644 src/main/resources/org/archive/commons.properties diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index 624e591b..bfa0a802 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -91,8 +91,18 @@ private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException } } HttpHeaders headers = new HttpHeaders(); - headers.add("Software-Info", IAUtils.COMMONS_VERSION); - headers.addDateHeader("Extracted-Date", new Date()); + headers.add("software", IAUtils.COMMONS_VERSION); + headers.addDateHeader("extractedDate", new Date()); + + //add ip, hostname, format, etc. + headers.add("ip", InetAddress.getLocalHost().getHostAddress()); + headers.add("hostname", InetAddress.getLocalHost().getHostName()); + headers.add("format", IAUtils.WARC_FORMAT); + headers.add("conformsTo", IAUtils.WARC_FORMAT_CONFORMS_TO); + headers.add("operator", IAUtils.OPERATOR); + headers.add("publisher", IAUtils.PUBLISHER); + headers.add("description", IAUtils.WAT_WARCINFO_DESCRIPTION); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); headers.write(baos); recW.writeWARCInfoRecord(recOut,filename,baos.toByteArray()); diff --git a/src/main/java/org/archive/util/IAUtils.java b/src/main/java/org/archive/util/IAUtils.java index ed563d02..d3cf5cf9 100644 --- a/src/main/java/org/archive/util/IAUtils.java +++ b/src/main/java/org/archive/util/IAUtils.java @@ -24,7 +24,10 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.io.Reader; +import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; +import java.util.Properties; /** * Miscellaneous useful methods. @@ -35,6 +38,11 @@ public class IAUtils { public final static Charset UTF8 = Charset.forName("utf-8"); final public static String COMMONS_VERSION = loadCommonsVersion(); + final public static String PUBLISHER = loadCommons("publisher"); + final public static String OPERATOR = loadCommons("operator"); + final public static String WAT_WARCINFO_DESCRIPTION = loadCommons("wat.warcinfo.description"); + final public static String WARC_FORMAT = loadCommons("warc.format"); + final public static String WARC_FORMAT_CONFORMS_TO = loadCommons("warc.format.conforms.to"); public static String loadCommonsVersion() { InputStream input = IAUtils.class.getResourceAsStream( @@ -57,6 +65,31 @@ public static String loadCommonsVersion() { return version.trim(); } + public static String loadCommons(String id) { + InputStream input = IAUtils.class.getResourceAsStream("/org/archive/commons.properties"); + Reader reader = null; + if (input == null) { + return "UNKNOWN"; + } + try { + reader = new InputStreamReader(input, "UTF-8"); + } catch (UnsupportedEncodingException e) { + return "UNKNOWN"; + } + Properties prop = new Properties(); + try { + prop.load(reader); + } catch (IOException e1) { + return "UNKNOWN"; + } + if (prop.getProperty(id) != null) { + return prop.getProperty(id); + } else { + return "UNKNOWN"; + } + + } + public static void closeQuietly(Object input) { if(input == null || ! (input instanceof Closeable)) { return; diff --git a/src/main/resources/org/archive/commons.properties b/src/main/resources/org/archive/commons.properties new file mode 100644 index 00000000..0e43575d --- /dev/null +++ b/src/main/resources/org/archive/commons.properties @@ -0,0 +1,5 @@ +operator=BnF +publisher=Bibliotheque nationale de France +wat.warcinfo.description=Description du format WAT +warc.format=WARC File Format 1.0 +warc.format.conforms.to=http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf \ No newline at end of file From 7b964c64818f4d9319462dc64d06da5b5379e443 Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Wed, 1 Apr 2015 18:30:19 +0200 Subject: [PATCH 06/22] changing version in pom.xml to 1.1.7-SNAPSHOT --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 7a32de08..a94b12f7 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.6-SNAPSHOT + 1.1.7-SNAPSHOT jar webarchive-commons From d8b589c3db88092dada70525db979578e29a7163 Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Wed, 1 Apr 2015 18:33:49 +0200 Subject: [PATCH 07/22] Fix issue #42 : adding a missing import --- src/main/java/org/archive/extract/ResourceExtractor.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/org/archive/extract/ResourceExtractor.java b/src/main/java/org/archive/extract/ResourceExtractor.java index 11c61f4d..c91db42a 100644 --- a/src/main/java/org/archive/extract/ResourceExtractor.java +++ b/src/main/java/org/archive/extract/ResourceExtractor.java @@ -1,6 +1,7 @@ package org.archive.extract; import java.io.FileNotFoundException; +import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; From d5827b3d506cd3019ebbd2af1793f92190733ac5 Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Tue, 7 Apr 2015 17:08:26 +0200 Subject: [PATCH 08/22] Extraction from WAT to CDX : correcting RealCDXExtractorOutput.java and WARCMetadataRecordExtractorOutput.java to match new WARC header (eg 'WARC' --> 'WARC/1.0') --- src/main/java/org/archive/extract/RealCDXExtractorOutput.java | 2 +- .../org/archive/extract/WARCMetadataRecordExtractorOutput.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java index 8ca3ff82..e6f6e82f 100644 --- a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java +++ b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java @@ -104,7 +104,7 @@ public void output(Resource resource) throws IOException { String meta = "TBD"; String redir = "TBD"; - if(format.equals("WARC")) { + if(format.startsWith("WARC")) { origUrl = getWARCURL(m); date = getWARCDate(m); String type = getWARCType(m); diff --git a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java index ff46a914..68f9d1c8 100644 --- a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java +++ b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java @@ -68,7 +68,7 @@ public void output(Resource resource) throws IOException { String date = "TBD"; String canUrl = "TBD"; - if(format.equals("WARC")) { + if(format.startsWith("WARC")) { origUrl = getWARCURL(m); date = getWARCDate(m); String type = getWARCType(m); From 618fda8293a2737d7d80df4bfc5fa3d4e264a463 Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Tue, 14 Apr 2015 17:38:20 +0200 Subject: [PATCH 09/22] correcting whitespace around assignment operator and renaming 'Entity-Trailing-Slop-Bytes' into 'Entity-Trailing-Slop-Length' --- src/main/java/org/archive/extract/ResourceExtractor.java | 2 +- src/main/java/org/archive/resource/ResourceConstants.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/extract/ResourceExtractor.java b/src/main/java/org/archive/extract/ResourceExtractor.java index c91db42a..2812aa5b 100644 --- a/src/main/java/org/archive/extract/ResourceExtractor.java +++ b/src/main/java/org/archive/extract/ResourceExtractor.java @@ -94,7 +94,7 @@ public int run(String[] args) if(args.length >= arg + 2) { //if a output file is specified in the command line if(args.length == arg + 3) { - outputFile=args[arg+2]; + outputFile = args[arg+2]; os.close(); os = new FileOutputStream(outputFile); } diff --git a/src/main/java/org/archive/resource/ResourceConstants.java b/src/main/java/org/archive/resource/ResourceConstants.java index 2ab86de2..3b8bea1c 100644 --- a/src/main/java/org/archive/resource/ResourceConstants.java +++ b/src/main/java/org/archive/resource/ResourceConstants.java @@ -105,7 +105,7 @@ public interface ResourceConstants { public static final String HTTP_ENTITY_LENGTH = "Entity-Length"; public static final String HTTP_ENTITY_DIGEST = "Entity-Digest"; - public static final String HTTP_ENTITY_TRAILING_SLOP = "Entity-Trailing-Slop-Bytes"; + public static final String HTTP_ENTITY_TRAILING_SLOP = "Entity-Trailing-Slop-Length"; public static final String HTML_METADATA = "HTML-Metadata"; public static final String HTML_HEAD = "Head"; From 6128ab3ad5955834696346ea34d9d0413b9e3b43 Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Thu, 16 Apr 2015 13:09:32 +0200 Subject: [PATCH 10/22] adding changes in CHANGES.md --- CHANGES.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index b872846d..b5d2ee85 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,11 @@ +1.1.7 +----- +* [WAT extractor: adding information in WAT's warcinfo](https://github.com/iipc/webarchive-commons/issues/47) +* [WAT extractor: missing WARC format version](https://github.com/iipc/webarchive-commons/issues/45) +* [WAT extractor: envelope structure does not conform to the WAT specification](https://github.com/iipc/webarchive-commons/issues/44) +* [WAT extractor: WARC-Date in all records should be the WAT record generation date](https://github.com/iipc/webarchive-commons/issues/43) +* [WAT extractor: WARC-Filename in the WAT warcinfo record should be the WAT filename itself](https://github.com/iipc/webarchive-commons/issues/42) + 1.1.5 ----- * [Escape redirect URLs in RealCDXExtractorOutput](https://github.com/iipc/webarchive-commons/pull/36) From 92fdfd4e658d17d58ebda20e29bc20fb6b4ec069 Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Fri, 17 Apr 2015 09:19:34 +0200 Subject: [PATCH 11/22] putting 'Actual-Content-Lenght into metaData + removing path from filename --- src/main/java/org/archive/extract/WATExtractorOutput.java | 4 ++++ src/main/java/org/archive/resource/warc/WARCResource.java | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index bfa0a802..090d3e06 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -2,6 +2,7 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; @@ -90,6 +91,9 @@ private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException filename = filename.replaceFirst("\\.arc\\.gz$", ".arc.wat.gz"); } } + //removing path from filename + File tmpFile = new File(filename); + filename = tmpFile.getName(); HttpHeaders headers = new HttpHeaders(); headers.add("software", IAUtils.COMMONS_VERSION); headers.addDateHeader("extractedDate", new Date()); diff --git a/src/main/java/org/archive/resource/warc/WARCResource.java b/src/main/java/org/archive/resource/warc/WARCResource.java index ce8019fb..d538a25d 100644 --- a/src/main/java/org/archive/resource/warc/WARCResource.java +++ b/src/main/java/org/archive/resource/warc/WARCResource.java @@ -68,9 +68,9 @@ public InputStream getInputStream() { } public void notifyEOF() throws IOException { - envelope.putLong(PAYLOAD_LENGTH, countingIS.getCount()); String digString = Base32.encode(digIS.getMessageDigest().digest()); if(container.isCompressed()) { + metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount()); metaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(response)); metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString); } else { @@ -81,6 +81,7 @@ public void notifyEOF() throws IOException { (PushBackOneByteInputStream) raw; long numNewlines = StreamCopy.skipChars(pb1bis, CR_NL_CHARS); if(numNewlines > 0) { + metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount()); metaData.putLong(PAYLOAD_SLOP_BYTES, numNewlines); metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString); } From ce23f3651bc34c6c1eab9967fd81882edf242bd0 Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Tue, 21 Apr 2015 15:18:34 +0200 Subject: [PATCH 12/22] generic defaults values for parameters in commons.properties --- src/main/resources/org/archive/commons.properties | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/resources/org/archive/commons.properties b/src/main/resources/org/archive/commons.properties index 0e43575d..d0c0afd0 100644 --- a/src/main/resources/org/archive/commons.properties +++ b/src/main/resources/org/archive/commons.properties @@ -1,5 +1,5 @@ -operator=BnF -publisher=Bibliotheque nationale de France -wat.warcinfo.description=Description du format WAT +operator= +publisher= +wat.warcinfo.description= warc.format=WARC File Format 1.0 -warc.format.conforms.to=http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf \ No newline at end of file +warc.format.conforms.to=http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf From 4752056bb1df9e77fa04dc125d8a2a3624d0a9e2 Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Tue, 21 Apr 2015 16:33:29 +0200 Subject: [PATCH 13/22] removing Actual-Content-Length and Trailing-Slop-Length from WARC-Metadata-Metadata --- .../resource/warc/record/WARCMetaDataResourceFactory.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java index 3f502665..d9cc2228 100644 --- a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java +++ b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java @@ -33,8 +33,8 @@ public Resource getResource(InputStream is, MetaData parentMetaData, if(headers.isCorrupt()) { md.putBoolean(WARC_META_FIELDS_CORRUPT, true); } - md.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is)); - md.putLong(PAYLOAD_LENGTH, bytes); + //md.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is)); + //md.putLong(PAYLOAD_LENGTH, bytes); return new WARCMetaDataResource(md,container, headers); } catch (HttpParseException e) { From d7b7c2aaaeccccb4ffb75a2cd5b276adc3fef0e7 Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Wed, 22 Apr 2015 16:11:54 +0200 Subject: [PATCH 14/22] removing Actual-Content-Lenght and Trailing-Slop-Length from Warc-MetaData-MetaData --- .../resource/warc/record/WARCMetaDataResourceFactory.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java index d9cc2228..0dfb2834 100644 --- a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java +++ b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java @@ -33,8 +33,8 @@ public Resource getResource(InputStream is, MetaData parentMetaData, if(headers.isCorrupt()) { md.putBoolean(WARC_META_FIELDS_CORRUPT, true); } - //md.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is)); - //md.putLong(PAYLOAD_LENGTH, bytes); + parentMetaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is)); + parentMetaData.putLong(PAYLOAD_LENGTH, bytes); return new WARCMetaDataResource(md,container, headers); } catch (HttpParseException e) { From 8e9cd33132d76dc045e0cf3837562805cde0693d Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Thu, 23 Apr 2015 11:59:24 +0200 Subject: [PATCH 15/22] removing default values in common.properties and making them optional --- .../org/archive/extract/WATExtractorOutput.java | 13 ++++++++++--- src/main/resources/org/archive/commons.properties | 6 +++--- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index 090d3e06..1a82a03a 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -103,9 +103,16 @@ private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException headers.add("hostname", InetAddress.getLocalHost().getHostName()); headers.add("format", IAUtils.WARC_FORMAT); headers.add("conformsTo", IAUtils.WARC_FORMAT_CONFORMS_TO); - headers.add("operator", IAUtils.OPERATOR); - headers.add("publisher", IAUtils.PUBLISHER); - headers.add("description", IAUtils.WAT_WARCINFO_DESCRIPTION); + //optionnal arguments + if(IAUtils.OPERATOR != null && IAUtils.OPERATOR.length() > 0) { + headers.add("operator", IAUtils.OPERATOR); + } + if(IAUtils.PUBLISHER != null && IAUtils.PUBLISHER.length() > 0) { + headers.add("publisher", IAUtils.PUBLISHER); + } + if(IAUtils.WAT_WARCINFO_DESCRIPTION != null && IAUtils.WAT_WARCINFO_DESCRIPTION.length() > 0) { + headers.add("description", IAUtils.WAT_WARCINFO_DESCRIPTION); + } ByteArrayOutputStream baos = new ByteArrayOutputStream(); headers.write(baos); diff --git a/src/main/resources/org/archive/commons.properties b/src/main/resources/org/archive/commons.properties index d0c0afd0..f115ff43 100644 --- a/src/main/resources/org/archive/commons.properties +++ b/src/main/resources/org/archive/commons.properties @@ -1,5 +1,5 @@ -operator= -publisher= -wat.warcinfo.description= +operator= +publisher= +wat.warcinfo.description= warc.format=WARC File Format 1.0 warc.format.conforms.to=http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf From 8740b56681b8d1440f9a85b4dc8de2e32a96fd04 Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Mon, 4 May 2015 10:45:47 +0200 Subject: [PATCH 16/22] adding spaces in comments for consistency --- src/main/java/org/archive/extract/WATExtractorOutput.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index 1a82a03a..f91aabd7 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -91,19 +91,19 @@ private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException filename = filename.replaceFirst("\\.arc\\.gz$", ".arc.wat.gz"); } } - //removing path from filename + // removing path from filename File tmpFile = new File(filename); filename = tmpFile.getName(); HttpHeaders headers = new HttpHeaders(); headers.add("software", IAUtils.COMMONS_VERSION); headers.addDateHeader("extractedDate", new Date()); - //add ip, hostname, format, etc. + // add ip, hostname, format, etc. headers.add("ip", InetAddress.getLocalHost().getHostAddress()); headers.add("hostname", InetAddress.getLocalHost().getHostName()); headers.add("format", IAUtils.WARC_FORMAT); headers.add("conformsTo", IAUtils.WARC_FORMAT_CONFORMS_TO); - //optionnal arguments + // optional arguments if(IAUtils.OPERATOR != null && IAUtils.OPERATOR.length() > 0) { headers.add("operator", IAUtils.OPERATOR); } From b1dc2a014685809e3e00b397c33bebfa9522d540 Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Mon, 11 May 2015 10:06:30 +0200 Subject: [PATCH 17/22] changing 1.1.7 -> 1.1.6 in CHANGES.md --- CHANGES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index b5d2ee85..ae98dbea 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,4 +1,4 @@ -1.1.7 +1.1.6 ----- * [WAT extractor: adding information in WAT's warcinfo](https://github.com/iipc/webarchive-commons/issues/47) * [WAT extractor: missing WARC format version](https://github.com/iipc/webarchive-commons/issues/45) From 2dc62c287faa84fb7e1cdd6ca7a019d5d488af10 Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Mon, 11 May 2015 10:08:45 +0200 Subject: [PATCH 18/22] adding issue #48 in CHANGES.md --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index ae98dbea..abe7e971 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,6 +5,7 @@ * [WAT extractor: envelope structure does not conform to the WAT specification](https://github.com/iipc/webarchive-commons/issues/44) * [WAT extractor: WARC-Date in all records should be the WAT record generation date](https://github.com/iipc/webarchive-commons/issues/43) * [WAT extractor: WARC-Filename in the WAT warcinfo record should be the WAT filename itself](https://github.com/iipc/webarchive-commons/issues/42) + [WAT extractor: Entity-Trailing-Slop-Bytes should be called Entity-Trailing-Slop-Length](https://github.com/iipc/webarchive-commons/issues/48) 1.1.5 ----- From ba89c3c5114b621d6f770b13d9f94204ab44b7ca Mon Sep 17 00:00:00 2001 From: Lam Mai Date: Mon, 11 May 2015 10:09:39 +0200 Subject: [PATCH 19/22] Update CHANGES.md --- CHANGES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index abe7e971..e00300f5 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,7 +5,7 @@ * [WAT extractor: envelope structure does not conform to the WAT specification](https://github.com/iipc/webarchive-commons/issues/44) * [WAT extractor: WARC-Date in all records should be the WAT record generation date](https://github.com/iipc/webarchive-commons/issues/43) * [WAT extractor: WARC-Filename in the WAT warcinfo record should be the WAT filename itself](https://github.com/iipc/webarchive-commons/issues/42) - [WAT extractor: Entity-Trailing-Slop-Bytes should be called Entity-Trailing-Slop-Length](https://github.com/iipc/webarchive-commons/issues/48) +* [WAT extractor: Entity-Trailing-Slop-Bytes should be called Entity-Trailing-Slop-Length](https://github.com/iipc/webarchive-commons/issues/48) 1.1.5 ----- From 16e0c91ed4ba8b87c2a6bab41d97d3aee6643a76 Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Wed, 24 Jun 2015 14:14:26 +0200 Subject: [PATCH 20/22] changing getHostName to getCanonicalHostName to conform to Heritrix --- src/main/java/org/archive/extract/WATExtractorOutput.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index f91aabd7..6d265a47 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -100,7 +100,7 @@ private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException // add ip, hostname, format, etc. headers.add("ip", InetAddress.getLocalHost().getHostAddress()); - headers.add("hostname", InetAddress.getLocalHost().getHostName()); + headers.add("hostname", InetAddress.getLocalHost().getCanonicalHostName()); headers.add("format", IAUtils.WARC_FORMAT); headers.add("conformsTo", IAUtils.WARC_FORMAT_CONFORMS_TO); // optional arguments From b0fbabd195249022549599396bc4028f5ded9329 Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Fri, 26 Jun 2015 11:45:55 +0200 Subject: [PATCH 21/22] catching UnknownHostException similar to the Heritrix code --- .../archive/extract/WATExtractorOutput.java | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index 6d265a47..aa4fe254 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -8,6 +8,7 @@ import java.io.OutputStreamWriter; import java.nio.charset.Charset; import java.text.ParseException; +import java.net.UnknownHostException; import java.util.Date; import org.archive.format.gzip.GZIPMemberWriter; @@ -27,6 +28,8 @@ import java.text.DateFormat; import java.text.SimpleDateFormat; +import java.util.logging.Logger; + public class WATExtractorOutput implements ExtractorOutput { WARCRecordWriter recW; private boolean wroteFirst; @@ -36,6 +39,8 @@ public class WATExtractorOutput implements ExtractorOutput { private final static Charset UTF8 = Charset.forName("UTF-8"); private String outputFile; + private static final Logger LOG = Logger.getLogger(WATExtractorOutput.class.getName()); + public WATExtractorOutput(OutputStream out, String outputFile) { gzW = new GZIPMemberWriter(out); recW = new WARCRecordWriter(); @@ -97,10 +102,16 @@ private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException HttpHeaders headers = new HttpHeaders(); headers.add("software", IAUtils.COMMONS_VERSION); headers.addDateHeader("extractedDate", new Date()); - - // add ip, hostname, format, etc. - headers.add("ip", InetAddress.getLocalHost().getHostAddress()); - headers.add("hostname", InetAddress.getLocalHost().getCanonicalHostName()); + + // add ip, hostname + try { + InetAddress host = InetAddress.getLocalHost(); + headers.add("ip", host.getHostAddress()); + headers.add("hostname", host.getCanonicalHostName()); + } catch (UnknownHostException e) { + LOG.warning("unable top obtain local crawl engine host :\n"+e.getMessage()); + } + headers.add("format", IAUtils.WARC_FORMAT); headers.add("conformsTo", IAUtils.WARC_FORMAT_CONFORMS_TO); // optional arguments From 6a0f6dc6c19b9b34f710c8f2bedd517cecd17987 Mon Sep 17 00:00:00 2001 From: Khanh-Lam Mai Date: Fri, 3 Jul 2015 15:46:44 +0200 Subject: [PATCH 22/22] correcting typo in log message --- src/main/java/org/archive/extract/WATExtractorOutput.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index aa4fe254..3bcfa924 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -109,7 +109,7 @@ private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException headers.add("ip", host.getHostAddress()); headers.add("hostname", host.getCanonicalHostName()); } catch (UnknownHostException e) { - LOG.warning("unable top obtain local crawl engine host :\n"+e.getMessage()); + LOG.warning("unable to obtain local crawl engine host :\n"+e.getMessage()); } headers.add("format", IAUtils.WARC_FORMAT);