From a2cc42cac2777d06ab40e09811cdc883773775b9 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 11 Jun 2020 14:24:03 +0200 Subject: [PATCH 1/2] WAT extractor: do not fail on missing WARC-Filename in warcinfo record, fixes #88 - do not throw IOException if there is no WARC-Filename in warcinfo record - write metadata record (corresponding to warcinfo) without WARC-Target-URI --- src/main/java/org/archive/extract/WATExtractorOutput.java | 2 +- src/main/java/org/archive/format/warc/WARCRecordWriter.java | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index 3bcfa924..4b5f72ed 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -151,7 +151,7 @@ private void writeWARC(OutputStream recOut, MetaData md) throws IOException { String warcType = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Type"); String targetURI; if(warcType.equals("warcinfo")) { - targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Filename"); + targetURI = JSONUtils.extractSingle(md, "Envelope.WARC-Header-Metadata.WARC-Filename"); } else { targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI"); } diff --git a/src/main/java/org/archive/format/warc/WARCRecordWriter.java b/src/main/java/org/archive/format/warc/WARCRecordWriter.java index 0aab83b7..3278b289 100644 --- a/src/main/java/org/archive/format/warc/WARCRecordWriter.java +++ b/src/main/java/org/archive/format/warc/WARCRecordWriter.java @@ -88,7 +88,10 @@ public void writeJSONMetadataRecord( OutputStream out, { HttpHeaders headers = new HttpHeaders(); headers.add(HEADER_KEY_TYPE, WARCRecordType.metadata.name()); - headers.add(HEADER_KEY_URI, targetURI); + if (targetURI != null) { + // WARC-Target-URI is optional in metadata records + headers.add(HEADER_KEY_URI, targetURI); + } 
headers.add(HEADER_KEY_DATE, DateUtils.getLog14Date(originalDate)); headers.add(HEADER_KEY_ID, makeRecordId()); headers.add(HEADER_KEY_REFERS_TO, origRecordId); From 04e10397b9137a36812c17276826bc60d1a37ede Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Mon, 15 Jun 2020 13:29:25 +0200 Subject: [PATCH 2/2] Update change log to include #83, #85, #86 and #89 --- CHANGES.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index dcb598d9..bf985ada 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,10 @@ +1.1.10 +------ +* [WAT extractor: do not fail on missing WARC-Filename in warcinfo record](https://github.com/iipc/webarchive-commons/pull/89) +* [ExtractingParseObserver: extract rel, hreflang and type attributes](https://github.com/iipc/webarchive-commons/pull/86) +* [ExtractingParseObserver: extract links from onClick attributes](https://github.com/iipc/webarchive-commons/pull/85) +* [Update TravisCI config](https://github.com/iipc/webarchive-commons/pull/83) + 1.1.9 ----- * [Use commons-collections v3.2.2 to avoid v3.2.1 vulnerability](https://github.com/iipc/webarchive-commons/pull/77)