From c1545bc7bee9c9bbd8626cf1b4b8d323bd415f2c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 28 Oct 2015 11:45:48 -0700 Subject: [PATCH 001/211] fix for HER-2089 - get rid of broken, seemingly unnecessary escapeWhitespace() step of uri fixup --- .../org/archive/url/UsableURIFactory.java | 52 +------------------ .../org/archive/url/UsableURIFactoryTest.java | 8 ++- 2 files changed, 9 insertions(+), 51 deletions(-) diff --git a/src/main/java/org/archive/url/UsableURIFactory.java b/src/main/java/org/archive/url/UsableURIFactory.java index 9118b850..1059bfbd 100644 --- a/src/main/java/org/archive/url/UsableURIFactory.java +++ b/src/main/java/org/archive/url/UsableURIFactory.java @@ -49,8 +49,8 @@ * @author stack */ public class UsableURIFactory extends URI { - - private static final long serialVersionUID = -6146295130382209042L; + + private static final long serialVersionUID = 2L; /** * Logging instance. @@ -395,9 +395,6 @@ private String fixup(String uri, final URI base, final String charset) } TextUtils.recycleMatcher(matcher); - // now, minimally escape any whitespace - uri = escapeWhitespace(uri); - // For further processing, get uri elements. See the RFC2396REGEX // comment above for explanation of group indices used in the below. // matcher = RFC2396REGEX.matcher(uri); @@ -663,51 +660,6 @@ private String ensureMinimalEscaping(String u, final String charset, return u; } - /** - * Escape any whitespace found. - * - * The parent class takes care of the bulk of escaping. But if any - * instance of escaping is found in the URI, then we ask for parent - * to do NO escaping. Here we escape any whitespace found irrespective - * of whether the uri has already been escaped. We do this for - * case where uri has been judged already-escaped only, its been - * incompletly done and whitespace remains. Spaces, etc., in the URI are - * a real pain. Their presence will break log file and ARC parsing. - * @param uri URI string to check. - * @return uri with spaces escaped if any found. - */ - protected String escapeWhitespace(String uri) { - // Just write a new string anyways. The perl '\s' is not - // as inclusive as the Character.isWhitespace so there are - // whitespace characters we could miss. So, rather than - // write some awkward regex, just go through the string - // a character at a time. Only create buffer first time - // we find a space. - MutableString buffer = null; - for (int i = 0; i < uri.length(); i++) { - char c = uri.charAt(i); - if (Character.isWhitespace(c)) { - if (buffer == null) { - buffer = new MutableString(uri.length() + - 2 /*If space, two extra characters (at least)*/); - buffer.append(uri.substring(0, i)); - } - buffer.append("%"); - String hexStr = Integer.toHexString(c); - if ((hexStr.length() % 2) > 0) { - buffer.append("0"); - } - buffer.append(hexStr); - - } else { - if (buffer != null) { - buffer.append(c); - } - } - } - return (buffer != null)? buffer.toString(): uri; - } - /** * Check port on passed http authority. Make sure the size is not larger * than allowed: See the 'port' definition on this diff --git a/src/test/java/org/archive/url/UsableURIFactoryTest.java b/src/test/java/org/archive/url/UsableURIFactoryTest.java index af190957..73f2b6db 100644 --- a/src/test/java/org/archive/url/UsableURIFactoryTest.java +++ b/src/test/java/org/archive/url/UsableURIFactoryTest.java @@ -174,7 +174,7 @@ public final void testWhitespaceEscaped() throws URIException { assertTrue("Not equal " + uuri.toString(), uuri.toString().equals(tgtUri)); uri = "http://archive.org/index%25\u001D.html"; - tgtUri = "http://archive.org/index%25%1D.html".toLowerCase(); + tgtUri = "http://archive.org/index%25%1D.html"; uuri = UsableURIFactory.getInstance(uri); assertEquals("whitespace escaping", tgtUri, uuri.toString()); uri = "http://gemini.info.usaid.gov/directory/" + @@ -185,6 +185,12 @@ public final void testWhitespaceEscaped() throws URIException { "faxResults.cfm?name=Ebenezer +Rumplestiltskin,&location=" + "RRB%20%20%20%205%2E08%2D006"); assertEquals("whitespace escaping", tgtUri, uuri.toString()); + + // https://webarchive.jira.com/browse/HER-2089 + uri = "http://archive.org/index%25\u3000.html"; + tgtUri = "http://archive.org/index%25%E3%80%80.html"; + uuri = UsableURIFactory.getInstance(uri); + assertEquals("U+3000 ideographic space escaping", tgtUri, uuri.toString()); } // public final void testFailedGetPath() throws URIException { From 86589b0fafaa0918ce2192080e68941c47b39c40 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 14 Dec 2015 10:38:49 -0800 Subject: [PATCH 002/211] flush output etc before tallying stats to fix sizeOnDisk calculation --- src/main/java/org/archive/io/warc/WARCWriter.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/archive/io/warc/WARCWriter.java b/src/main/java/org/archive/io/warc/WARCWriter.java index e2d28ee9..7e22e08b 100644 --- a/src/main/java/org/archive/io/warc/WARCWriter.java +++ b/src/main/java/org/archive/io/warc/WARCWriter.java @@ -236,8 +236,8 @@ public void writeRecord(WARCRecordInfo recordInfo) long totalBytes = 0; long startPosition; - try { - startPosition = getPosition(); + startPosition = getPosition(); + try { preWriteRecordTasks(); // TODO: Revisit encoding of header. @@ -261,13 +261,12 @@ public void writeRecord(WARCRecordInfo recordInfo) write(CRLF_BYTES); totalBytes += 2 * CRLF_BYTES.length; - tally(recordInfo.getType(), contentBytes, totalBytes, getPosition() - startPosition); - recordInfo.setWARCFilename(getFilenameWithoutOccupiedSuffix()); recordInfo.setWARCFileOffset(startPosition); tmpRecordLog.add(recordInfo); } finally { postWriteRecordTasks(); + tally(recordInfo.getType(), contentBytes, totalBytes, getPosition() - startPosition); } } From 1fede2354f65437825b6471261a8f0361ffba241 Mon Sep 17 00:00:00 2001 From: Jeremy Wiebe Date: Mon, 7 Mar 2016 22:35:28 -0500 Subject: [PATCH 003/211] Store origin-code in ARCRecord header; accessible through getOrigin() method. --- .../org/archive/format/ArchiveFileConstants.java | 7 ++++++- .../java/org/archive/format/arc/ARCConstants.java | 2 +- src/main/java/org/archive/io/arc/ARCRecord.java | 13 +++++++++---- .../java/org/archive/io/arc/ARCRecordMetaData.java | 9 ++++++++- 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/archive/format/ArchiveFileConstants.java b/src/main/java/org/archive/format/ArchiveFileConstants.java index b0b8aa66..df3b4465 100644 --- a/src/main/java/org/archive/format/ArchiveFileConstants.java +++ b/src/main/java/org/archive/format/ArchiveFileConstants.java @@ -44,6 +44,11 @@ public interface ArchiveFileConstants { * Key for the Archive File version field. */ public static final String VERSION_FIELD_KEY = "version"; + + /** + * Key for the Archive File origin-code field. + */ + public static final String ORIGIN_FIELD_KEY = "origin"; /** * Key for the Archive File length field. @@ -80,7 +85,7 @@ public interface ArchiveFileConstants { * Key for the Archive Record absolute offset into Archive file. */ public static final String ABSOLUTE_OFFSET_KEY = "absolute-offset"; - + public static final String READER_IDENTIFIER_FIELD_KEY = "reader-identifier"; diff --git a/src/main/java/org/archive/format/arc/ARCConstants.java b/src/main/java/org/archive/format/arc/ARCConstants.java index a336ddeb..5987b49f 100755 --- a/src/main/java/org/archive/format/arc/ARCConstants.java +++ b/src/main/java/org/archive/format/arc/ARCConstants.java @@ -196,7 +196,7 @@ public interface ARCConstants extends ArchiveFileConstants { .asList(new String[] { URL_FIELD_KEY, IP_HEADER_FIELD_KEY, DATE_FIELD_KEY, MIMETYPE_FIELD_KEY, LENGTH_FIELD_KEY, VERSION_FIELD_KEY, - ABSOLUTE_OFFSET_KEY }); + ORIGIN_FIELD_KEY, ABSOLUTE_OFFSET_KEY }); /** * Minimum possible record length. diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java index 21bea07c..2d9c9bf4 100644 --- a/src/main/java/org/archive/io/arc/ARCRecord.java +++ b/src/main/java/org/archive/io/arc/ARCRecord.java @@ -200,7 +200,7 @@ public ARCRecord(InputStream in, ArchiveRecordHeader metaData, public ARCRecord(InputStream in, final String identifier, final long offset, boolean digest, boolean strict, final boolean parseHttpHeaders, - final boolean isAlignedOnFirstRecord, String version) + final boolean isAlignedOnFirstRecord, String version) throws IOException { super(in, null, 0, digest, strict); setHeader(parseHeaders(in, identifier, offset, strict, isAlignedOnFirstRecord, version)); @@ -243,6 +243,7 @@ private ArchiveRecordHeader parseHeaders(final InputStream in, getTokenizedHeaderLine(in, firstLineValues); int bodyOffset = 0; + String origin = ""; if (offset == 0 && isAlignedOnFirstRecord) { // If offset is zero and we were aligned at first record on // creation (See #alignedOnFirstRecord for more on this), then no @@ -263,6 +264,7 @@ private ArchiveRecordHeader parseHeaders(final InputStream in, bodyOffset += getTokenizedHeaderLine(in, secondLineValues); version = ((String)secondLineValues.get(0) + "." + (String)secondLineValues.get(1)); + origin = (String)secondLineValues.get(2); // Just read over the 3rd line. We used to parse it and use // values found here but now we just hardcode them to avoid // having to read this 3rd line even for random arc file accesses. @@ -271,7 +273,8 @@ private ArchiveRecordHeader parseHeaders(final InputStream in, } setBodyOffset(bodyOffset); - return computeMetaData(this.headerFieldNameKeys, firstLineValues, version, offset, identifier); + return computeMetaData(this.headerFieldNameKeys, firstLineValues, + version, origin, offset, identifier); } /** @@ -362,7 +365,8 @@ private int getTokenizedHeaderLine(final InputStream stream, * @exception IOException If no. of keys doesn't match no. of values. */ private ARCRecordMetaData computeMetaData(List keys, - List values, String v, long offset, final String identifier) + List values, String v, String origin, + long offset, final String identifier) throws IOException { if (keys.size() != values.size()) { List originalValues = values; @@ -423,6 +427,7 @@ private ARCRecordMetaData computeMetaData(List keys, } headerFields.put(VERSION_FIELD_KEY, v); + headerFields.put(ORIGIN_FIELD_KEY, origin); headerFields.put(ABSOLUTE_OFFSET_KEY, new Long(offset)); return new ARCRecordMetaData(identifier, headerFields); @@ -832,4 +837,4 @@ protected String getDigest4Cdx(ArchiveRecordHeader h) { } return (result != null) ? result: super.getDigest4Cdx(h); } -} \ No newline at end of file +} diff --git a/src/main/java/org/archive/io/arc/ARCRecordMetaData.java b/src/main/java/org/archive/io/arc/ARCRecordMetaData.java index 3f617041..02b368e4 100644 --- a/src/main/java/org/archive/io/arc/ARCRecordMetaData.java +++ b/src/main/java/org/archive/io/arc/ARCRecordMetaData.java @@ -168,6 +168,13 @@ public String getVersion() { return (String)this.headerFields.get(VERSION_FIELD_KEY); } + /** + * @return Arcfile origin code. + */ + public String getOrigin() { + return (String)this.headerFields.get(ORIGIN_FIELD_KEY); + } + /** * @return Offset into arcfile at which this record begins. */ @@ -264,4 +271,4 @@ public int getContentBegin() { protected void setContentBegin(final int offset) { this.contentBegin = offset; } -} \ No newline at end of file +} From 28c9a1b2b04c9f392247690c7112ae20882d8cbc Mon Sep 17 00:00:00 2001 From: Jeremy Wiebe Date: Fri, 11 Mar 2016 11:45:42 -0500 Subject: [PATCH 004/211] Update ArchiveFileConstants.java --- src/main/java/org/archive/format/ArchiveFileConstants.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/archive/format/ArchiveFileConstants.java b/src/main/java/org/archive/format/ArchiveFileConstants.java index df3b4465..89e1308c 100644 --- a/src/main/java/org/archive/format/ArchiveFileConstants.java +++ b/src/main/java/org/archive/format/ArchiveFileConstants.java @@ -46,7 +46,7 @@ public interface ArchiveFileConstants { public static final String VERSION_FIELD_KEY = "version"; /** - * Key for the Archive File origin-code field. + * Key for the Archive File origin-code field. This value is often hard-coded, so use with care. */ public static final String ORIGIN_FIELD_KEY = "origin"; From 7ef8aa95bc758d96b60a30d036dd0c32de20937c Mon Sep 17 00:00:00 2001 From: Jeremy Wiebe Date: Mon, 14 Mar 2016 17:10:23 -0400 Subject: [PATCH 005/211] Update CHANGES.md --- CHANGES.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 70f9b052..3c9f4c8b 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,7 @@ +1.1.7 +----- +* [Store origin-code of ARC file header](https://github.com/iipc/webarchive-commons/pull/52/) + 1.1.6 ----- * [Handle empty String argument in CharsetDetector.trimAttrValue](https://github.com/iipc/webarchive-commons/pull/49) From 5cfff50a03263208520ca2d260229eefb2aec2f7 Mon Sep 17 00:00:00 2001 From: Hunter Stern Date: Mon, 21 Mar 2016 17:30:30 -0700 Subject: [PATCH 006/211] Make canonicalizer be able to strip session id params even if they are the first params in the query string. And add session id strip test. And change IAURLCanonicalizer.java to ensure that if after transformations on the query string have completed and the query is empty, there is not a ? added to the end of the url. --- .../org/archive/url/IAURLCanonicalizer.java | 35 +++++++++---------- .../org/archive/url/URLRegexTransformer.java | 10 +++--- .../archive/url/IAURLCanonicalizerTest.java | 10 ++++++ 3 files changed, 32 insertions(+), 23 deletions(-) diff --git a/src/main/java/org/archive/url/IAURLCanonicalizer.java b/src/main/java/org/archive/url/IAURLCanonicalizer.java index 029598f6..0cf7c8a4 100644 --- a/src/main/java/org/archive/url/IAURLCanonicalizer.java +++ b/src/main/java/org/archive/url/IAURLCanonicalizer.java @@ -63,25 +63,24 @@ public void canonicalize(HandyURL url) { String query = url.getQuery(); if(query != null) { - if(query.equals("")) { - if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_EMPTY)) { - query = null; - } - } else { - // we have a query... what to do with it? + // we have a query... what to do with it? - // first remove uneeded: - if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_SESSION_ID)) { - query = URLRegexTransformer.stripQuerySessionID(query); - } - // lower-case: - if(rules.isSet(QUERY_SETTINGS, QUERY_LOWERCASE)) { - query = query.toLowerCase(); - } - // re-order? - if(rules.isSet(QUERY_SETTINGS, QUERY_ALPHA_REORDER)) { - query = alphaReorderQuery(query); - } + // first remove uneeded: + if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_SESSION_ID)) { + query = URLRegexTransformer.stripQuerySessionID(query); + } + // lower-case: + if(rules.isSet(QUERY_SETTINGS, QUERY_LOWERCASE)) { + query = query.toLowerCase(); + } + // re-order? + if(rules.isSet(QUERY_SETTINGS, QUERY_ALPHA_REORDER)) { + query = alphaReorderQuery(query); + } + if(query.equals("")) { + if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_EMPTY)) { + query = null; + } } url.setQuery(query); } diff --git a/src/main/java/org/archive/url/URLRegexTransformer.java b/src/main/java/org/archive/url/URLRegexTransformer.java index c5505a74..617e0225 100644 --- a/src/main/java/org/archive/url/URLRegexTransformer.java +++ b/src/main/java/org/archive/url/URLRegexTransformer.java @@ -16,11 +16,11 @@ public class URLRegexTransformer { private static final OptimizedPattern QUERY_OPTS[] = { - new OptimizedPattern("(?i)^(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), - new OptimizedPattern("(?i)^(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), - new OptimizedPattern("(?i)^(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), - new OptimizedPattern("(?i)^(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), - new OptimizedPattern("(?i)^(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", "cftoken=", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", "cftoken=", 1, 2), }; diff --git a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java index 3263edc7..91751b4a 100644 --- a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java +++ b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java @@ -53,5 +53,15 @@ public void testGetDefaultPort() { assertEquals(80,IAURLCanonicalizer.getDefaultPort("http")); assertEquals(443,IAURLCanonicalizer.getDefaultPort("https")); } + + public void testStripSessionId() throws URISyntaxException { + IAURLCanonicalizer iaC = new IAURLCanonicalizer(new DefaultIACanonicalizerRules()); + compCan(iaC, + "http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008&jsessionid=f030eacc7e49c4ca0b077922347418418766", + "http://nsf.gov/statistics/sed/2009/sed_2009.zip?jsessionid=f030eacc7e49c4ca0b077922347418418766"); + compCan(iaC, + "http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008", + "http://nsf.gov/statistics/sed/2009/sed_2009.zip"); + } } From 02e6e29fb735b1fdd0957196d264b40d29e6fa6d Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Fri, 17 Jun 2016 09:54:36 +0200 Subject: [PATCH 007/211] Updated release notes --- CHANGES.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 3c9f4c8b..52c40f42 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,9 @@ 1.1.7 ----- +* [Make canonicalizer be able to strip session id params even if they are the first params in the query string](https://github.com/iipc/webarchive-commons/pull/54) * [Store origin-code of ARC file header](https://github.com/iipc/webarchive-commons/pull/52/) +* [Flush output etc before tallying stats to fix sizeOnDisk calculation](https://github.com/iipc/webarchive-commons/pull/51) +* [Get rid of broken, seemingly unnecessary escapeWhitespace() step of uri fixup](https://github.com/iipc/webarchive-commons/pull/50) 1.1.6 ----- From a55391dfe1855259939d118c49b84cf386c0960f Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Fri, 17 Jun 2016 10:25:23 +0200 Subject: [PATCH 008/211] [maven-release-plugin] prepare release webarchive-commons-1.1.7 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 7984edde..f842a09c 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.7-SNAPSHOT + 1.1.7 jar webarchive-commons From bb36b6a7375453e1cb8073211041ca3f955ab217 Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Fri, 17 Jun 2016 10:25:28 +0200 Subject: [PATCH 009/211] [maven-release-plugin] prepare for next development iteration --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index f842a09c..24780063 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.7 + 1.1.8-SNAPSHOT jar webarchive-commons From 0cbca57bc87f9bd55844977a480ead400a40920d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristinn=20Sigur=C3=B0sson?= Date: Wed, 13 Jul 2016 12:21:42 +0000 Subject: [PATCH 010/211] Remove invalid constant The PROFILE_REVISIT_URI_AGNOSTIC_IDENTICAL_DIGEST does not exist in the WARC specification. This file shouldn't include non-standard items. And, in any case, use of PROFILE_REVISIT_IDENTICAL_DIGEST is appropriate, even when using 'uri agnostic' deduplication. --- src/main/java/org/archive/format/warc/WARCConstants.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/java/org/archive/format/warc/WARCConstants.java b/src/main/java/org/archive/format/warc/WARCConstants.java index c9f6cbf3..93a81f96 100644 --- a/src/main/java/org/archive/format/warc/WARCConstants.java +++ b/src/main/java/org/archive/format/warc/WARCConstants.java @@ -183,8 +183,6 @@ enum WARCRecordType { public static final String HEADER_KEY_REFERS_TO_FILENAME = "WARC-Refers-To-Filename"; public static final String HEADER_KEY_REFERS_TO_FILE_OFFSET = "WARC-Refers-To-File-Offset"; - public static final String PROFILE_REVISIT_URI_AGNOSTIC_IDENTICAL_DIGEST = - "http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest"; public static final String PROFILE_REVISIT_IDENTICAL_DIGEST = "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest"; public static final String PROFILE_REVISIT_NOT_MODIFIED = From a23cfebe24a959c929b1fcf9fbb6fc37eae31c76 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sun, 7 Aug 2016 16:49:47 +0200 Subject: [PATCH 011/211] Make regular expression to extract URLs from CSS more restrictive (allow only `"`, `'`, `\"` or `\'` in front of or after the URL). Avoid long-runners when matching the regex due to heavy back-tracking. --- .../java/org/archive/resource/html/ExtractingParseObserver.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index e1f57b55..df3742fa 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -21,7 +21,7 @@ public class ExtractingParseObserver implements ParseObserver { boolean inTitle = false; protected static String cssUrlPatString = - "url\\s*\$\\s*([\\\\\"']*.+?[\\\\\"']*)\\s*\$"; + "url\\s*\$\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\$"; protected static String cssImportNoUrlPatString = "@import\\s+(('[^']+')|(\"[^\"]+\")|(\$'[^']+'\$)|(\$\"[^\"]+\"\$)|(\$[^)]+\$)|([a-z0-9_.:/\\\\-]+))\\s*;"; From 9d7abed43aef409e19842f875914c50a0b58ccf8 Mon Sep 17 00:00:00 2001 From: David Portabella Date: Wed, 21 Sep 2016 11:54:18 +0200 Subject: [PATCH 012/211] fix: last header was lost if LF LF (intead of CRLF CRLF) --- .../archive/format/http/HttpHeaderParser.java | 1 + .../format/http/HttpResponseParserTest.java | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/src/main/java/org/archive/format/http/HttpHeaderParser.java b/src/main/java/org/archive/format/http/HttpHeaderParser.java index fdec62f2..d63ec405 100755 --- a/src/main/java/org/archive/format/http/HttpHeaderParser.java +++ b/src/main/java/org/archive/format/http/HttpHeaderParser.java @@ -231,6 +231,7 @@ public ParseState handleByte(byte b, HttpHeaderParser parser) if(b == LF) { // TODO: this is lax, is LFLF an OK terminator? // that's all folks! + parser.headerFinished(); parser.parseFinished(); return parser.endState; } diff --git a/src/test/java/org/archive/format/http/HttpResponseParserTest.java b/src/test/java/org/archive/format/http/HttpResponseParserTest.java index 2850fe44..c0d13230 100644 --- a/src/test/java/org/archive/format/http/HttpResponseParserTest.java +++ b/src/test/java/org/archive/format/http/HttpResponseParserTest.java @@ -38,4 +38,23 @@ public void testParse() throws IOException { } + public void testParseWithLf() throws IOException { + + HttpResponseParser parser = new HttpResponseParser(); + String message = "200 OK\nContent-Type: text/plain\n\nHi there"; + try { + HttpResponse response = + parser.parse(new ByteArrayInputStream(message.getBytes(IAUtils.UTF8))); + assertNotNull(response); + HttpHeaders headers = response.getHeaders(); + assertNotNull(headers); + assertEquals(1,headers.size()); + + } catch (HttpParseException e) { + e.printStackTrace(); + fail(); + } + + } + } From 5f223d60c365a53533b2ad7217deaa65b3a91667 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 24 Nov 2016 11:51:10 +0100 Subject: [PATCH 013/211] Use CharsetDetector to guess encoding of HTML document --- .../resource/html/HTMLResourceFactory.java | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java index 935843f1..34062ed9 100644 --- a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java +++ b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java @@ -1,9 +1,14 @@ package org.archive.resource.html; +import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; +import org.archive.format.http.HttpHeaders; +import org.archive.format.json.JSONUtils; +import org.archive.format.text.charset.CharsetDetector; +import org.archive.format.text.charset.StandardCharsetDetector; import org.archive.format.text.html.CDATALexer; import org.archive.format.text.html.LexParser; import org.archive.resource.MetaData; @@ -13,17 +18,40 @@ import org.archive.resource.ResourceParseException; import org.htmlparser.lexer.Page; import org.htmlparser.util.ParserException; +import org.json.JSONException; +import org.json.JSONObject; public class HTMLResourceFactory implements ResourceFactory { + protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192; + protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers"; + + protected CharsetDetector charSetDetector = new StandardCharsetDetector(); + + public Resource getResource(InputStream is, MetaData parentMetaData, ResourceContainer container) throws ResourceParseException, IOException { HTMLMetaData hmd = new HTMLMetaData(parentMetaData); ExtractingParseObserver epo = new ExtractingParseObserver(hmd); LexParser parser = new LexParser(epo); CDATALexer lex = new CDATALexer(); - // TODO: figure out charset: - String charset = "UTF-8"; + + // guess charset based on HTTP header and sniffed content chunk + is = new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE); + byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE]; + is.mark(0); + int chunkSize = is.read(chunk, 0, CHARSET_GUESS_CHUNK_SIZE); + is.reset(); + JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH); + HttpHeaders httpHeaders = new HttpHeaders(); + if (headers.has("Content-Type")) { + try { + httpHeaders.add("Content-Type", headers.getString("Content-Type")); + } catch (JSONException e) { } + } + + String charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders); + Page page; try { page = new Page(is, charset); From 607acaa734183b72c816359c588bbf157485d5ba Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 24 Nov 2016 12:44:53 +0100 Subject: [PATCH 014/211] HTML encoding detection: fix errors with empty content or empty charset values --- .../format/text/charset/CharsetDetector.java | 2 ++ .../resource/html/HTMLResourceFactory.java | 24 +++++++++++++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java index ae71b5fa..0534ff85 100644 --- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java +++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java @@ -182,6 +182,8 @@ private static String trimAttrValue(String value) { return value; } String result = value; + if (result.isEmpty()) + return result; if (result.charAt(0) == '"') { result = result.substring(1, result.length() - 1); } else if (result.charAt(0) == '\'') { diff --git a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java index 34062ed9..afb1c850 100644 --- a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java +++ b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java @@ -5,6 +5,8 @@ import java.io.InputStream; import java.io.UnsupportedEncodingException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.archive.format.http.HttpHeaders; import org.archive.format.json.JSONUtils; import org.archive.format.text.charset.CharsetDetector; @@ -23,6 +25,8 @@ public class HTMLResourceFactory implements ResourceFactory { + public static final Log LOG = LogFactory.getLog(HTMLResourceFactory.class); + protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192; protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers"; @@ -37,21 +41,27 @@ public Resource getResource(InputStream is, MetaData parentMetaData, CDATALexer lex = new CDATALexer(); // guess charset based on HTTP header and sniffed content chunk + String charset = "UTF-8"; is = new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE); byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE]; is.mark(0); int chunkSize = is.read(chunk, 0, CHARSET_GUESS_CHUNK_SIZE); is.reset(); - JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH); - HttpHeaders httpHeaders = new HttpHeaders(); - if (headers.has("Content-Type")) { + if (chunkSize > 0) { + JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH); + HttpHeaders httpHeaders = new HttpHeaders(); + if (headers.has("Content-Type")) { + try { + httpHeaders.add("Content-Type", headers.getString("Content-Type")); + } catch (JSONException e) { } + } try { - httpHeaders.add("Content-Type", headers.getString("Content-Type")); - } catch (JSONException e) { } + charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders); + } catch (Exception e) { + LOG.error("Failed to guess charset: " + e.getMessage()); + } } - String charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders); - Page page; try { page = new Page(is, charset); From 824dd82f5f9c9e60392ece498f8e5d44a7e431b9 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 24 Nov 2016 14:05:55 +0100 Subject: [PATCH 015/211] Match http-equiv meta elements with unquoted attribute values, e.g. --- .../org/archive/format/text/charset/CharsetDetector.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java index 0534ff85..9b4c8523 100644 --- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java +++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java @@ -60,7 +60,8 @@ public abstract class CharsetDetector { private final static String META_CONTENT_ATTR_PATTERN_STRING = "\\b" + META_CONTENT_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?"; private final static String META_HTTP_EQUIV_ATTR_PATTERN_STRING = "\\b" + - META_HTTP_EQUIV_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?"; + META_HTTP_EQUIV_ATTRIBUTE + "\\s*=\\s*(" + META_CONTENT_TYPE + "|" + + ANY_ATTR_VALUE + ")(?:\\s|>)?"; @@ -183,7 +184,7 @@ private static String trimAttrValue(String value) { } String result = value; if (result.isEmpty()) - return result; + return result; if (result.charAt(0) == '"') { result = result.substring(1, result.length() - 1); } else if (result.charAt(0) == '\'') { @@ -232,7 +233,6 @@ public static String findMetaContentType(String pageSample) { protected String getCharsetFromBytes(byte buffer[], int len) throws IOException { String charsetName = null; - UniversalDetector detector = new UniversalDetector(null); detector.handleData(buffer, 0, len); detector.dataEnd(); From 9e41abcb36c585dd1cd9622f0eeeaddb0faae111 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 9 Dec 2016 15:35:10 +0100 Subject: [PATCH 016/211] Strip empty port, do not fail --- src/main/java/org/archive/url/URLParser.java | 24 +++++++++++-------- .../archive/url/IAURLCanonicalizerTest.java | 1 + .../archive/url/WaybackURLKeyMakerTest.java | 1 + 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/archive/url/URLParser.java b/src/main/java/org/archive/url/URLParser.java index 98e4c1aa..83d3c386 100644 --- a/src/main/java/org/archive/url/URLParser.java +++ b/src/main/java/org/archive/url/URLParser.java @@ -246,16 +246,20 @@ public static HandyURL parse(String urlString) throws URISyntaxException { colonPort = uriAuthority.substring(portColonIndex); } if(colonPort != null) { - if(colonPort.startsWith(":")) { - try { - port = Integer.parseInt(colonPort.substring(1)); - } catch(NumberFormatException e) { - throw new URISyntaxException(urlString, "bad port " - + colonPort.substring(1)); - } - } else { - // XXX: what's happened?! - } + if(colonPort.startsWith(":")) { + if (colonPort.length() == 1) { + // a bare colon (http://example.com:/), use default port + } else { + try { + port = Integer.parseInt(colonPort.substring(1)); + } catch(NumberFormatException e) { + throw new URISyntaxException(urlString, "bad port " + + colonPort.substring(1)); + } + } + } else { + // XXX: what's happened?! + } } if(userInfo != null) { int passColonIndex = userInfo.indexOf(COLON); diff --git a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java index 91751b4a..e2c46258 100644 --- a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java +++ b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java @@ -12,6 +12,7 @@ public void testFull() throws URISyntaxException { compCan(iaC,"https://www.archive.org:80/","https://archive.org:80/"); compCan(iaC,"http://www.archive.org:443/","http://archive.org:443/"); compCan(iaC,"https://www.archive.org:443/","https://archive.org/"); + compCan(iaC,"http://www.archive.org:/","http://archive.org/"); compCan(iaC,"http://www.archive.org/big/","http://archive.org/big"); compCan(iaC,"dns:www.archive.org","dns:www.archive.org"); diff --git a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java index 34bfe625..26161456 100644 --- a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java +++ b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java @@ -22,6 +22,7 @@ public void testMakeKey() throws URISyntaxException { assertEquals("org,archive)/goo", km.makeKey("http://archive.org/goo/?")); assertEquals("org,archive)/goo?a&b", km.makeKey("http://archive.org/goo/?b&a")); assertEquals("org,archive)/goo?a=1&a=2&b", km.makeKey("http://archive.org/goo/?a=2&b&a=1")); + assertEquals("org,archive)/", km.makeKey("http://archive.org:/")); } } From b918f7f18e94c58a4a74d97e98f3c19465466595 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 4 Jan 2017 18:21:22 +0100 Subject: [PATCH 017/211] Improve clipping of quotation marks in CSS link extraction - clip multiple quotation marks Fix StringIndexOutOfBoundsException in patternCSSExtract - correct check for min. required URL lenght when stripping 4 characters (2 at each end) - simplified code, use non-capturing groups in regular expression --- .../html/ExtractingParseObserver.java | 79 ++++++++++--------- .../html/ExtractingParseObserverTest.java | 48 ++++++----- 2 files changed, 70 insertions(+), 57 deletions(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index df3742fa..45a48808 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -23,7 +23,7 @@ public class ExtractingParseObserver implements ParseObserver { protected static String cssUrlPatString = "url\\s*\$\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\$"; protected static String cssImportNoUrlPatString = - "@import\\s+(('[^']+')|(\"[^\"]+\")|(\$'[^']+'\$)|(\$\"[^\"]+\"\$)|(\$[^)]+\$)|([a-z0-9_.:/\\\\-]+))\\s*;"; + "@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\$'[^']+'\$)|(?:\$\"[^\"]+\"\$)|(?:\$[^)]+\$)|(?:[a-z0-9_.:/\\\\-]+))\\s*;"; protected static Pattern cssImportNoUrlPattern = Pattern .compile(cssImportNoUrlPatString); @@ -368,40 +368,45 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs } } } - private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) { - Matcher m = pattern.matcher(content); - int idx = 0; - int contentLen = content.length(); - while((idx < contentLen) && m.find(idx)) { - String url = m.group(1); - int origUrlLength = url.length(); - int urlStart = m.start(1); - int urlEnd = m.end(1); - idx = urlEnd; - if(url.length() < 2) { - continue; - } - if ((url.charAt(0) == '(') - && (url.charAt(origUrlLength-1) == ')')) { - url = url.substring(1, origUrlLength - 1); - urlStart += 1; - origUrlLength -= 2; - } - if (url.charAt(0) == '"') { - url = url.substring(1, origUrlLength - 1); - urlStart += 1; - } else if (url.charAt(0) == '\'') { - url = url.substring(1, origUrlLength - 1); - urlStart += 1; - } else if (url.charAt(0) == '\\') { - if(url.length() == 2) - continue; - url = url.substring(2, origUrlLength - 2); - urlStart += 2; - } - int urlLength = url.length(); - data.addHref("path","STYLE/#text","href",url); - idx += urlLength; - } - } + private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) { + Matcher m = pattern.matcher(content); + int idx = 0; + int contentLen = content.length(); + if (contentLen > 100000) + // extract URLs only from the first 100 kB + contentLen = 100000; + FIND: + while((idx < contentLen) && m.find()) { + idx = m.end(); + String url = m.group(1); + if(url.length() < 2) { + continue; + } + if ((url.charAt(0) == '(') + && (url.charAt(url.length()-1) == ')')) { + url = url.substring(1, url.length() - 1); + } + CLIP: + while (url.length() > 1) { + if ((url.charAt(0) == '"' || url.charAt(0) == '\'') + && (url.charAt(url.length() - 1) == '"' + || url.charAt(url.length() - 1) == '\'')) { + if(url.length() <= 2) { + // empty URL + continue FIND; + } + url = url.substring(1, url.length() - 1); + } else if (url.charAt(0) == '\\') { + if(url.length() <= 4) { + // empty URL + continue FIND; + } + url = url.substring(2, url.length() - 2); + } else { + break CLIP; + } + } + data.addHref("path","STYLE/#text","href",url); + } + } } diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index 24b6c18a..236b964b 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -19,7 +19,9 @@ public void testHandleStyleNodeExceptions() throws Exception { "url (' ')", "url('\")", "url(')", - "url('\"')" + "url('\"')", + "url('\\\"\"')", + "url(''''')" }; boolean except = false; HTMLMetaData md = new HTMLMetaData(new MetaData()); @@ -37,6 +39,7 @@ public void testHandleStyleNodeExceptions() throws Exception { assertFalse(except); } } + public void testHandleStyleNode() throws Exception { String[][] tests = { {""}, @@ -45,31 +48,35 @@ public void testHandleStyleNode() throws Exception { {"url(\"foo.gif\")","foo.gif"}, {"url(\\\"foo.gif\\\")","foo.gif"}, {"url(\\'foo.gif\\')","foo.gif"}, - - }; + {"url(''foo.gif'')","foo.gif"}, + {"url( foo.gif )","foo.gif"}, + {"url('''')"} + }; for(String[] testa : tests) { checkExtract(testa); } - // boolean except = false; -// HTMLMetaData md = new HTMLMetaData(new MetaData()); -// ExtractingParseObserver epo = new ExtractingParseObserver(md); -// for(String css : tests) { -// try { -// TextNode tn = new TextNode(css); -// epo.handleStyleNode(tn); -// } catch(Exception e) { -// System.err.format("And the winner is....(%s)\n", css); -// e.printStackTrace(); -// except = true; -// throw e; -// } -// assertFalse(except); -// } } + + /** + * Test whether the pattern matcher does extract nothing and also does not + * not hang-up if an overlong CSS link is truncated. + */ + public void testHandleStyleNodeNoHangupTruncated() throws Exception { + StringBuilder sb = new StringBuilder(); + sb.append("url("); + for (int i = 0; i < 500000; i++) + sb.append('\''); + sb.append("foo.gif"); + for (int i = 0; i < 499000; i++) + sb.append('\''); + String[] test = new String[1]; + test[0] = sb.toString(); + checkExtract(test); + } + private void checkExtract(String[] data) throws JSONException { // System.err.format("CSS(%s) want[0](%s)\n",css,want[0]); String css = data[0]; - boolean except = false; HTMLMetaData md = new HTMLMetaData(new MetaData()); ExtractingParseObserver epo = new ExtractingParseObserver(md); try { @@ -87,7 +94,8 @@ private void checkExtract(String[] data) throws JSONException { assertTrue(o instanceof JSONObject); JSONObject jo = (JSONObject) o; - assertEquals(data[i],jo.getString("href")); + assertEquals("CSS link extraction failed for <" + css + ">", + data[i], jo.getString("href")); } } else { assertNull(a); From 194a1faecf30905c840d71d0bc22b6ea5d6a61fe Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 18 Jan 2017 12:29:43 +0100 Subject: [PATCH 018/211] CSS link extraction: clip also unpaired leading and trailing quotation marks --- .../html/ExtractingParseObserver.java | 64 +++++++------------ .../html/ExtractingParseObserverTest.java | 9 +-- 2 files changed, 27 insertions(+), 46 deletions(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index 45a48808..deb8c8c0 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -22,13 +22,18 @@ public class ExtractingParseObserver implements ParseObserver { protected static String cssUrlPatString = "url\\s*\$\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\$"; + protected static String cssUrlTrimPatString = + "^(?:\\\\?[\"'])+|(?:\\\\?[\"'])+$"; protected static String cssImportNoUrlPatString = - "@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\$'[^']+'\$)|(?:\$\"[^\"]+\"\$)|(?:\$[^)]+\$)|(?:[a-z0-9_.:/\\\\-]+))\\s*;"; + "@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\$'[^']+'\$)|(?:\$\"[^\"]+\"\$)|(?:\$[^)]+\$)|(?:[a-z0-9_.:/\\\\-]+))\\s*;"; protected static Pattern cssImportNoUrlPattern = Pattern .compile(cssImportNoUrlPatString); protected static Pattern cssUrlPattern = Pattern.compile(cssUrlPatString); + + protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString); + private final static int MAX_TEXT_LEN = 100; // private static String GLOBAL_ATTR[] = {"background"}; @@ -368,45 +373,20 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs } } } - private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) { - Matcher m = pattern.matcher(content); - int idx = 0; - int contentLen = content.length(); - if (contentLen > 100000) - // extract URLs only from the first 100 kB - contentLen = 100000; - FIND: - while((idx < contentLen) && m.find()) { - idx = m.end(); - String url = m.group(1); - if(url.length() < 2) { - continue; - } - if ((url.charAt(0) == '(') - && (url.charAt(url.length()-1) == ')')) { - url = url.substring(1, url.length() - 1); - } - CLIP: - while (url.length() > 1) { - if ((url.charAt(0) == '"' || url.charAt(0) == '\'') - && (url.charAt(url.length() - 1) == '"' - || url.charAt(url.length() - 1) == '\'')) { - if(url.length() <= 2) { - // empty URL - continue FIND; - } - url = url.substring(1, url.length() - 1); - } else if (url.charAt(0) == '\\') { - if(url.length() <= 4) { - // empty URL - continue FIND; - } - url = url.substring(2, url.length() - 2); - } else { - break CLIP; - } - } - data.addHref("path","STYLE/#text","href",url); - } - } + private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) { + Matcher m = pattern.matcher(content); + int idx = 0; + int contentLen = content.length(); + if (contentLen > 100000) + // extract URLs only from the first 100 kB + contentLen = 100000; + while((idx < contentLen) && m.find()) { + idx = m.end(); + String url = m.group(1); + url = cssUrlTrimPattern.matcher(url).replaceAll(""); + if (!url.isEmpty()) { + data.addHref("path","STYLE/#text","href", url); + } + } + } } diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index 236b964b..bfbd6f02 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -20,8 +20,8 @@ public void testHandleStyleNodeExceptions() throws Exception { "url('\")", "url(')", "url('\"')", - "url('\\\"\"')", - "url(''''')" + "url('\\\"\"')", + "url(''''')" }; boolean except = false; HTMLMetaData md = new HTMLMetaData(new MetaData()); @@ -50,7 +50,8 @@ public void testHandleStyleNode() throws Exception { {"url(\\'foo.gif\\')","foo.gif"}, {"url(''foo.gif'')","foo.gif"}, {"url( foo.gif )","foo.gif"}, - {"url('''')"} + {"url('''')"}, + {"url('foo.gif'')","foo.gif"}, }; for(String[] testa : tests) { checkExtract(testa); @@ -98,7 +99,7 @@ private void checkExtract(String[] data) throws JSONException { data[i], jo.getString("href")); } } else { - assertNull(a); + assertNull("Expected no extracted link for <" + css + ">", a); } } From 038402885f85a426601d5f85936e210e4f55636f Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 27 Jan 2017 08:59:25 +0100 Subject: [PATCH 019/211] CharsetDetector: remove unnecessary check for empty string (contributed by @ldko) --- .../java/org/archive/format/text/charset/CharsetDetector.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java index 9b4c8523..690f8b99 100644 --- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java +++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java @@ -183,8 +183,6 @@ private static String trimAttrValue(String value) { return value; } String result = value; - if (result.isEmpty()) - return result; if (result.charAt(0) == '"') { result = result.substring(1, result.length() - 1); } else if (result.charAt(0) == '\'') { From 1364716a83911369de7256aa1718a236acb75973 Mon Sep 17 00:00:00 2001 From: Lauren Ko Date: Tue, 14 Feb 2017 17:07:36 -0600 Subject: [PATCH 020/211] Logging changes for next release. --- CHANGES.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 52c40f42..fee29e16 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,11 @@ +1.1.8 +----- +* [Strip empty port via URLParser](https://github.com/iipc/webarchive-commons/pull/69/) +* [Use CharsetDetector to guess encoding of HTML documents](https://github.com/iipc/webarchive-commons/pull/68/) +* [Fix last header was lost if LF LF](https://github.com/iipc/webarchive-commons/pull/65/) +* [Make regular expression to extract URLs from CSS more restrictive](https://github.com/iipc/webarchive-commons/pull/63) +* [Remove invalid constant `PROFILE_REVISIT_URI_AGNOSTIC_IDENTICAL_DIGEST`](https://github.com/iipc/webarchive-commons/pull/62) + 1.1.7 ----- * [Make canonicalizer be able to strip session id params even if they are the first params in the query string](https://github.com/iipc/webarchive-commons/pull/54) @@ -36,10 +44,10 @@ 1.1.2 ----- -* Fixed support for reading uncompressed WARCs, along with some unit testing. (https://github.com/iipc/webarchive-commons/pull/12) +* [Fixed support for reading uncompressed WARCs, along with some unit testing.](https://github.com/iipc/webarchive-commons/pull/12) 1.1.1 ----- -* Renamed from commons-webarchive to webarchive-commons (https://github.com/iipc/webarchive-commons/pull/8) -* Cope with malformed GZip extra fields as produced by wget 1.14 (https://github.com/iipc/webarchive-commons/pull/10) -* Switch to httpcomponents, and add IA deployment information. (https://github.com/iipc/webarchive-commons/pull/11) +* [Renamed from commons-webarchive to webarchive-commons](https://github.com/iipc/webarchive-commons/pull/8) +* [Cope with malformed GZip extra fields as produced by wget 1.14](https://github.com/iipc/webarchive-commons/pull/10) +* [Switch to httpcomponents, and add IA deployment information.](https://github.com/iipc/webarchive-commons/pull/11) From 11579c2baab0db08f14341f70b848353eed17269 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 22 Feb 2017 13:11:13 +0100 Subject: [PATCH 021/211] Improve HTML link extraction - add extractors for more elements which can take URLs as attribute values, add missing attributes - generalize extraction of "global" attributes (`background`) - add custom data attributes frequently used for linking (`data-href`, `data-uri`) - add unit test to cover link extraction --- .../html/ExtractingParseObserver.java | 79 ++++- .../html/ExtractingParseObserverTest.java | 161 +++++++++ .../resource/html/link-extraction-test.warc | 320 ++++++++++++++++++ 3 files changed, 551 insertions(+), 9 deletions(-) create mode 100644 src/test/resources/org/archive/resource/html/link-extraction-test.warc diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index deb8c8c0..826851e0 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -2,12 +2,17 @@ import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; +import java.util.Locale; import java.util.Map; +import java.util.Set; import java.util.Stack; +import java.util.Vector; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.archive.format.text.html.ParseObserver; +import org.htmlparser.Attribute; import org.htmlparser.nodes.RemarkNode; import org.htmlparser.nodes.TagNode; import org.htmlparser.nodes.TextNode; @@ -36,11 +41,10 @@ public class ExtractingParseObserver implements ParseObserver { private final static int MAX_TEXT_LEN = 100; -// private static String GLOBAL_ATTR[] = {"background"}; - private static final String PATH = "path"; private static final String PATH_SEPARATOR = "@/"; - private final static Map extractors; + private static final Map extractors; + private static final Set globalHrefAttributes; static { extractors = new HashMap(); extractors.put("A", new AnchorTagExtractor()); @@ -57,6 +61,22 @@ public class ExtractingParseObserver implements ParseObserver { extractors.put("META", new MetaTagExtractor()); extractors.put("OBJECT", new ObjectTagExtractor()); extractors.put("SCRIPT", new ScriptTagExtractor()); + extractors.put("Q", new QuotationLinkTagExtractor()); + extractors.put("BLOCKQUOTE", new QuotationLinkTagExtractor()); + extractors.put("DEL", new QuotationLinkTagExtractor()); + extractors.put("INS", new QuotationLinkTagExtractor()); + // HTML5: + extractors.put("BUTTON", new ButtonTagExtractor()); + extractors.put("MENUITEM", new MenuitemTagExtractor()); + extractors.put("VIDEO", new EmbedVideoTagExtractor()); + extractors.put("AUDIO", new EmbedTagExtractor()); + extractors.put("TRACK", new EmbedTagExtractor()); + extractors.put("SOURCE", new EmbedTagExtractor()); + + globalHrefAttributes = new HashSet(); + globalHrefAttributes.add("background"); + globalHrefAttributes.add("data-href"); + globalHrefAttributes.add("data-uri"); } @@ -84,11 +104,19 @@ public void handleTagOpen(TagNode tag) { inTitle = !tag.isEmptyXmlTag(); return; } + // first the global attributes: - // background - String v = tag.getAttribute("background"); - if(v != null) { - data.addHref(PATH,makePath(name,"background"),"url",v); + Vector attributes = tag.getAttributesEx(); + for (Attribute a : attributes) { + String attrName = a.getName(); + String attrValue = a.getValue(); + if (attrName == null || attrValue == null) { + continue; + } + attrName = attrName.toLowerCase(Locale.ROOT); + if (globalHrefAttributes.contains(attrName)) { + data.addHref(PATH,makePath(name,attrName),"url",attrValue); + } } // TODO: style attribute, BASE(href) tag, Resolve URLs @@ -296,12 +324,24 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs } } + private static class ButtonTagExtractor implements TagExtractor { + public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { + addBasicHrefs(data,node,"formaction"); + } + } + private static class EmbedTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { addBasicHrefs(data,node,"src"); } } + private static class EmbedVideoTagExtractor implements TagExtractor { + public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { + addBasicHrefs(data,node,"src","poster"); + } + } + private static class FormTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { ArrayList l = new ArrayList(); @@ -329,21 +369,26 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs addBasicHrefs(data,node,"src"); } } + private static class IFrameTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { addBasicHrefs(data,node,"src"); } } + private static class ImgTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { addHrefWithAttrs(data,node,"src","alt","title"); + addBasicHrefs(data,node,"longdesc"); } } + private static class InputTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { - addBasicHrefs(data,node,"src"); + addBasicHrefs(data,node,"src","formaction"); } } + private static class LinkTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { ArrayList l = getAttrListUrl(node,"href","rel","type"); @@ -352,6 +397,13 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs } } } + + private static class MenuitemTagExtractor implements TagExtractor { + public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { + addBasicHrefs(data,node,"icon"); + } + } + private static class MetaTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { ArrayList l = getAttrList(node,"name","rel","content","http-equiv"); @@ -360,11 +412,19 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs } } } + private static class ObjectTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { - addBasicHrefs(data,node,"codebase","cdata"); + addBasicHrefs(data,node,"codebase","cdata","data"); } } + + private static class QuotationLinkTagExtractor implements TagExtractor { + public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { + addBasicHrefs(data,node,"cite"); + } + } + private static class ScriptTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { ArrayList l = getAttrListUrl(node,"src","type"); @@ -373,6 +433,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs } } } + private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) { Matcher m = pattern.matcher(content); int idx = 0; diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index bfbd6f02..8f690a06 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -1,15 +1,33 @@ package org.archive.resource.html; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Logger; + +import org.archive.extract.ExtractingResourceFactoryMapper; +import org.archive.extract.ExtractingResourceProducer; +import org.archive.extract.ProducerUtils; +import org.archive.extract.ResourceFactoryMapper; import org.archive.resource.MetaData; +import org.archive.resource.Resource; +import org.archive.resource.ResourceParseException; +import org.archive.resource.ResourceProducer; import org.htmlparser.nodes.TextNode; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + import junit.framework.TestCase; public class ExtractingParseObserverTest extends TestCase { + private static final Logger LOG = + Logger.getLogger(ExtractingParseObserverTest.class.getName()); + public void testHandleStyleNodeExceptions() throws Exception { String[] tests = { "some css", @@ -103,5 +121,148 @@ private void checkExtract(String[] data) throws JSONException { } } + private void checkLink(Multimap links, String url, String path) { + assertTrue("Link with URL " + url + " not found", links.containsKey(url)); + assertTrue("Wrong path " + path + " for " + url, links.get(url).contains(path)); + } + + private void checkLinks(Resource resource, String[][] expectedLinks) { + assertNotNull(resource); + assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource); + MetaData md = resource.getMetaData(); + LOG.info(md.toString()); + Multimap links = ArrayListMultimap.create(); + JSONObject head = md.optJSONObject("Head"); + if (head != null) { + // + String baseUrl = (String) head.opt("Base"); + if (baseUrl != null) { + links.put(baseUrl, "__base__"); + } + // + JSONArray metas = head.optJSONArray("Metas"); + if (metas != null) { + for (int i = 0; i < metas.length(); i++) { + JSONObject o = (JSONObject) metas.optJSONObject(i); + String httpEquiv = o.optString("http-equiv"); + if (httpEquiv != null && httpEquiv.equalsIgnoreCase("Refresh")) { + String metaRefreshTarget = o.optString("content"); + if (metaRefreshTarget != null) { + metaRefreshTarget = metaRefreshTarget.replaceFirst("(?i)(?:^\\d+\\s*;)?\\s*url=", ""); + links.put(metaRefreshTarget, "__meta_refresh__"); + } + } + } + } + } + // extract outlinks + List linkArrays = new ArrayList(); + if (md.optJSONArray("Links") != null) { + linkArrays.add(md.optJSONArray("Links")); + } + try { + if (md.getJSONObject("Head") != null && md.getJSONObject("Head").getJSONArray("Link") != null) { + linkArrays.add(md.getJSONObject("Head").getJSONArray("Link")); + } + } catch (JSONException e1) { + } + for (JSONArray ldata : linkArrays) { + for (int i = 0; i < ldata.length(); i++) { + JSONObject o = (JSONObject) ldata.optJSONObject(i); + try { + String url = o.getString("url"); + links.put(url, o.getString("path")); + LOG.info(" found link: " + o.getString("url") + " " + o.getString("path")); + } catch (JSONException e) { + fail("Failed to extract URL from link: " + e.getMessage()); + } + } + } + assertEquals("Unexpected number of links", expectedLinks.length, links.size()); + for (String[] l : expectedLinks) { + checkLink(links, l[0], l[1]); + } + } + + public void testLinkExtraction() throws ResourceParseException, IOException { + String testFileName = "link-extraction-test.warc"; + ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath()); + ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper(); + ExtractingResourceProducer extractor = + new ExtractingResourceProducer(producer, mapper); + extractor.getNext(); // skip warcinfo record + String[][] html4links = { + {"http://www.example.com/", "__base__"}, + {"http://www.example.com/redirected.html", "__meta_refresh__"}, + {"background.jpg", "BODY@/background"}, + {"http://www.example.com/a-href.html", "A@/href"}, + {"#anchor", "A@/href"}, + {"image.png", "IMG@/src"}, + {"image.gif", "IMG@/src"}, + {"http://example.com/image-description.html#image.gif", "IMG@/longdesc"}, + {"helloworld.swf", "OBJECT@/data"}, + {"http://www.example.com/shakespeare.html", "Q@/cite"}, + {"http://www.example.com/shakespeare-long.html", "BLOCKQUOTE@/cite"} + }; + checkLinks(extractor.getNext(), html4links); + String[][] html5links = { + {"http:///www.example.com/video.html", "LINK@/href", "canonical"}, + {"video.rss", "LINK@/href", "alternate"}, + {"https://archive.org/download/WebmVp8Vorbis/webmvp8.gif", "VIDEO@/poster"}, + {"https://archive.org/download/WebmVp8Vorbis/webmvp8.webm", "SOURCE@/src"}, + {"https://archive.org/download/WebmVp8Vorbis/webmvp8_512kb.mp4", "SOURCE@/src"}, + {"https://archive.org/download/WebmVp8Vorbis/webmvp8.ogv", "SOURCE@/src"} + }; + checkLinks(extractor.getNext(), html5links); + String[][] html5links2 = { + {"http://www.example.com/", "A@/href"}, + }; + checkLinks(extractor.getNext(), html5links2); + String[][] fbVideoLinks = { + {"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"}, + {"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"}, + {"https://www.facebook.com/facebook/", "A@/href"}, + {"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"} + }; + checkLinks(extractor.getNext(), fbVideoLinks); + String[][] dataHrefLinks = { + {"standard.css", "LINK@/href", "stylesheet"}, + {"https://www.facebook.com/elegantthemes/videos/10153760379211923/", "DIV@/data-href"}, + {"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"}, + {"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"}, + {"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"}, + {"https://www.facebook.com/facebook/", "A@/href"}, + {"//edge.flowplayer.org/bauhaus.webm", "SOURCE@/src"}, + {"//edge.flowplayer.org/bauhaus.mp4", "SOURCE@/src"}, + {"//edge.flowplayer.org/functional.webm", "BUTTON@/data-href"}, + {"/content-page", "ARTICLE@/data-href"}, + {"/content-page", "A@/href"}, + {"/tags/content","A@/href"}, + {"/tags/headlines", "A@/href"}, + {"http://grabaperch.com", "DIV@/data-href"}, + {"green.css", "LINK@/data-href"}, + {"blue.css", "LINK@/data-href"}, + {"http://codecanyon.net/user/CodingJack", "A@/data-href"}, + {"jackbox/img/thumbs/4.jpg", "IMG@/src"}, + {"//venobox-destination", "A@/data-href"}, + {"#", "A@/href"}, + {"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0&autoplay=1", "DIV@/data-href"}, + {"#", "A@/href"}, + {"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0", "IFRAME@/src"} + }; + checkLinks(extractor.getNext(), dataHrefLinks); + String[][] fbSocialLinks = { + {"http://www.your-domain.com/your-page.html", "DIV@/data-uri"}, + {"https://developers.facebook.com/docs/plugins/comments#configurator", "DIV@/data-href"}, + {"https://www.facebook.com/zuck/posts/10102735452532991?comment_id=1070233703036185", "DIV@/data-href"}, + {"https://www.facebook.com/zuck", "DIV@/data-href"}, + {"https://developers.facebook.com/docs/plugins/", "DIV@/data-href"}, + {"https://www.facebook.com/facebook", "DIV@/data-href"}, + {"https://www.facebook.com/facebook", "BLOCKQUOTE@/cite"}, + {"https://www.facebook.com/facebook", "A@/href"}, + {"http://www.your-domain.com/your-page.html", "DIV@/data-href"} + }; + checkLinks(extractor.getNext(), fbSocialLinks); + } } diff --git a/src/test/resources/org/archive/resource/html/link-extraction-test.warc b/src/test/resources/org/archive/resource/html/link-extraction-test.warc new file mode 100644 index 00000000..ab0e54c8 --- /dev/null +++ b/src/test/resources/org/archive/resource/html/link-extraction-test.warc @@ -0,0 +1,320 @@ +WARC/1.0 +WARC-Type: warcinfo +Content-Type: application/warc-fields +WARC-Date: 2017-02-20T14:00:56Z +Content-Length: 128 + +format: WARC File Format 1.0 +conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf +robots: classic + + + +WARC/1.0 +WARC-Type: response +WARC-Date: 2017-02-20T14:00:56Z +WARC-Target-URI: http://www.example.com/html4.html +Content-Type: application/http; msgtype=response +Content-Length: 1243 + +HTTP/1.1 200 OK +Date: Mon, 20 Feb 2017 14:00:56 GMT +Content-Length: 1125 +Content-Type: application/xhtml+xml + + + + + + + +Test XHTML Link Extraction + + +A@/href +

+ anchor only + IMG@/src + IMG@/longdesc + +

+ To be or not to be. +

+To be, or not to be, that is the question:
+Whether 'tis nobler in the mind to suffer
+The slings and arrows of outrageous fortune, … +

+ + + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://www.example.com/link-extraction-test-html5-video.html +WARC-Date: 2017-02-20T21:35:03Z +Content-Type: application/http; msgtype=response +Content-Length: 890 + +HTTP/1.1 200 OK +Date: Mon, 20 Feb 2017 21:35:03 GMT +Content-Length: 789 +Content-Type: text/html + + + + +Test HTML5 Video Tag + + + + + + +

+ + + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://www.example.com/poor_html5.html +WARC-Date: 2017-02-21T15:50:40Z +Content-Type: application/http; msgtype=response +Content-Length: 594 + +HTTP/1.1 200 OK +Date: Tue, 21 Feb 2017 15:50:40 GMT +Content-Length: 486 +Content-Type: text/html + + +Testing poor HTML5 + + + + + +This is valid HTML5! + + + +

header

+ +

headline

+ +

paragraph one with link. + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://www.example.com/fb-video.html +WARC-Date: 2017-02-20T16:58:50Z +Content-Type: application/http; msgtype=response +Content-Length: 1330 + +HTTP/1.1 200 OK +Date: Mon, 20 Feb 2017 16:58:50 GMT +Content-Length: 1194 +Content-Type: text/html + + + + + fb-video - Embedded Videos - Social Plugins + + + + +

+ + + +

+ How to Share With Just Friends +
How to share with just friends.
+ Posted by Facebook on Friday, December 5, 2014 +

+ + + + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://www.example.com/data-href.examples.html +WARC-Date: 2017-02-21T21:05:10Z +Content-Type: application/http; msgtype=response +Content-Length: 3160 + +HTTP/1.1 200 OK +Date: Tue, 21 Feb 2017 21:05:10 GMT +Content-Length: 3057 +Content-Type: text/html + + + + + + + + + + + + +

+ + +

+ How to Share With Just Friends +
How to share with just friends.
+ Posted by Facebook on Friday, December 5, 2014 +

+ + +

+ +

+ + +

Headline goes here.

And here goes a bit of copy about the content of the article.

+ Tags: content, headlines +

+ + +

+ + + +

+ + + +venobox + + + + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: http://www.example.com/fb-social-plugins.html +WARC-Date: 2017-02-22T09:33:02Z +Content-Type: application/http; msgtype=response +Content-Length: 1870 + +HTTP/1.1 200 OK +Date: Wed, 22 Feb 2017 09:33:02 GMT +Content-Length: 1767 +Content-Type: text/html + + +

+ + +

Facebook

+ + + + + From 6aa43f83a2cbc2acd0feb7f2c81d66f4ef1b13c5 Mon Sep 17 00:00:00 2001 From: Mohamed Elsayed Date: Thu, 2 Mar 2017 15:28:16 +0200 Subject: [PATCH 022/211] Fix #25: move missing unit tests over from Heritrix3 --- .../archive/io/ArchiveReaderFactoryTest.java | 94 +++ .../io/BufferedSeekInputStreamTest.java | 67 ++ .../archive/io/HeaderedArchiveRecordTest.java | 209 ++++++ .../archive/io/RecordingInputStreamTest.java | 132 ++++ .../archive/io/ReplayCharSequenceTest.java | 391 ++++++++++ .../io/RepositionableInputStreamTest.java | 70 ++ .../org/archive/io/arc/ARCWriterPoolTest.java | 122 +++ .../org/archive/io/arc/ARCWriterTest.java | 699 ++++++++++++++++++ .../org/archive/io/warc/WARCWriterTest.java | 512 +++++++++++++ .../org/archive/uid/UUIDGeneratorTest.java | 44 ++ .../java/org/archive/util/FileUtilsTest.java | 271 +++++++ .../org/archive/util/MimetypeUtilsTest.java | 63 ++ .../org/archive/util/PropertyUtilsTest.java | 45 ++ .../org/archive/util/anvl/ANVLRecordTest.java | 128 ++++ 14 files changed, 2847 insertions(+) create mode 100644 src/test/java/org/archive/io/ArchiveReaderFactoryTest.java create mode 100644 src/test/java/org/archive/io/BufferedSeekInputStreamTest.java create mode 100644 src/test/java/org/archive/io/HeaderedArchiveRecordTest.java create mode 100644 src/test/java/org/archive/io/RecordingInputStreamTest.java create mode 100644 src/test/java/org/archive/io/ReplayCharSequenceTest.java create mode 100644 src/test/java/org/archive/io/RepositionableInputStreamTest.java create mode 100644 src/test/java/org/archive/io/arc/ARCWriterPoolTest.java create mode 100644 src/test/java/org/archive/io/arc/ARCWriterTest.java create mode 100644 src/test/java/org/archive/io/warc/WARCWriterTest.java create mode 100644 src/test/java/org/archive/uid/UUIDGeneratorTest.java create mode 100644 src/test/java/org/archive/util/FileUtilsTest.java create mode 100644 src/test/java/org/archive/util/MimetypeUtilsTest.java create mode 100644 src/test/java/org/archive/util/PropertyUtilsTest.java create mode 100644 src/test/java/org/archive/util/anvl/ANVLRecordTest.java diff --git a/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java b/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java new file mode 100644 index 00000000..2313868c --- /dev/null +++ b/src/test/java/org/archive/io/ArchiveReaderFactoryTest.java @@ -0,0 +1,94 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.File; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Iterator; + +import org.apache.commons.lang.StringUtils; +import org.archive.io.ArchiveRecord; +import org.archive.io.arc.ARCWriterTest; +import org.archive.util.TmpDirTestCase; + +public class ArchiveReaderFactoryTest extends TmpDirTestCase { + /** + * Test local file as URL + * @throws IOException + */ + public void testGetFileURL() throws IOException { + File arc = ARCWriterTest.createARCFile(getTmpDir(), true); + ArchiveReader reader = null; + try { + reader = ArchiveReaderFactory. + get(new URL("file:////" + arc.getAbsolutePath())); + for (Iterator i = reader.iterator(); i.hasNext();) { + ArchiveRecord r = (ArchiveRecord)i.next(); + assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype())); + } + } finally { + if (reader != null) { + reader.close(); + } + } + } + + /** + * Test local file as File + * @throws IOException + */ + public void testGetFile() throws IOException { + File arc = ARCWriterTest.createARCFile(getTmpDir(), true); + ArchiveReader reader = null; + try { + reader = ArchiveReaderFactory.get(arc.getAbsoluteFile()); + for (Iterator i = reader.iterator(); i.hasNext();) { + ArchiveRecord r = (ArchiveRecord)i.next(); + assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype())); + } + } finally { + if (reader != null) { + reader.close(); + } + } + } + + /** + * Test local file as String path + * @throws IOException + */ + public void testGetPath() throws IOException { + File arc = ARCWriterTest.createARCFile(getTmpDir(), true); + ArchiveReader reader = null; + try { + reader = ArchiveReaderFactory.get(arc.getAbsoluteFile().getAbsolutePath()); + for (Iterator i = reader.iterator(); i.hasNext();) { + ArchiveRecord r = (ArchiveRecord)i.next(); + assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype())); + } + } finally { + if (reader != null) { + reader.close(); + } + } + } +} diff --git a/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java b/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java new file mode 100644 index 00000000..270e45e0 --- /dev/null +++ b/src/test/java/org/archive/io/BufferedSeekInputStreamTest.java @@ -0,0 +1,67 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.util.Random; + +import junit.framework.TestCase; + + +/** + * Unit test for BufferedSeekInputStream. The tests do some random + * repositioning in the stream to make sure the buffer is always valid. + * + * @author pjack + */ +public class BufferedSeekInputStreamTest extends TestCase { + + + private static byte[] TEST_DATA = makeTestData(); + + public void testPosition() throws Exception { + Random random = new Random(); + ArraySeekInputStream asis = new ArraySeekInputStream(TEST_DATA); + BufferedSeekInputStream bsis = new BufferedSeekInputStream(asis, 11); + for (int i = 0; i < TEST_DATA.length; i++) { + byte b = (byte)bsis.read(); + assertEquals(TEST_DATA[i], b); + } + for (int i = 0; i < 1000; i++) { + int index = random.nextInt(TEST_DATA.length); + bsis.position(index); + char expected = (char)((int)TEST_DATA[index] & 0xFF); + char read = (char)(bsis.read() & 0xFF); + assertEquals(expected, read); + } + } + + + private static byte[] makeTestData() { + String s = "If the dull substance of my flesh were thought\n" + + "Injurious distance could not stop my way\n" + + "For then, despite of space, I would be brought\n" + + "From limits far remote where thou dost stay.\n"; + byte[] r = new byte[s.length()]; + for (int i = 0; i < r.length; i++) { + r[i] = (byte)s.charAt(i); +// r[i] = (byte)s.charAt(i); + } + return r; + } +} diff --git a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java new file mode 100644 index 00000000..9f7e2a15 --- /dev/null +++ b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java @@ -0,0 +1,209 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import junit.framework.TestCase; + +import org.apache.commons.httpclient.Header; +import org.archive.io.arc.ARCRecord; +import org.archive.io.warc.WARCRecord; + +public class HeaderedArchiveRecordTest extends TestCase { + private static final String HTTPHEADER = "HTTP/1.1 200 OK\r\n" + + "Last-Modified: Sun, 28 Aug 2005 14:10:55 GMT\r\n" + + "Content-Length: 108\r\n" + "Connection: close\r\n" + + "Content-Type: text/html\r\n" + "\r\n"; + private static final String BODY = "\r\n" + " \r\n" + + " Neue Seite 1\r\n" + " \r\n" + + " \r\n" + " \r\n" + ""; + + public void testParseHttpHeadersInWARC() throws IOException { + final String url = "http://foo.maths.uq.edu.au/index.html"; + // final String warcHeader = "WARC/0.10 000000000486 response " + + // url + " 20070315152520 " + + // "urn:uuid:d8b342a8-dba4-4d7f-a551-1d8184f2ff58 " + + // "application/http; msgtype=response\r\n" + + // "Checksum: sha1:IT6YEX5WHKK57GOEHV2YHTTXEP5KPM6A\r\n" + + // "IP-Address: 80.150.6.184\r\n" + + // "\r\n"; + + final String warcHeader = "WARC/0.12\r\n" + + "MIME-Version: 1.0\r\n" + + "WARC-Record-Type: response\r\n" + + "WARC-Target-URI: http://foo.maths.uq.edu.au/index.html\r\n" + + "WARC-Date: 2006-09-19T17:20:24Z\r\n" + + "WARC-Digest: sha1:IT6YEX5WHKK57GOEHV2YHTTXEP5KPM6A\r\n" + + "WARC-IP-Address: 80.150.6.184\r\n" + + "Content-ID: \r\n" + + "Content-Type: application/http; msgtype=response\r\n" + + "Content-Length: " + (HTTPHEADER.length() + BODY.length()) + "\r\n" + + "\r\n"; + + final String hdr = warcHeader + HTTPHEADER + BODY; + + WARCRecord r = new WARCRecord(new ByteArrayInputStream(hdr.getBytes()), + "READER_IDENTIFIER", 0, false, true); + HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true); + + har.skipHttpHeader(); + + byte[] b = new byte[BODY.length()]; + har.read(b); + String bodyRead = new String(b); + assertEquals(BODY, bodyRead); + assertHeaderCorrectlyParsed(har.getContentHeaders()); + assertEquals("failed to retrieve Url from metadata", har.getHeader() + .getUrl(), url); + } + + public void testParseHttpHeadersInARC() throws IOException { + final int len = HTTPHEADER.length() + BODY.length(); + final int contentLength = BODY.length(); + final String url = "http://www.ly.gov.tw:80/accpart.htm"; + final String hdr = HTTPHEADER + BODY; + // Interesting difference between ARCRecord and WARCRecord is that the + // stream passed the ARCRecord is supposed to be just past the + // ARCRecord metadata line where as stream passed WARCRecord is at + // record start. TODO: Add to ARCRecord constructor that doesn't + // take an ArchiveRecordHeader but rather parses it from the stream. + ArchiveRecordHeader arh = new ArchiveRecordHeader() { + public int getContentBegin() { + // TODO: In ARCs, this is where http headers end and + // the content begins. Need to reconcile for generic + // HeaderedArchiveRecord processing. In this context, it + // makes sense setting it to zero -- HeaderedArchiveRecord + // will then figure it out. + return 0; + } + + public String getDate() { + return null; + } + + public String getDigest() { + return null; + } + + public Set getHeaderFieldKeys() { + return null; + } + + public Map getHeaderFields() { + return null; + } + + public Object getHeaderValue(String key) { + return null; + } + + public long getLength() { + return len; + } + + public long getContentLength() { + return contentLength; + } + + public String getMimetype() { + return null; + } + + public long getOffset() { + return 0; + } + + public String getReaderIdentifier() { + return null; + } + + public String getRecordIdentifier() { + return null; + } + + public String getUrl() { + return url; + } + + public String getVersion() { + return null; + } + + }; + ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes()), + arh, 0, false, true, false); + + HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true); + har.skipHttpHeader(); + byte[] b = new byte[BODY.length()]; + har.read(b); + String bodyRead = new String(b); + assertEquals(BODY, bodyRead); + assertHeaderCorrectlyParsed(har.getContentHeaders()); + } + + public void testEasierParseHttpHeadersInARC() throws IOException { + final String url = "http://www.archive.org/index.htm"; + final String arcHeader = url + + " 192.168.0.1 20070515111004 text/html 167568\n"; + final String hdr = arcHeader + HTTPHEADER + BODY; + + ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes()), + "READER_IDENTIFIER", 0, false, true, false); + + HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true); + har.skipHttpHeader(); + byte[] b = new byte[BODY.length()]; + har.read(b); + String bodyRead = new String(b); + assertEquals(BODY, bodyRead); + assertHeaderCorrectlyParsed(har.getContentHeaders()); + assertEquals("failed to retrieve Url from metadata", har.getHeader() + .getUrl(), url); + } + + private void assertHeaderCorrectlyParsed(Header[] headers) { + final List orgHeaders = Arrays.asList(HTTPHEADER.split("\r\n")); + assertEquals("not all HTTP header entries have been retrieved", + orgHeaders.size(), headers.length + 1); + + for (Header header : headers) { + assertTrue(orgHeaders.contains(header.getName() + ": " + + header.getValue())); + } + } + + public void testNoheaderWARC() throws IOException { + String b = "hello world"; + String c = "WARC/0.12\r\nContent-Type: text/plain\r\n" + + "Content-Length: " + b.length() + "\r\n\r\n" + b; + org.archive.io.warc.WARCRecord r = new org.archive.io.warc.WARCRecord( + new ByteArrayInputStream(c.getBytes()), "READER_IDENTIFIER", 0, + false, true); + HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true); + assertTrue(har.isStrict()); + } +} diff --git a/src/test/java/org/archive/io/RecordingInputStreamTest.java b/src/test/java/org/archive/io/RecordingInputStreamTest.java new file mode 100644 index 00000000..20a8b8b3 --- /dev/null +++ b/src/test/java/org/archive/io/RecordingInputStreamTest.java @@ -0,0 +1,132 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.PipedInputStream; +import java.io.PipedOutputStream; + +import org.archive.util.TmpDirTestCase; + + +/** + * Test cases for RecordingInputStream. + * + * @author gojomo + */ +public class RecordingInputStreamTest extends TmpDirTestCase +{ + + + /* + * @see TmpDirTestCase#setUp() + */ + protected void setUp() throws Exception + { + super.setUp(); + } + + /** + * Test readFullyOrUntil soft (no exception) and hard (exception) + * length cutoffs, timeout, and rate-throttling. + * + * @throws IOException + * @throws InterruptedException + * @throws RecorderTimeoutException + */ + public void testReadFullyOrUntil() throws RecorderTimeoutException, IOException, InterruptedException + { + RecordingInputStream ris = new RecordingInputStream(16384, (new File( + getTmpDir(), "testReadFullyOrUntil").getAbsolutePath())); + ByteArrayInputStream bais = new ByteArrayInputStream( + "abcdefghijklmnopqrstuvwxyz".getBytes()); + // test soft max + ris.open(bais); + ris.setLimits(10,0,0); + ris.readFullyOrUntil(7); + ris.close(); + ReplayInputStream res = ris.getReplayInputStream(); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + res.readFullyTo(baos); + assertEquals("soft max cutoff","abcdefg",new String(baos.toByteArray())); + // test hard max + bais.reset(); + baos.reset(); + ris.open(bais); + boolean exceptionThrown = false; + try { + ris.setLimits(10,0,0); + ris.readFullyOrUntil(13); + } catch (RecorderLengthExceededException ex) { + exceptionThrown = true; + } + assertTrue("hard max exception",exceptionThrown); + ris.close(); + res = ris.getReplayInputStream(); + res.readFullyTo(baos); + assertEquals("hard max cutoff","abcdefghijk", + new String(baos.toByteArray())); + // test timeout + PipedInputStream pin = new PipedInputStream(); + PipedOutputStream pout = new PipedOutputStream(pin); + ris.open(pin); + exceptionThrown = false; + trickle("abcdefghijklmnopqrstuvwxyz".getBytes(),pout); + try { + ris.setLimits(0,5000,0); + ris.readFullyOrUntil(0); + } catch (RecorderTimeoutException ex) { + exceptionThrown = true; + } + assertTrue("timeout exception",exceptionThrown); + ris.close(); + // test rate limit + bais = new ByteArrayInputStream(new byte[1024*2*5]); + ris.open(bais); + long startTime = System.currentTimeMillis(); + ris.setLimits(0,0,2); + ris.readFullyOrUntil(0); + long endTime = System.currentTimeMillis(); + long duration = endTime - startTime; + assertTrue("read too fast: "+duration,duration>=5000); + ris.close(); + } + + protected void trickle(final byte[] bytes, final PipedOutputStream pout) { + new Thread() { + public void run() { + try { + for (int i = 0; i < bytes.length; i++) { + Thread.sleep(1000); + pout.write(bytes[i]); + } + pout.close(); + } catch (IOException e) { + // do nothing + } catch (Exception e) { + System.err.print(e); + } + } + }.start(); + + } +} diff --git a/src/test/java/org/archive/io/ReplayCharSequenceTest.java b/src/test/java/org/archive/io/ReplayCharSequenceTest.java new file mode 100644 index 00000000..9208594a --- /dev/null +++ b/src/test/java/org/archive/io/ReplayCharSequenceTest.java @@ -0,0 +1,391 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.text.NumberFormat; +import java.util.Date; +import java.util.Random; +import java.util.logging.Logger; + +import org.archive.util.FileUtils; +import org.archive.util.TmpDirTestCase; + +import com.google.common.base.Charsets; + +/** + * Test ReplayCharSequences. + * + * @author stack, gojomo + * @version $Revision$, $Date$ + */ +public class ReplayCharSequenceTest extends TmpDirTestCase +{ + /** + * Logger. + */ + private static Logger logger = + Logger.getLogger("org.archive.io.ReplayCharSequenceFactoryTest"); + + + private static final int SEQUENCE_LENGTH = 127; + private static final int MULTIPLIER = 3; + private static final int BUFFER_SIZE = SEQUENCE_LENGTH * MULTIPLIER; + private static final int INCREMENT = 1; + + /** + * Buffer of regular content. + */ + private byte [] regularBuffer = null; + + /* + * @see TestCase#setUp() + */ + protected void setUp() throws Exception + { + super.setUp(); + this.regularBuffer = + fillBufferWithRegularContent(new byte [BUFFER_SIZE]); + } + + public void testShiftjis() throws IOException { + + // Here's the bytes for the JIS encoding of the Japanese form of Nihongo + byte[] bytes_nihongo = { + (byte) 0x1B, (byte) 0x24, (byte) 0x42, (byte) 0x46, + (byte) 0x7C, (byte) 0x4B, (byte) 0x5C, (byte) 0x38, + (byte) 0x6C, (byte) 0x1B, (byte) 0x28, (byte) 0x42, + (byte) 0x1B, (byte) 0x28, (byte) 0x42 }; + final String ENCODING = "SJIS"; + // Here is nihongo converted to JVM encoding. + String nihongo = new String(bytes_nihongo, ENCODING); + + RecordingOutputStream ros = writeTestStream( + bytes_nihongo,MULTIPLIER, + "testShiftjis",MULTIPLIER); + // TODO: check for existence of overflow file? + ReplayCharSequence rcs = getReplayCharSequence(ros,Charset.forName(ENCODING)); + + // Now check that start of the rcs comes back in as nihongo string. + String rcsStr = rcs.subSequence(0, nihongo.length()).toString(); + assertTrue("Nihongo " + nihongo + " does not equal converted string" + + " from rcs " + rcsStr, + nihongo.equals(rcsStr)); + // And assert next string is also properly nihongo. + if (rcs.length() >= (nihongo.length() * 2)) { + rcsStr = rcs.subSequence(nihongo.length(), + nihongo.length() + nihongo.length()).toString(); + assertTrue("Nihongo " + nihongo + " does not equal converted " + + " string from rcs (2nd time)" + rcsStr, + nihongo.equals(rcsStr)); + } + } + + public void testGetReplayCharSequenceByteZeroOffset() throws IOException { + + RecordingOutputStream ros = writeTestStream( + regularBuffer,MULTIPLIER, + "testGetReplayCharSequenceByteZeroOffset",MULTIPLIER); + ReplayCharSequence rcs = getReplayCharSequence(ros); + + for (int i = 0; i < MULTIPLIER; i++) { + accessingCharacters(rcs); + } + } + + private ReplayCharSequence getReplayCharSequence(RecordingOutputStream ros) throws IOException { + return getReplayCharSequence(ros,null); + } + + private ReplayCharSequence getReplayCharSequence(RecordingOutputStream ros, Charset charset) throws IOException { + return new GenericReplayCharSequence(ros.getReplayInputStream(), + ros.getBufferLength()/2, ros.backingFilename, charset); + } + + + public void testGetReplayCharSequenceMultiByteZeroOffset() + throws IOException { + + RecordingOutputStream ros = writeTestStream( + regularBuffer,MULTIPLIER, + "testGetReplayCharSequenceMultiByteZeroOffset",MULTIPLIER); + ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.UTF_8); + + for (int i = 0; i < MULTIPLIER; i++) { + accessingCharacters(rcs); + } + } + + public void testReplayCharSequenceByteToString() throws IOException { + String fileContent = "Some file content"; + byte [] buffer = fileContent.getBytes(); + RecordingOutputStream ros = writeTestStream( + buffer,1, + "testReplayCharSequenceByteToString.txt",0); + ReplayCharSequence rcs = getReplayCharSequence(ros); + String result = rcs.toString(); + assertEquals("Strings don't match",result,fileContent); + } + + private String toHexString(String str) + { + if (str != null) { + StringBuilder buf = new StringBuilder("{ "); + buf.append(Integer.toString(str.charAt(0), 16)); + for (int i = 1; i < str.length(); i++) { + buf.append(", "); + buf.append(Integer.toString(str.charAt(i), 16)); + } + buf.append(" }"); + return buf.toString(); + } + else + return "null"; + } + + public void testSingleByteEncodings() throws IOException { + byte[] bytes = { + (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, + (byte) 0x7d, (byte) 0x7e, (byte) 0x7f, (byte) 0x80, + (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, + (byte) 0xfc, (byte) 0xfd, (byte) 0xfe, (byte) 0xff }; + + String latin1String = new String(bytes, "latin1"); + RecordingOutputStream ros = writeTestStream( + bytes, 1, "testSingleByteEncodings-latin1.txt", 0); + ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.ISO_8859_1); + String result = rcs.toString(); + logger.fine("latin1[0] " + toHexString(latin1String)); + logger.fine("latin1[1] " + toHexString(result)); + assertEquals("latin1 strings don't match", result, latin1String); + + String w1252String = new String(bytes, "windows-1252"); + ros = writeTestStream( + bytes, 1, "testSingleByteEncodings-windows-1252.txt", 0); + rcs = getReplayCharSequence(ros,Charset.forName("windows-1252")); + result = rcs.toString(); + logger.fine("windows-1252[0] " + toHexString(w1252String)); + logger.fine("windows-1252[1] " + toHexString(result)); + assertEquals("windows-1252 strings don't match", result, w1252String); + + String asciiString = new String(bytes, "ascii"); + ros = writeTestStream( + bytes, 1, "testSingleByteEncodings-ascii.txt", 0); + rcs = getReplayCharSequence(ros,Charset.forName("ascii")); + result = rcs.toString(); + logger.fine("ascii[0] " + toHexString(asciiString)); + logger.fine("ascii[1] " + toHexString(result)); + assertEquals("ascii strings don't match", result, asciiString); + } + + public void testReplayCharSequenceByteToStringOverflow() throws IOException { + String fileContent = "Some file content. "; // ascii + byte [] buffer = fileContent.getBytes(); + RecordingOutputStream ros = writeTestStream( + buffer,1, + "testReplayCharSequenceByteToStringOverflow.txt",1); + String expectedContent = fileContent+fileContent; + + // The string is ascii which is a subset of both these encodings. Use + // both encodings because they exercise different code paths. UTF-8 is + // decoded to UTF-16 while windows-1252 is memory mapped directly. See + // GenericReplayCharSequence + ReplayCharSequence rcsUtf8 = getReplayCharSequence(ros,Charsets.UTF_8); + ReplayCharSequence rcs1252 = getReplayCharSequence(ros,Charset.forName("windows-1252")); + + String result = rcsUtf8.toString(); + assertEquals("Strings don't match", expectedContent, result); + + result = rcs1252.toString(); + assertEquals("Strings don't match", expectedContent, result); + } + + public void testReplayCharSequenceByteToStringMulti() throws IOException { + String fileContent = "Some file content"; + byte [] buffer = fileContent.getBytes("UTF-8"); + final int MULTIPLICAND = 10; + StringBuilder sb = + new StringBuilder(MULTIPLICAND * fileContent.length()); + for (int i = 0; i < MULTIPLICAND; i++) { + sb.append(fileContent); + } + String expectedResult = sb.toString(); + RecordingOutputStream ros = writeTestStream( + buffer,1, + "testReplayCharSequenceByteToStringMulti.txt",MULTIPLICAND-1); + for (int i = 0; i < 3; i++) { + ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.UTF_8); + String result = rcs.toString(); + assertEquals("Strings don't match", result, expectedResult); + rcs.close(); + System.gc(); + System.runFinalization(); + } + } + + public void xestHugeReplayCharSequence() throws IOException { + String fileContent = "01234567890123456789"; + String characterEncoding = "ascii"; + byte[] buffer = fileContent.getBytes(characterEncoding); + + long reps = (long) Integer.MAX_VALUE / (long) buffer.length + 1000000l; + + logger.info("writing " + (reps * buffer.length) + + " bytes to testHugeReplayCharSequence.txt"); + RecordingOutputStream ros = writeTestStream(buffer, 0, + "testHugeReplayCharSequence.txt", reps); + ReplayCharSequence rcs = getReplayCharSequence(ros,Charset.forName(characterEncoding)); + + if (reps * fileContent.length() > (long) Integer.MAX_VALUE) { + assertTrue("ReplayCharSequence has wrong length (length()=" + + rcs.length() + ") (should be " + Integer.MAX_VALUE + ")", + rcs.length() == Integer.MAX_VALUE); + } else { + assertEquals("ReplayCharSequence has wrong length (length()=" + + rcs.length() + ") (should be " + + (reps * fileContent.length()) + ")", (long) rcs.length(), + reps * (long) fileContent.length()); + } + + // boundary cases or something + for (int index : new int[] { 0, rcs.length() / 4, rcs.length() / 2, + rcs.length() - 1, rcs.length() / 4 }) { + // logger.info("testing char at index=" + + // NumberFormat.getInstance().format(index)); + assertEquals("Characters don't match (index=" + + NumberFormat.getInstance().format(index) + ")", + fileContent.charAt(index % fileContent.length()), rcs + .charAt(index)); + } + + // check that out of bounds indices throw exception + for (int n : new int[] { -1, Integer.MIN_VALUE, rcs.length() + 1 }) { + try { + String message = "rcs.charAt(" + n + ")=" + rcs.charAt(n) + + " ?!? -- expected IndexOutOfBoundsException"; + logger.severe(message); + fail(message); + } catch (IndexOutOfBoundsException e) { + logger.info("got expected exception: " + e); + } + } + + // check some characters at random spots & kinda stress test the + // system's memory mapping facility + Random rand = new Random(0); // seed so we get the same ones each time + for (int i = 0; i < 5000; i++) { + int index = rand.nextInt(rcs.length()); + // logger.info(i + ". testing char at index=" + + // NumberFormat.getInstance().format(index)); + assertEquals("Characters don't match (index=" + + NumberFormat.getInstance().format(index) + ")", + fileContent.charAt(index % fileContent.length()), rcs + .charAt(index)); + } + } + + /** + * Accessing characters test. + * + * Checks that characters in the rcs are in sequence. + * + * @param rcs The ReplayCharSequence to try out. + */ + private void accessingCharacters(CharSequence rcs) { + long timestamp = (new Date()).getTime(); + int seeks = 0; + for (int i = (INCREMENT * 2); (i + INCREMENT) < rcs.length(); + i += INCREMENT) { + checkCharacter(rcs, i); + seeks++; + for (int j = i - INCREMENT; j < i; j++) { + checkCharacter(rcs, j); + seeks++; + } + } + // Note that printing out below breaks cruisecontrols drawing + // of the xml unit test results because it outputs disallowed + // xml characters. + logger.fine(rcs + " seeks count " + seeks + " in " + + ((new Date().getTime()) - timestamp) + " milliseconds."); + } + + /** + * Check the character read. + * + * Throws assertion if not expected result. + * + * @param rcs ReplayCharSequence to read from. + * @param i Character offset. + */ + private void checkCharacter(CharSequence rcs, int i) { + int c = rcs.charAt(i); + assertTrue("Character " + Integer.toString(c) + " at offset " + i + + " unexpected.", (c % SEQUENCE_LENGTH) == (i % SEQUENCE_LENGTH)); + } + + /** + * @param baseName + * @return RecordingOutputStream + * @throws IOException + */ + private RecordingOutputStream writeTestStream(byte[] content, + int memReps, String baseName, long fileReps) throws IOException { + String backingFilename = FileUtils.maybeRelative(getTmpDir(),baseName).getAbsolutePath(); + RecordingOutputStream ros = new RecordingOutputStream( + content.length * memReps, + backingFilename); + ros.open(); + ros.markMessageBodyBegin(); + for(long i = 0; i < (memReps+fileReps); i++) { + // fill buffer (repeat MULTIPLIER times) and + // overflow to disk (also MULTIPLIER times) + ros.write(content); + } + ros.close(); + return ros; + } + + + /** + * Fill a buffer w/ regular progression of single-byte + * (and <= 127) characters. + * @param buffer Buffer to fill. + * @return The buffer we filled. + */ + private byte [] fillBufferWithRegularContent(byte [] buffer) { + int index = 0; + for (int i = 0; i < buffer.length; i++) { + buffer[i] = (byte) (index & 0x00ff); + index++; + if (index >= SEQUENCE_LENGTH) { + // Reset the index. + index = 0; + } + } + return buffer; + } + + public void testCheckParameters() + { + // TODO. + } +} diff --git a/src/test/java/org/archive/io/RepositionableInputStreamTest.java b/src/test/java/org/archive/io/RepositionableInputStreamTest.java new file mode 100644 index 00000000..1c7cc74c --- /dev/null +++ b/src/test/java/org/archive/io/RepositionableInputStreamTest.java @@ -0,0 +1,70 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.io; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.PrintWriter; + +import org.archive.util.TmpDirTestCase; + +public class RepositionableInputStreamTest extends TmpDirTestCase { + private File testFile; + private static final String LINE = "0123456789abcdefghijklmnopqrstuv"; + protected void setUp() throws Exception { + super.setUp(); + this.testFile = new File(getTmpDir(), this.getClass().getName()); + PrintWriter pw = new PrintWriter(new FileOutputStream(testFile)); + for (int i = 0; i < 100; i++) { + pw.print(LINE); + } + pw.close(); + } + protected void tearDown() throws Exception { + super.tearDown(); + } + public void testname() throws Exception { + // Make buffer awkward size so we run into buffers spanning issues. + RepositionableInputStream ris = + new RepositionableInputStream(new FileInputStream(this.testFile), + 57); + int c = ris.read(); + assertEquals(1, ris.position()); + ris.read(); + ris.position(0); + assertEquals(0, ris.position()); + int c1 = ris.read(); + assertEquals(c, c1); + ris.position(0); + byte [] bytes = new byte[LINE.length()]; + long offset = 0; + for (int i = 0; i < 10; i++) { + ris.read(bytes, 0, LINE.length()); + assertEquals(LINE, new String(bytes)); + offset += LINE.length(); + assertEquals(offset, ris.position()); + } + long p = ris.position(); + ris.position(p - LINE.length()); + assertEquals(p - LINE.length(), ris.position()); + c = ris.read(); + assertEquals(c, c1); + } +} diff --git a/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java b/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java new file mode 100644 index 00000000..f0be6506 --- /dev/null +++ b/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java @@ -0,0 +1,122 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.arc; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.util.Arrays; + +import org.archive.io.WriterPool; +import org.archive.io.WriterPoolMember; +import org.archive.io.WriterPoolSettings; +import org.archive.util.TmpDirTestCase; + + +/** + * Test ARCWriterPool + */ +@SuppressWarnings("deprecation") +public class ARCWriterPoolTest extends TmpDirTestCase { + private static final String PREFIX = "TEST"; + + public void testARCWriterPool() + throws Exception { + final int MAX_ACTIVE = 3; + final int MAX_WAIT_MILLISECONDS = 100; + cleanUpOldFiles(PREFIX); + WriterPool pool = new ARCWriterPool(getSettings(true), + MAX_ACTIVE, MAX_WAIT_MILLISECONDS); + WriterPoolMember [] writers = new WriterPoolMember[MAX_ACTIVE]; + final String CONTENT = "Any old content"; + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + baos.write(CONTENT.getBytes()); + for (int i = 0; i < MAX_ACTIVE; i++) { + writers[i] = pool.borrowFile(); + assertEquals("Number active", i + 1, pool.getNumActive()); + ((ARCWriter)writers[i]).write("http://one.two.three", "no-type", + "0.0.0.0", 1234567890, CONTENT.length(), baos); + } + + // Pool is maxed out. New behavior is that additional requests + // block as long as necessary -- so no longer testing for timeout/ + // exception + + for (int i = (MAX_ACTIVE - 1); i >= 0; i--) { + pool.returnFile(writers[i]); + assertEquals("Number active", i, pool.getNumActive()); + assertEquals("Number idle", MAX_ACTIVE - pool.getNumActive(), + pool.getNumIdle()); + } + pool.close(); + } + + public void testInvalidate() throws Exception { + final int MAX_ACTIVE = 3; + final int MAX_WAIT_MILLISECONDS = 100; + cleanUpOldFiles(PREFIX); + WriterPool pool = new ARCWriterPool(getSettings(true), + MAX_ACTIVE, MAX_WAIT_MILLISECONDS); + WriterPoolMember [] writers = new WriterPoolMember[MAX_ACTIVE]; + final String CONTENT = "Any old content"; + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + baos.write(CONTENT.getBytes()); + for (int i = 0; i < MAX_ACTIVE; i++) { + writers[i] = pool.borrowFile(); + assertEquals("Number active", i + 1, pool.getNumActive()); + ((ARCWriter)writers[i]).write("http://one.two.three", "no-type", + "0.0.0.0", 1234567890, CONTENT.length(), baos); + } + + WriterPoolMember writer2Invalidate = writers[pool.getNumActive() - 1]; + writers[pool.getNumActive() - 1] = null; + pool.invalidateFile(writer2Invalidate); + for (int i = 0; i < (MAX_ACTIVE - 1); i++) { + if (writers[i] == null) { + continue; + } + pool.returnFile(writers[i]); + } + + for (int i = 0; i < MAX_ACTIVE; i++) { + writers[i] = pool.borrowFile(); + assertEquals("Number active", i + 1, pool.getNumActive()); + ((ARCWriter)writers[i]).write("http://one.two.three", "no-type", + "0.0.0.0", 1234567890, CONTENT.length(), baos); + } + for (int i = (MAX_ACTIVE - 1); i >= 0; i--) { + pool.returnFile(writers[i]); + assertEquals("Number active", i, pool.getNumActive()); + assertEquals("Number idle", MAX_ACTIVE - pool.getNumActive(), + pool.getNumIdle()); + } + pool.close(); + } + + private WriterPoolSettings getSettings(final boolean isCompressed) { + File [] files = {getTmpDir()}; + return new WriterPoolSettingsData( + PREFIX, + "${prefix}-${timestamp17}-${serialno}-${heritrix.hostname}", + ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE, + isCompressed, + Arrays.asList(files), + null); + } +} \ No newline at end of file diff --git a/src/test/java/org/archive/io/arc/ARCWriterTest.java b/src/test/java/org/archive/io/arc/ARCWriterTest.java new file mode 100644 index 00000000..f6e2bf6a --- /dev/null +++ b/src/test/java/org/archive/io/arc/ARCWriterTest.java @@ -0,0 +1,699 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.arc; + +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.PrintStream; +import java.util.Arrays; +import java.util.Date; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.io.input.NullInputStream; +import org.apache.commons.io.output.NullOutputStream; +import org.archive.io.ArchiveRecord; +import org.archive.io.ArchiveRecordHeader; +import org.archive.io.ReplayInputStream; +import org.archive.io.WriterPoolMember; +import org.archive.io.WriterPoolSettings; +import org.archive.util.ArchiveUtils; +import org.archive.util.TmpDirTestCase; + +import com.google.common.io.Closeables; + + +/** + * Test ARCWriter class. + * + * This code exercises ARCWriter AND ARCReader. First it writes ARCs w/ + * ARCWriter. Then it validates what was written w/ ARCReader. + * + * @author stack + */ +public class ARCWriterTest +extends TmpDirTestCase implements ARCConstants { + /** + * Utility class for writing bad ARCs (with trailing junk) + */ + public class CorruptibleARCWriter extends ARCWriter { + byte[] endJunk = null; + + public CorruptibleARCWriter(AtomicInteger serial_no, WriterPoolSettings settings) { + super(serial_no, settings); + } + + @Override + protected void postWriteRecordTasks() throws IOException { + if (endJunk != null) { + this.write(endJunk); + } + super.postWriteRecordTasks(); + } + + public void setEndJunk(byte[] b) throws IOException { + this.endJunk = b; + } + } + + /** + * Suffix to use for ARC files made by JUNIT. + */ + private static final String SUFFIX = "JUNIT"; + + private static final String SOME_URL = "http://www.archive.org/test/"; + + + private static final AtomicInteger SERIAL_NO = new AtomicInteger(); + + /* + * @see TestCase#setUp() + */ + protected void setUp() throws Exception { + super.setUp(); + } + + /* + * @see TestCase#tearDown() + */ + protected void tearDown() throws Exception { + super.tearDown(); + } + + protected static String getContent() { + return getContent(null); + } + + protected static String getContent(String indexStr) { + String page = (indexStr != null)? "Page #" + indexStr: "Some Page"; + return "HTTP/1.1 200 OK\r\n" + + "Content-Type: text/html\r\n\r\n" + + "" + page + + "" + + "" + page + + ""; + } + + @SuppressWarnings("deprecation") + protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index) + throws IOException { + String indexStr = Integer.toString(index); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + // Start the record with an arbitrary 14-digit date per RFC2540 + String now = ArchiveUtils.get14DigitDate(); + int recordLength = 0; + byte[] record = (getContent(indexStr)).getBytes(); + recordLength += record.length; + baos.write(record); + // Add the newline between records back in + baos.write("\n".getBytes()); + recordLength += 1; + arcWriter.write("http://www.one.net/id=" + indexStr, "text/html", + "0.1.2.3", Long.parseLong(now), recordLength, baos); + return recordLength; + } + + private File writeRecords(String baseName, boolean compress, + long maxSize, int recordCount) + throws IOException { + cleanUpOldFiles(baseName); + File [] files = {getTmpDir()}; + ARCWriter arcWriter = + new ARCWriter( + SERIAL_NO, + new WriterPoolSettingsData( + baseName, + "${prefix}-"+SUFFIX, + maxSize, + compress, + Arrays.asList(files), + null)); + assertNotNull(arcWriter); + for (int i = 0; i < recordCount; i++) { + writeRandomHTTPRecord(arcWriter, i); + } + arcWriter.close(); + assertTrue("Doesn't exist: " + + arcWriter.getFile().getAbsolutePath(), + arcWriter.getFile().exists()); + return arcWriter.getFile(); + } + + private void validate(File arcFile, int recordCount) + throws FileNotFoundException, IOException { + ARCReader reader = ARCReaderFactory.get(arcFile); + assertNotNull(reader); + List metaDatas = null; + if (recordCount == -1) { + metaDatas = reader.validate(); + } else { + metaDatas = reader.validate(recordCount); + } + reader.close(); + // Now, run through each of the records doing absolute get going from + // the end to start. Reopen the arc so no context between this test + // and the previous. + + for (int i = metaDatas.size() - 1; i >= 0; i--) { + reader = ARCReaderFactory.get(arcFile); + ARCRecordMetaData meta = (ARCRecordMetaData)metaDatas.get(i); + ArchiveRecord r = reader.get(meta.getOffset()); + String mimeType = r.getHeader().getMimetype(); + assertTrue("Record is bogus", + mimeType != null && mimeType.length() > 0); + reader.close(); + } + assertEquals("Metadata count not as expected",recordCount, metaDatas.size()); + for (Iterator i = metaDatas.iterator(); i.hasNext();) { + ARCRecordMetaData r = (ARCRecordMetaData)i.next(); + assertTrue("Record is empty", r.getLength() > 0); + } + } + + public void testCheckARCFileSize() + throws IOException { + runCheckARCFileSizeTest("checkARCFileSize", false); + } + + public void testCheckARCFileSizeCompressed() + throws IOException { + runCheckARCFileSizeTest("checkARCFileSize", true); + } + + public void testWriteRecord() throws IOException { + final int recordCount = 2; + File arcFile = writeRecords("writeRecord", false, + DEFAULT_MAX_ARC_FILE_SIZE, recordCount); + validate(arcFile, recordCount + 1); // Header record. + } + + public void testRandomAccess() throws IOException { + final int recordCount = 3; + File arcFile = writeRecords("writeRecord", true, + DEFAULT_MAX_ARC_FILE_SIZE, recordCount); + ARCReader reader = ARCReaderFactory.get(arcFile); + // Get to second record. Get its offset for later use. + boolean readFirst = false; + String url = null; + long offset = -1; + long totalRecords = 0; + boolean readSecond = false; + for (final Iterator i = reader.iterator(); i.hasNext(); totalRecords++) { + ARCRecord ar = (ARCRecord)i.next(); + if (!readFirst) { + readFirst = true; + continue; + } + if (!readSecond) { + url = ar.getMetaData().getUrl(); + offset = ar.getMetaData().getOffset(); + readSecond = true; + } + } + reader.close(); + + reader = ARCReaderFactory.get(arcFile, offset); + ArchiveRecord ar = reader.get(); + assertEquals(ar.getHeader().getUrl(), url); + ar.close(); + reader.close(); + + // Get reader again. See how iterator works with offset + reader = ARCReaderFactory.get(arcFile, offset); + int count = 0; + for (final Iterator i = reader.iterator(); i.hasNext(); i.next()) { + count++; + } + reader.close(); + assertEquals(totalRecords - 1, count); + } + + public void testWriteRecordCompressed() throws IOException { + final int recordCount = 2; + File arcFile = writeRecords("writeRecordCompressed", true, + DEFAULT_MAX_ARC_FILE_SIZE, recordCount); + validate(arcFile, recordCount + 1 /*Header record*/); + } + + public void testWriteGiantRecord() throws IOException { + PrintStream dummyStream = new PrintStream(new NullOutputStream()); + ARCWriter arcWriter = + new ARCWriter( + SERIAL_NO, + dummyStream, + new File("dummy"), + new WriterPoolSettingsData( + "", + "", + -1, + false, + null, + null)); + assertNotNull(arcWriter); + + // Start the record with an arbitrary 14-digit date per RFC2540 + long now = System.currentTimeMillis(); + long recordLength = org.apache.commons.io.FileUtils.ONE_GB * 3; + + arcWriter.write("dummy:uri", "application/octet-stream", + "0.1.2.3", now, recordLength, new NullInputStream(recordLength)); + arcWriter.close(); + } + + private void runCheckARCFileSizeTest(String baseName, boolean compress) + throws FileNotFoundException, IOException { + File f = writeRecords(baseName, compress, 1024, 15); + validate(f, 15+1); + } + + protected CorruptibleARCWriter createARCWriter(String name, boolean compress) { + File [] files = {getTmpDir()}; + return new CorruptibleARCWriter( + SERIAL_NO, + new WriterPoolSettingsData( + name, + "${prefix}-"+SUFFIX, + DEFAULT_MAX_ARC_FILE_SIZE, + compress, + Arrays.asList(files), + null)); + } + + protected static ByteArrayInputStream getBais(String str) + throws IOException { + return new ByteArrayInputStream(str.getBytes()); + } + + /** + * Writes a record, suppressing normal length-checks (so that + * intentionally malformed records may be written). + */ + protected static void writeRecord(ARCWriter writer, String url, + String type, int len, ByteArrayInputStream bais) + throws IOException { + writer.write(url, type, "192.168.1.1", (new Date()).getTime(), len, + bais, false); + } + + protected int iterateRecords(ARCReader r) + throws IOException { + int count = 0; + for (Iterator i = r.iterator(); i.hasNext();) { + ARCRecord rec = (ARCRecord)i.next(); + rec.close(); + if (count != 0) { + assertTrue("Unexpected URL " + rec.getMetaData().getUrl(), + rec.getMetaData().getUrl().startsWith(SOME_URL)); + } + count++; + } + return count; + } + + protected CorruptibleARCWriter createArcWithOneRecord(String name, + boolean compressed) + throws IOException { + CorruptibleARCWriter writer = createARCWriter(name, compressed); + String content = getContent(); + writeRecord(writer, SOME_URL, "text/html", + content.length(), getBais(content)); + return writer; + } + + public void testSpaceInURL() { + String eMessage = null; + try { + holeyUrl("testSpaceInURL", false, " "); + } catch (IOException e) { + eMessage = e.getMessage(); + } + assertTrue("Didn't get expected exception: " + eMessage, + eMessage.startsWith("Metadata line doesn't match")); + } + + public void testTabInURL() { + String eMessage = null; + try { + holeyUrl("testTabInURL", false, "\t"); + } catch (IOException e) { + eMessage = e.getMessage(); + } + assertTrue("Didn't get expected exception: " + eMessage, + eMessage.startsWith("Metadata line doesn't match")); + } + + protected void holeyUrl(String name, boolean compress, String urlInsert) + throws IOException { + ARCWriter writer = null; + try { + writer = createArcWithOneRecord(name, compress); + // Add some bytes on the end to mess up the record. + String content = getContent(); + writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html", + content.length(), getBais(content)); + } finally { + Closeables.close(writer, true); + } + } + +// If uncompressed, length has to be right or parse will fail. +// +// public void testLengthTooShort() throws IOException { +// lengthTooShort("testLengthTooShort-" + PREFIX, false); +// } + + public void testLengthTooShortCompressed() throws IOException { + lengthTooShort("testLengthTooShortCompressed", true, false); + } + + public void testLengthTooShortCompressedStrict() + throws IOException { + String eMessage = null; + try { + lengthTooShort("testLengthTooShortCompressedStrict", + true, true); + } catch (RuntimeException e) { + eMessage = e.getMessage(); + } + assertTrue("Didn't get expected exception: " + eMessage, + eMessage.startsWith("java.io.IOException: Record STARTING at")); + } + + protected void lengthTooShort(String name, boolean compress, boolean strict) + throws IOException { + CorruptibleARCWriter writer = null; + try { + writer = createArcWithOneRecord(name, compress); + // Add some bytes on the end to mess up the record. + String content = getContent(); + ByteArrayInputStream bais = getBais(content+"SOME TRAILING BYTES"); + writeRecord(writer, SOME_URL, "text/html", + content.length(), bais); + writer.setEndJunk("SOME TRAILING BYTES".getBytes()); + writeRecord(writer, SOME_URL, "text/html", + content.length(), getBais(content)); + } finally { + Closeables.close(writer, true); + } + + // Catch System.err into a byte stream. + ByteArrayOutputStream os = new ByteArrayOutputStream(); + PrintStream origErr = System.err; + ARCReader r = null; + try { + System.setErr(new PrintStream(os)); + + r = ARCReaderFactory.get(writer.getFile()); + r.setStrict(strict); + int count = iterateRecords(r); + assertTrue("Count wrong " + count, count == 4); + + // Make sure we get the warning string which complains about the + // trailing bytes. + String err = os.toString(); + assertTrue("No message " + err, err.startsWith("WARNING") && + (err.indexOf("Record STARTING at") > 0)); + r.close(); + } finally { + Closeables.close(r, true); + System.setErr(origErr); + } + } + +// If uncompressed, length has to be right or parse will fail. +// +// public void testLengthTooLong() +// throws IOException { +// lengthTooLong("testLengthTooLongCompressed-" + PREFIX, +// false, false); +// } + + public void testLengthTooLongCompressed() + throws IOException { + lengthTooLong("testLengthTooLongCompressed", + true, false); + } + + public void testLengthTooLongCompressedStrict() { + String eMessage = null; + try { + lengthTooLong("testLengthTooLongCompressed", + true, true); + } catch (IOException e) { + eMessage = e.getMessage(); + } + assertTrue("Didn't get expected exception: " + eMessage, + eMessage.startsWith("Premature EOF before end-of-record")); + } + + protected void lengthTooLong(String name, boolean compress, + boolean strict) + throws IOException { + ARCWriter writer = createArcWithOneRecord(name, compress); + // Add a record with a length that is too long. + String content = getContent(); + writeRecord(writer, SOME_URL+"2", "text/html", + content.length() + 10, getBais(content)); + writeRecord(writer, SOME_URL+"3", "text/html", + content.length(), getBais(content)); + writer.close(); + + // Catch System.err. + ByteArrayOutputStream os = new ByteArrayOutputStream(); + + PrintStream origErr = System.err; + ARCReader r = null; + try { + System.setErr(new PrintStream(os)); + + r = ARCReaderFactory.get(writer.getFile()); + r.setStrict(strict); + int count = iterateRecords(r); + assertTrue("Count wrong " + count, count == 4); + + // Make sure we get the warning string which complains about the + // trailing bytes. + String err = os.toString(); + assertTrue("No message " + err, + err.startsWith("WARNING Premature EOF before end-of-record")); + } finally { + Closeables.close(r, true); + System.setErr(origErr); + } + } + + public void testGapError() throws IOException { + ARCWriter writer = createArcWithOneRecord("testGapError", true); + String content = getContent(); + // Make a 'weird' RIS that returns bad 'remaining' length + // awhen remaining should be 0 + ReplayInputStream ris = new ReplayInputStream(content.getBytes(), + content.length(), null) { + public long remaining() { + return (super.remaining()==0) ? -1 : super.remaining(); + } + }; + String message = null; + try { + writer.write(SOME_URL, "text/html", "192.168.1.1", + (new Date()).getTime(), content.length(), ris); + } catch (IOException e) { + message = e.getMessage(); + } finally { + IOUtils.closeQuietly(ris); + } + writer.close(); + assertTrue("No gap when should be", + message != null && + message.indexOf("Gap between expected and actual") >= 0); + } + + /** + * Write an arc file for other tests to use. + * @param arcdir Directory to write to. + * @param compress True if file should be compressed. + * @return ARC written. + * @throws IOException + */ + public static File createARCFile(File arcdir, boolean compress) + throws IOException { + File [] files = {arcdir}; + ARCWriter writer = new ARCWriter(SERIAL_NO, + new WriterPoolSettingsData( + "", + "test", + DEFAULT_MAX_ARC_FILE_SIZE, + compress, + Arrays.asList(files), + null)); + String content = getContent(); + writeRecord(writer, SOME_URL, "text/html", content.length(), + getBais(content)); + writer.close(); + return writer.getFile(); + } + +// public void testSpeed() throws IOException { +// ARCWriter writer = createArcWithOneRecord("speed", true); +// // Add a record with a length that is too long. +// String content = getContent(); +// final int count = 100000; +// logger.info("Starting speed write of " + count + " records."); +// for (int i = 0; i < count; i++) { +// writeRecord(writer, SOME_URL, "text/html", content.length(), +// getBaos(content)); +// } +// writer.close(); +// logger.info("Finished speed write test."); +// } + + + public void testValidateMetaLine() throws Exception { + final String line = "http://www.aandw.net/images/walden2.png " + + "128.197.34.86 20060111174224 image/png 2160"; + ARCWriter w = createARCWriter("testValidateMetaLine", true); + try { + w.validateMetaLine(line); + w.validateMetaLine(line + LINE_SEPARATOR); + w.validateMetaLine(line + "\\r\\n"); + } finally { + w.close(); + } + } + + public void testArcRecordOffsetReads() throws Exception { + ARCReader r = getSingleRecordReader("testArcRecordInBufferStream"); + ARCRecord ar = getSingleRecord(r); + // Now try getting some random set of bytes out of it + // at an odd offset (used to fail because we were + // doing bad math to find where in buffer to read). + final byte[] buffer = new byte[17]; + final int maxRead = 4; + int totalRead = 0; + while (totalRead < maxRead) { + totalRead = totalRead + + ar.read(buffer, 13 + totalRead, maxRead - totalRead); + assertTrue(totalRead > 0); + } + r.close(); + } + + // available should always be >= 0; extra read()s should all give EOF + public void testArchiveRecordAvailableConsistent() throws Exception { + // first test reading byte-at-a-time via no-param read() + ARCReader r = getSingleRecordReader("testArchiveRecordAvailableConsistent"); + ARCRecord record = getSingleRecord(r); + int c = record.read(); + while(c>=0) { + c = record.read(); + } + // consecutive reads after EOR should always give -1, still show zero available() + for (int i=0; i<5; i++) { + assertTrue("available negative:"+record.available(), record.available()>=0); + assertEquals(-1, record.read()); + } + r.close(); + } + + // should always give -1 on repeated reads past EOR + public void testArchiveRecordEORConsistent() throws Exception { + ARCReader r = getSingleRecordReader("testArchiveRecordEORConsistent"); + ARCRecord record = getSingleRecord(r); + this.readToEOS(record); + // consecutive reads after EOR should always give -1 + for (int i=0; i<5; i++) { + assertEquals(-1, record.read(new byte[1])); + } + r.close(); + } + + // should not throw premature EOF when wrapped with BufferedInputStream + // [HER-1450] showed this was the case using Apache Tika + public void testArchiveRecordMarkSupport() throws Exception { + ARCReader r = getSingleRecordReader("testArchiveRecordMarkSupport"); + ARCRecord record = getSingleRecord(r); + record.setStrict(true); + // ensure mark support + InputStream stream = new BufferedInputStream(record); + if (stream.markSupported()) { + for (int i=0; i<3; i++) { + this.readToEOS(stream); + stream.mark(stream.available()); + stream.reset(); + } + stream.close(); + } + r.close(); + } + + /** + * Test a particular style of using the reader iterator. (Should + * possibly be on a reader-centric test class, but the best setup + * functionality is here.) + * + * @throws IOException + */ + public void testReadIterator() throws IOException { + final int recordCount = 3; + File arcFile = writeRecords("writeRecord", true, + DEFAULT_MAX_ARC_FILE_SIZE, recordCount); + ARCReader reader = ARCReaderFactory.get(arcFile); + Iterator it = reader.iterator(); + while (it.hasNext()) { + ArchiveRecord next = it.next(); + next.close(); + } + reader.close(); + } + + protected void readToEOS(InputStream in) throws Exception { + byte [] buf = new byte[1024]; + int read = 0; + while (read >= 0) { + read = in.read(buf); + // System.out.println("readToEOS read " + read + " bytes"); + } + } + + protected ARCReader getSingleRecordReader(String name) throws Exception { + // Get an ARC with one record. + WriterPoolMember w = createArcWithOneRecord(name, true); + w.close(); + // Get reader on said ARC. + ARCReader r = ARCReaderFactory.get(w.getFile()); + return r; + } + + protected ARCRecord getSingleRecord(ARCReader r) { + final Iterator i = r.iterator(); + // Skip first ARC meta record. + i.next(); + i.hasNext(); + // Now we're at first and only record in ARC. + return (ARCRecord) i.next(); + } +} diff --git a/src/test/java/org/archive/io/warc/WARCWriterTest.java b/src/test/java/org/archive/io/warc/WARCWriterTest.java new file mode 100644 index 00000000..35c68714 --- /dev/null +++ b/src/test/java/org/archive/io/warc/WARCWriterTest.java @@ -0,0 +1,512 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.io.warc; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.URI; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +import org.archive.io.ArchiveRecord; +import org.archive.io.ArchiveRecordHeader; +import org.archive.io.UTF8Bytes; +import org.archive.io.WriterPoolMember; +import org.archive.uid.RecordIDGenerator; +import org.archive.uid.UUIDGenerator; +import org.archive.util.ArchiveUtils; +import org.archive.util.TmpDirTestCase; +import org.archive.util.anvl.ANVLRecord; + +/** + * Test Writer and Reader. + * @author stack + * @version $Date: 2006-08-29 19:35:48 -0700 (Tue, 29 Aug 2006) $ $Version$ + */ +public class WARCWriterTest +extends TmpDirTestCase implements WARCConstants { + + private static final AtomicInteger SERIAL_NO = new AtomicInteger(); + + RecordIDGenerator generator = new UUIDGenerator(); + + /** + * Prefix to use for ARC files made by JUNIT. + */ + private static final String SUFFIX = "JUNIT"; + + private static final String SOME_URL = "http://www.archive.org/test/"; + + @SuppressWarnings("unchecked") + public void testCheckHeaderLineValue() throws Exception { + WARCWriter writer = new WARCWriter( + SERIAL_NO, + new WARCWriterPoolSettingsData( + "","test",1,false,Collections.EMPTY_LIST,Collections.EMPTY_LIST,generator)); + writer.checkHeaderValue("one"); + IllegalArgumentException exception = null; + try { + writer.checkHeaderValue("with space"); + } catch(IllegalArgumentException e) { + exception = e; + } + assertNotNull(exception); + exception = null; + try { + writer.checkHeaderValue("with\0x0000controlcharacter"); + } catch(IllegalArgumentException e) { + exception = e; + } + writer.close(); + assertNotNull(exception); + } + + @SuppressWarnings("unchecked") + public void testMimetypes() throws IOException { + WARCWriter writer = new WARCWriter(SERIAL_NO, + new WARCWriterPoolSettingsData( + "m","testM",1,false,Collections.EMPTY_LIST,Collections.EMPTY_LIST,generator)); + writer.checkHeaderLineMimetypeParameter("text/xml"); + writer.checkHeaderLineMimetypeParameter("text/xml+rdf"); + assertEquals(writer.checkHeaderLineMimetypeParameter( + "text/plain; charset=SHIFT-JIS"), "text/plain; charset=SHIFT-JIS"); + assertEquals(writer.checkHeaderLineMimetypeParameter( + "multipart/mixed; \r\n boundary=\"simple boundary\""), + "multipart/mixed; boundary=\"simple boundary\""); + } + + public void testWriteRecord() throws IOException { + File [] files = {getTmpDir()}; + + // Write uncompressed. + WARCWriter writer = + new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData( + this.getClass().getName(), "templateWR1", -1, false, Arrays.asList(files), null, generator)); + + writeFile(writer); + writer.close(); + + // Write compressed. + writer = new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData( + this.getClass().getName(), "templateWR2", -1, true, Arrays.asList(files), null, generator)); + + writeFile(writer); + writer.close(); + } + + private void writeFile(final WARCWriter writer) + throws IOException { + try { + writeWarcinfoRecord(writer); + writeBasicRecords(writer); + } finally { + writer.close(); + writer.getFile().delete(); + } + } + + private void writeWarcinfoRecord(WARCWriter writer) + throws IOException { + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.warcinfo); + recordInfo.setUrl(null); + recordInfo.setCreate14DigitDate(ArchiveUtils.getLog14Date()); + recordInfo.setMimetype(ANVLRecord.MIMETYPE); + recordInfo.setExtraHeaders(null); + recordInfo.setEnforceLength(true); + + ANVLRecord meta = new ANVLRecord(); + meta.addLabelValue("size", "1G"); + meta.addLabelValue("operator", "igor"); + byte [] bytes = meta.getUTF8Bytes(); + recordInfo.setContentStream(new ByteArrayInputStream(bytes)); + recordInfo.setContentLength((long) bytes.length); + + final URI recordid = writer.generateRecordId(WARCWriter.TYPE, WARCRecordType.warcinfo.toString()); + recordInfo.setRecordId(recordid); + + writer.writeRecord(recordInfo); + } + + protected void writeBasicRecords(final WARCWriter writer) + throws IOException { + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.metadata); + recordInfo.setUrl("http://www.archive.org/"); + recordInfo.setCreate14DigitDate(ArchiveUtils.get14DigitDate()); + recordInfo.setMimetype("no/type"); + recordInfo.setEnforceLength(true); + + ANVLRecord headerFields = new ANVLRecord(); + headerFields.addLabelValue("x", "y"); + headerFields.addLabelValue("a", "b"); + recordInfo.setExtraHeaders(headerFields); + + URI rid = (new UUIDGenerator()).getQualifiedRecordID(TYPE, WARCRecordType.metadata.toString()); + recordInfo.setRecordId(rid); + + final String content = "Any old content."; + for (int i = 0; i < 10; i++) { + String body = i + ". " + content; + byte [] bodyBytes = body.getBytes(UTF8Bytes.UTF8); + recordInfo.setContentStream(new ByteArrayInputStream(bodyBytes)); + recordInfo.setContentLength((long)bodyBytes.length); + writer.writeRecord(recordInfo); + } + } + + /** + * @return Generic HTML Content. + */ + protected static String getContent() { + return getContent(null); + } + + /** + * @return Generic HTML Content with mention of passed indexStr + * in title and body. + */ + protected static String getContent(String indexStr) { + String page = (indexStr != null)? "Page #" + indexStr: "Some Page"; + return "HTTP/1.1 200 OK\r\n" + + "Content-Type: text/html\r\n\r\n" + + "" + page + + "" + + "" + page + + ""; + } + + /** + * Write random HTML Record. + * @param w Where to write. + * @param index An index to put into content. + * @return Length of record written. + * @throws IOException + */ + protected int writeRandomHTTPRecord(WARCWriter w, int index) + throws IOException { + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.resource); + recordInfo.setCreate14DigitDate(ArchiveUtils.get14DigitDate()); + recordInfo.setMimetype("text/html; charset=UTF-8"); + recordInfo.setRecordId(w.generateRecordId(null)); + recordInfo.setEnforceLength(true); + + String indexStr = Integer.toString(index); + recordInfo.setUrl("http://www.one.net/id=" + indexStr); + + byte[] record = (getContent(indexStr)).getBytes(); + recordInfo.setContentLength((long) record.length); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + baos.write(record); + recordInfo.setContentStream(new ByteArrayInputStream(baos.toByteArray())); + + // Add named fields for ip, checksum, and relate the metadata + // and request to the resource field. + recordInfo.addExtraHeader(NAMED_FIELD_IP_LABEL, "127.0.0.1"); + + w.writeRecord(recordInfo); + return record.length; + } + + /** + * Fill a WARC with HTML Records. + * @param baseName WARC basename. + * @param compress Whether to compress or not. + * @param maxSize Maximum WARC size. + * @param recordCount How many records. + * @return The written file. + * @throws IOException + */ + private File writeRecords(String baseName, boolean compress, + int maxSize, int recordCount) + throws IOException { + cleanUpOldFiles(baseName); + File [] files = {getTmpDir()}; + WARCWriter w = new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData( + baseName + '-' + SUFFIX, "${prefix}", maxSize, compress, Arrays.asList(files), null, generator)); + + assertNotNull(w); + for (int i = 0; i < recordCount; i++) { + writeRandomHTTPRecord(w, i); + } + w.close(); + assertTrue("Doesn't exist: " + w.getFile().getAbsolutePath(), + w.getFile().exists()); + return w.getFile(); + } + + /** + * Run validation of passed file. + * @param f File to validate. + * @param recordCount Expected count of records. + * @throws FileNotFoundException + * @throws IOException + */ + private void validate(File f, int recordCount) + throws FileNotFoundException, IOException { + WARCReader reader = WARCReaderFactory.get(f); + assertNotNull(reader); + List headers = null; + if (recordCount == -1) { + headers = reader.validate(); + } else { + headers = reader.validate(recordCount); + } + reader.close(); + + // Now, run through each of the records doing absolute get going from + // the end to start. Reopen the arc so no context between this test + // and the previous. + + for (int i = headers.size() - 1; i >= 0; i--) { + reader = WARCReaderFactory.get(f); + ArchiveRecordHeader h = (ArchiveRecordHeader)headers.get(i); + ArchiveRecord r = reader.get(h.getOffset()); + String mimeType = r.getHeader().getMimetype(); + assertTrue("Record is bogus", + mimeType != null && mimeType.length() > 0); + reader.close(); + } + + assertTrue("Metadatas not equal", headers.size() == recordCount); + for (Iterator i = headers.iterator(); i.hasNext();) { + ArchiveRecordHeader r = (ArchiveRecordHeader)i.next(); + assertTrue("Record is empty", r.getLength() > 0); + } + } + + public void testWriteRecords() throws IOException { + final int recordCount = 2; + File f = writeRecords("writeRecords", false, DEFAULT_MAX_WARC_FILE_SIZE, + recordCount); + validate(f, recordCount + 1); // Header record. + } + + public void testRandomAccess() throws IOException { + final int recordCount = 3; + File f = writeRecords("randomAccess", true, DEFAULT_MAX_WARC_FILE_SIZE, + recordCount); + WARCReader reader = WARCReaderFactory.get(f); + // Get to second record. Get its offset for later use. + boolean readFirst = false; + String url = null; + long offset = -1; + long totalRecords = 0; + boolean readSecond = false; + for (final Iterator i = reader.iterator(); i.hasNext(); + totalRecords++) { + WARCRecord ar = (WARCRecord)i.next(); + if (!readFirst) { + readFirst = true; + continue; + } + if (!readSecond) { + url = ar.getHeader().getUrl(); + offset = ar.getHeader().getOffset(); + readSecond = true; + } + } + reader.close(); + + reader = WARCReaderFactory.get(f, offset); + ArchiveRecord ar = reader.get(); + assertEquals(ar.getHeader().getUrl(), url); + ar.close(); + reader.close(); + + // Get reader again. See how iterator works with offset + reader = WARCReaderFactory.get(f, offset); + int count = 0; + for (final Iterator i = reader.iterator(); i.hasNext(); i.next()) { + count++; + } + reader.close(); + assertEquals(totalRecords - 1, count); + } + + public void testWriteRecordCompressed() throws IOException { + final int recordCount = 2; + File arcFile = writeRecords("writeRecordCompressed", true, + DEFAULT_MAX_WARC_FILE_SIZE, recordCount); + validate(arcFile, recordCount + 1 /*Header record*/); + } + + protected WARCWriter createWARCWriter(String name, + boolean compress) { + File [] files = {getTmpDir()}; + return new WARCWriter(SERIAL_NO, + new WARCWriterPoolSettingsData( + name, + "${prefix}-"+SUFFIX, + DEFAULT_MAX_WARC_FILE_SIZE, + compress, + Arrays.asList(files), + null, + generator)); + } + + protected static ByteArrayOutputStream getBaos(String str) + throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + baos.write(str.getBytes()); + return baos; + } + + protected static void writeRecord(WARCWriter w, String url, + String mimetype, int len, ByteArrayOutputStream baos) + throws IOException { + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.resource); + recordInfo.setUrl(url); + recordInfo.setCreate14DigitDate(ArchiveUtils.get14DigitDate()); + recordInfo.setMimetype(mimetype); + recordInfo.setRecordId(w.generateRecordId(null)); + recordInfo.setExtraHeaders(null); + recordInfo.setContentStream(new ByteArrayInputStream(baos.toByteArray())); + recordInfo.setContentLength((long) len); + recordInfo.setEnforceLength(true); + + w.writeRecord(recordInfo); + } + + protected int iterateRecords(WARCReader r) + throws IOException { + int count = 0; + for (Iterator i = r.iterator(); i.hasNext();) { + ArchiveRecord ar = i.next(); + ar.close(); + if (count != 0) { + assertTrue("Unexpected URL " + ar.getHeader().getUrl(), + ar.getHeader().getUrl().equals(SOME_URL)); + } + count++; + } + return count; + } + + protected WARCWriter createWithOneRecord(String name, + boolean compressed) + throws IOException { + WARCWriter writer = createWARCWriter(name, compressed); + String content = getContent(); + writeRecord(writer, SOME_URL, "text/html", + content.length(), getBaos(content)); + return writer; + } + + public void testSpaceInURL() throws IOException { + long bytesWritten = holeyUrl("testSpaceInURL", false, " "); + assertEquals("Unexpected successful writing occurred",0,bytesWritten); + } + + public void testTabInURL() throws IOException { + long bytesWritten = holeyUrl("testTabInURL", false, "\t"); + assertEquals("Unexpected successful writing occurred",0,bytesWritten); + } + + protected long holeyUrl(String name, boolean compress, String urlInsert) + throws IOException { + WARCWriter writer = createWithOneRecord(name, compress); + // Add some bytes on the end to mess up the record. + long startPos = writer.getPosition(); + String content = getContent(); + ByteArrayOutputStream baos = getBaos(content); + writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html", + content.length(), baos); + long endPos = writer.getPosition(); + writer.close(); + return endPos-startPos; + } + + /** + * Write an arc file for other tests to use. + * @param arcdir Directory to write to. + * @param compress True if file should be compressed. + * @return ARC written. + * @throws IOException + */ + public static File createWARCFile(File arcdir, boolean compress) + throws IOException { + File [] files = {arcdir}; + WARCWriter writer = + new WARCWriter(SERIAL_NO, + new WARCWriterPoolSettingsData( + "", + "test", + DEFAULT_MAX_WARC_FILE_SIZE, + compress, + Arrays.asList(files), + null, + new UUIDGenerator())); + String content = getContent(); + writeRecord(writer, SOME_URL, "text/html", content.length(), + getBaos(content)); + writer.close(); + return writer.getFile(); + } + +// public void testSpeed() throws IOException { +// ARCWriter writer = createArcWithOneRecord("speed", true); +// // Add a record with a length that is too long. +// String content = getContent(); +// final int count = 100000; +// logger.info("Starting speed write of " + count + " records."); +// for (int i = 0; i < count; i++) { +// writeRecord(writer, SOME_URL, "text/html", content.length(), +// getBaos(content)); +// } +// writer.close(); +// logger.info("Finished speed write test."); +// } + + public void testArcRecordOffsetReads() throws Exception { + // Get an ARC with one record. + WriterPoolMember w = + createWithOneRecord("testArcRecordInBufferStream", true); + w.close(); + // Get reader on said ARC. + WARCReader r = WARCReaderFactory.get(w.getFile()); + final Iterator i = r.iterator(); + // Skip first ARC meta record. + ArchiveRecord ar = i.next(); + i.hasNext(); + // Now we're at first and only record in ARC. + ar = (WARCRecord) i.next(); + // Now try getting some random set of bytes out of it + // at an odd offset (used to fail because we were + // doing bad math to find where in buffer to read). + final byte[] buffer = new byte[17]; + final int maxRead = 4; + int totalRead = 0; + while (totalRead < maxRead) { + totalRead = totalRead + + ar.read(buffer, 13 + totalRead, maxRead - totalRead); + assertTrue(totalRead > 0); + } + } +} \ No newline at end of file diff --git a/src/test/java/org/archive/uid/UUIDGeneratorTest.java b/src/test/java/org/archive/uid/UUIDGeneratorTest.java new file mode 100644 index 00000000..79e98fb6 --- /dev/null +++ b/src/test/java/org/archive/uid/UUIDGeneratorTest.java @@ -0,0 +1,44 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.uid; + +import java.net.URI; +import java.net.URISyntaxException; +import java.util.HashMap; +import java.util.Map; + +import junit.framework.TestCase; + +/** + * @author stack + * @version $Revision$ $Date$ + */ +public class UUIDGeneratorTest extends TestCase { + public void testQualifyRecordID() throws URISyntaxException { + RecordIDGenerator g = new UUIDGenerator(); + URI uri = g.getRecordID(); + Map qualifiers = new HashMap(); + qualifiers.put("a", "b"); + URI nuURI = g.qualifyRecordID(uri, qualifiers); + assertNotSame(uri, nuURI); + qualifiers.put("c", "d"); + nuURI = g.qualifyRecordID(nuURI, qualifiers); + assertNotSame(uri, nuURI); + } +} diff --git a/src/test/java/org/archive/util/FileUtilsTest.java b/src/test/java/org/archive/util/FileUtilsTest.java new file mode 100644 index 00000000..19271435 --- /dev/null +++ b/src/test/java/org/archive/util/FileUtilsTest.java @@ -0,0 +1,271 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.util; + +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.math.LongRange; + + +/** + * FileUtils tests. + * + * @contributor stack + * @contributor gojomo + * @version $Date$, $Revision$ + */ +public class FileUtilsTest extends TmpDirTestCase { + private String srcDirName = FileUtilsTest.class.getName() + ".srcdir"; + private File srcDirFile = null; + private String tgtDirName = FileUtilsTest.class.getName() + ".tgtdir"; + private File tgtDirFile = null; + + protected File zeroLengthLinesUnix; + protected File zeroLengthLinesWindows; + + protected File smallLinesUnix; + protected File smallLinesWindows; + protected File largeLinesUnix; + protected File largeLinesWindows; + protected File nakedLastLineUnix; + protected File nakedLastLineWindows; + + + protected void setUp() throws Exception { + super.setUp(); + this.srcDirFile = new File(getTmpDir(), srcDirName); + FileUtils.ensureWriteableDirectory(srcDirFile); + this.tgtDirFile = new File(getTmpDir(), tgtDirName); + FileUtils.ensureWriteableDirectory(tgtDirFile); + addFiles(); + + zeroLengthLinesUnix = setUpLinesFile("zeroLengthLinesUnix",0,0,400,IOUtils.LINE_SEPARATOR_UNIX); + zeroLengthLinesWindows = setUpLinesFile("zeroLengthLinesUnix",0,0,400,IOUtils.LINE_SEPARATOR_WINDOWS); + + smallLinesUnix = setUpLinesFile("smallLinesUnix", 0, 25, 400, IOUtils.LINE_SEPARATOR_UNIX); + smallLinesWindows = setUpLinesFile("smallLinesWindows", 0, 25, 400, IOUtils.LINE_SEPARATOR_WINDOWS); + largeLinesUnix = setUpLinesFile("largeLinesUnix", 128, 256, 5, IOUtils.LINE_SEPARATOR_UNIX); + largeLinesWindows = setUpLinesFile("largeLinesWindows", 128, 256, 4096, IOUtils.LINE_SEPARATOR_WINDOWS); + + nakedLastLineUnix = setUpLinesFile("nakedLastLineUnix", 0, 50, 401, IOUtils.LINE_SEPARATOR_UNIX); + org.apache.commons.io.FileUtils.writeStringToFile(nakedLastLineUnix,"a"); + nakedLastLineWindows = setUpLinesFile("nakedLastLineWindows", 0, 50, 401, IOUtils.LINE_SEPARATOR_WINDOWS); + org.apache.commons.io.FileUtils.writeStringToFile(nakedLastLineWindows,"a"); + } + + private void addFiles() throws IOException { + addFiles(3, this.getName()); + } + + private void addFiles(final int howMany, final String baseName) + throws IOException { + for (int i = 0; i < howMany; i++) { + File.createTempFile(baseName, null, this.srcDirFile); + } + } + + private File setUpLinesFile(String name, int minLineSize, int maxLineSize, int lineCount, String lineEnding) throws IOException { + List lines = new LinkedList(); + StringBuilder sb = new StringBuilder(maxLineSize); + for(int i = 0; i< lineSize; j++) { + sb.append("-"); + } + lines.add(sb.toString()); + } + File file = File.createTempFile(name, null); + org.apache.commons.io.FileUtils.writeLines(file, lines, lineEnding); + return file; + + } + + protected void tearDown() throws Exception { + super.tearDown(); + org.apache.commons.io.FileUtils.deleteQuietly(this.srcDirFile); + org.apache.commons.io.FileUtils.deleteQuietly(this.tgtDirFile); + org.apache.commons.io.FileUtils.deleteQuietly(zeroLengthLinesUnix); + org.apache.commons.io.FileUtils.deleteQuietly(zeroLengthLinesWindows); + org.apache.commons.io.FileUtils.deleteQuietly(smallLinesUnix); + org.apache.commons.io.FileUtils.deleteQuietly(smallLinesWindows); + org.apache.commons.io.FileUtils.deleteQuietly(largeLinesUnix); + org.apache.commons.io.FileUtils.deleteQuietly(largeLinesWindows); + org.apache.commons.io.FileUtils.deleteQuietly(nakedLastLineUnix); + org.apache.commons.io.FileUtils.deleteQuietly(nakedLastLineWindows); + + } + + public void testCopyFile() { + // Test exception copying nonexistent file. + File [] srcFiles = this.srcDirFile.listFiles(); + srcFiles[0].delete(); + IOException e = null; + try { + FileUtils.copyFile(srcFiles[0], + new File(this.tgtDirFile, srcFiles[0].getName())); + } catch (IOException ioe) { + e = ioe; + } + assertNotNull("Didn't get expected IOE", e); + } + + public void testTailLinesZeroLengthUnix() throws IOException { + verifyTailLines(zeroLengthLinesUnix); + } + + public void testTailLinesZeroLengthWindows() throws IOException { + verifyTailLines(zeroLengthLinesWindows); + } + + public void testTailLinesSmallUnix() throws IOException { + verifyTailLines(smallLinesUnix); + } + + public void testTailLinesLargeUnix() throws IOException { + verifyTailLines(largeLinesUnix); + } + + public void testTailLinesSmallWindows() throws IOException { + verifyTailLines(smallLinesWindows); + } + + public void testTailLinesLargeWindows() throws IOException { + verifyTailLines(largeLinesWindows); + } + + public void testTailLinesNakedUnix() throws IOException { + verifyTailLines(nakedLastLineUnix); + } + + public void testTailLinesNakedWindows() throws IOException { + verifyTailLines(nakedLastLineWindows); + } + + @SuppressWarnings("unchecked") + private void verifyTailLines(File file) throws IOException { + List lines = org.apache.commons.io.FileUtils.readLines(file); + verifyTailLines(file, lines, 1, 80); + verifyTailLines(file, lines, 5, 80); + verifyTailLines(file, lines, 10, 80); + verifyTailLines(file, lines, 20, 80); + verifyTailLines(file, lines, 100, 80); + verifyTailLines(file, lines, 1, 1); + verifyTailLines(file, lines, 5, 1); + verifyTailLines(file, lines, 10, 1); + verifyTailLines(file, lines, 20, 1); + verifyTailLines(file, lines, 100, 1); + } + + + private void verifyTailLines(File file, List lines, int count, int estimate) throws IOException { + List testLines; + testLines = getTestTailLines(file,count,estimate); + assertEquals("line counts not equal:"+file.getName()+" "+count+" "+estimate,lines.size(),testLines.size()); + assertEquals("lines not equal: "+file.getName()+" "+count+" "+estimate,lines,testLines); + } + + private List getTestTailLines(File file, int count, int estimate) throws IOException { + long pos = -1; + List testLines = new LinkedList(); + do { + List returnedLines = new LinkedList(); + LongRange range = FileUtils.pagedLines(file,pos,-count,returnedLines,estimate); + Collections.reverse(returnedLines); + testLines.addAll(returnedLines); + pos = range.getMinimumLong()-1; + } while (pos>=0); + Collections.reverse(testLines); + return testLines; + } + + public void testHeadLinesZeroLengthUnix() throws IOException { + verifyHeadLines(zeroLengthLinesUnix); + } + + public void testHeadLinesZeroLengthWindows() throws IOException { + verifyHeadLines(zeroLengthLinesWindows); + } + + public void testHeadLinesSmallUnix() throws IOException { + verifyHeadLines(smallLinesUnix); + } + + public void testHeadLinesLargeUnix() throws IOException { + verifyHeadLines(largeLinesUnix); + } + + public void testHeadLinesSmallWindows() throws IOException { + verifyHeadLines(smallLinesWindows); + } + + public void testHeadLinesLargeWindows() throws IOException { + verifyHeadLines(largeLinesWindows); + } + + public void testHeadLinesNakedUnix() throws IOException { + verifyHeadLines(nakedLastLineUnix); + } + + public void testHeadLinesNakedWindows() throws IOException { + verifyHeadLines(nakedLastLineWindows); + } + + + @SuppressWarnings("unchecked") + private void verifyHeadLines(File file) throws IOException { + List lines = org.apache.commons.io.FileUtils.readLines(file); + verifyHeadLines(file, lines, 1, 80); + verifyHeadLines(file, lines, 5, 80); + verifyHeadLines(file, lines, 10, 80); + verifyHeadLines(file, lines, 20, 80); + verifyHeadLines(file, lines, 100, 80); + verifyHeadLines(file, lines, 1, 1); + verifyHeadLines(file, lines, 5, 1); + verifyHeadLines(file, lines, 10, 1); + verifyHeadLines(file, lines, 20, 1); + verifyHeadLines(file, lines, 100, 1); + } + + + private void verifyHeadLines(File file, List lines, int count, int estimate) throws IOException { + List testLines; + testLines = getTestHeadLines(file,count,estimate); + assertEquals("line counts not equal:"+file.getName()+" "+count+" "+estimate,lines.size(),testLines.size()); + assertEquals("lines not equal: "+file.getName()+" "+count+" "+estimate,lines,testLines); + } + + private List getTestHeadLines(File file, int count, int estimate) throws IOException { + long pos = 0; + List testLines = new LinkedList(); + do { + LongRange range = FileUtils.pagedLines(file,pos,count,testLines,estimate); + pos = range.getMaximumLong(); + } while (pos m = am.asMap(); + logger.fine(m.toString()); + } + + public void testEmptyRecord() throws Exception { + byte [] b = ANVLRecord.EMPTY_ANVL_RECORD.getUTF8Bytes(); + assertEquals(b.length, 2); + assertEquals(b[0], '\r'); + assertEquals(b[1], '\n'); + } + + public void testFolding() throws Exception { + ANVLRecord am = new ANVLRecord(); + Exception e = null; + try { + am.addLabel("Label with \n in it"); + } catch (IllegalArgumentException iae) { + e = iae; + } + assertTrue(e != null && e instanceof IllegalArgumentException); + am.addLabelValue("label", "value with \n in it"); + } + + public void testParse() throws UnsupportedEncodingException, IOException { + String record = " a: b\r\n#c#\r\nc:d\r\n \t\t\r\t\n\te" + + "\r\nx:\r\n # z\r\n\r\n"; + ANVLRecord r = ANVLRecord.load(new ByteArrayInputStream( + record.getBytes("ISO-8859-1"))); + logger.fine(r.toString()); + assertEquals(r.get(0).toString(), "a: b"); + record = " a: b\r\n\r\nsdfsdsdfds"; + r = ANVLRecord.load(new ByteArrayInputStream( + record.getBytes("ISO-8859-1"))); + logger.fine(r.toString()); + record = "x:\r\n # z\r\ny:\r\n\r\n"; + r = ANVLRecord.load(new ByteArrayInputStream( + record.getBytes("ISO-8859-1"))); + logger.fine(r.toString()); + assertEquals(r.get(0).toString(), "x:"); + } + + public void testExampleParse() + throws UnsupportedEncodingException, IOException { + final String sample = "entry:\t\t\r\n# first ###draft\r\n" + + "who:\tGilbert, W.S. | Sullivan, Arthur\r\n" + + "what:\tThe Yeoman of\r\n" + + "\t\tthe Guard\r\n" + + "when/created:\t 1888\r\n\r\n"; + ANVLRecord r = ANVLRecord.load(new ByteArrayInputStream( + sample.getBytes("ISO-8859-1"))); + logger.fine(r.toString()); + } + + public void testPoundLabel() + throws UnsupportedEncodingException, IOException { + final String sample = "ent#ry:\t\t\r\n# first ###draft\r\n" + + "who:\tGilbert, W.S. | Sullivan, Arthur\r\n" + + "what:\tThe Yeoman of\r\n" + + "\t\tthe Guard\r\n" + + "when/created:\t 1888\r\n\r\n"; + ANVLRecord r = ANVLRecord.load(sample); + logger.fine(r.toString()); + } + + public void testNewlineLabel() + throws UnsupportedEncodingException, IOException { + final String sample = "ent\nry:\t\t\r\n# first ###draft\r\n" + + "who:\tGilbert, W.S. | Sullivan, Arthur\r\n" + + "what:\tThe Yeoman of\r\n" + + "\t\tthe Guard\r\n" + + "when/created:\t 1888\r\n\r\n"; + IllegalArgumentException iae = null; + try { + ANVLRecord.load(sample); + } catch(IllegalArgumentException e) { + iae = e; + } + assertTrue(iae != null); + } +} From b04f5d82604245461b6a802f1962d86e3d899e98 Mon Sep 17 00:00:00 2001 From: Lauren Ko Date: Thu, 9 Mar 2017 11:32:03 -0600 Subject: [PATCH 023/211] Updating CHANGES.md --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index fee29e16..767881ec 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,6 @@ 1.1.8 ----- +* [Move unit tests over from heritrix3 to webarchive-commons](https://github.com/iipc/webarchive-commons/issues/25) * [Strip empty port via URLParser](https://github.com/iipc/webarchive-commons/pull/69/) * [Use CharsetDetector to guess encoding of HTML documents](https://github.com/iipc/webarchive-commons/pull/68/) * [Fix last header was lost if LF LF](https://github.com/iipc/webarchive-commons/pull/65/) From b655796770eb967c931d656b1c80d4967f91e7fc Mon Sep 17 00:00:00 2001 From: Lauren Ko Date: Tue, 21 Mar 2017 14:20:54 -0500 Subject: [PATCH 024/211] Updating change log. --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 767881ec..ccdc1ce7 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,6 @@ 1.1.8 ----- +* [Improve HTML link extraction](https://github.com/iipc/webarchive-commons/pull/72) * [Move unit tests over from heritrix3 to webarchive-commons](https://github.com/iipc/webarchive-commons/issues/25) * [Strip empty port via URLParser](https://github.com/iipc/webarchive-commons/pull/69/) * [Use CharsetDetector to guess encoding of HTML documents](https://github.com/iipc/webarchive-commons/pull/68/) From aee6ff55bfcaa5a9e15092f8c3b1e40ec9faaf87 Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Tue, 2 May 2017 12:25:28 +0200 Subject: [PATCH 025/211] [maven-release-plugin] prepare release webarchive-commons-1.1.8 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 24780063..63909b90 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.8-SNAPSHOT + 1.1.8 jar webarchive-commons From dfe1f62e416f6a881fe15a2544449fff44dd1e51 Mon Sep 17 00:00:00 2001 From: John Erik Halse Date: Tue, 2 May 2017 12:25:35 +0200 Subject: [PATCH 026/211] [maven-release-plugin] prepare for next development iteration --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 63909b90..23953c06 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.8 + 1.1.9-SNAPSHOT jar webarchive-commons From cf34a3e13c09cfa4a1412492cfcf3503df698931 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 28 Apr 2017 22:41:56 +0200 Subject: [PATCH 027/211] Do not add value of preceding HTTP header field if there is no value (or only white space) --- .../archive/format/http/HttpHeaderParser.java | 4 ++-- .../format/http/HttpResponseParserTest.java | 24 +++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/format/http/HttpHeaderParser.java b/src/main/java/org/archive/format/http/HttpHeaderParser.java index d63ec405..bee3c28b 100755 --- a/src/main/java/org/archive/format/http/HttpHeaderParser.java +++ b/src/main/java/org/archive/format/http/HttpHeaderParser.java @@ -301,8 +301,9 @@ public ParseState handleByte(byte b, HttpHeaderParser parser) throws HttpParseEx if(isLWSP(b)) { return parser.postColonState; } + // reset previous value also in case the header value is empty + parser.setValueStartIdx(); if(b == CR) { - // TODO: THINK more... parser.valuePreCRState = parser.postColonState; return parser.valuePostCRState; } @@ -310,7 +311,6 @@ public ParseState handleByte(byte b, HttpHeaderParser parser) throws HttpParseEx // TODO: this is lax, is LFLF an OK terminator? return parser.lineStartState; } - parser.setValueStartIdx(); parser.addValueByte(b); return parser.valueState; } diff --git a/src/test/java/org/archive/format/http/HttpResponseParserTest.java b/src/test/java/org/archive/format/http/HttpResponseParserTest.java index c0d13230..ea076a69 100644 --- a/src/test/java/org/archive/format/http/HttpResponseParserTest.java +++ b/src/test/java/org/archive/format/http/HttpResponseParserTest.java @@ -57,4 +57,28 @@ public void testParseWithLf() throws IOException { } + public void testParseEmptyHeaderField() throws IOException { + + HttpResponseParser parser = new HttpResponseParser(); + String message = "200 OK\r\nContent-Type: text/plain\r\nServer: \r\n\r\nHi there"; + try { + HttpResponse response = + parser.parse(new ByteArrayInputStream(message.getBytes(IAUtils.UTF8))); + assertNotNull(response); + HttpHeaders headers = response.getHeaders(); + assertNotNull(headers); + assertEquals(2, headers.size()); + HttpHeader header = headers.get(1); + assertEquals("Server",header.getName()); + System.err.println(header.getValue()); + assertFalse("text/plain".equals(header.getValue())); + TestUtils.assertStreamEquals(response, "Hi there".getBytes(IAUtils.UTF8)); + + } catch (HttpParseException e) { + e.printStackTrace(); + fail(); + } + + } + } From bd08143577ea35cb48047a08b2bb67e806992cc2 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 29 Sep 2016 11:44:18 +0200 Subject: [PATCH 028/211] Extract also `property` attributes of HTML meta elements, this fixes #67 --- .../java/org/archive/resource/html/ExtractingParseObserver.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index 826851e0..52989455 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -406,7 +406,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs private static class MetaTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { - ArrayList l = getAttrList(node,"name","rel","content","http-equiv"); + ArrayList l = getAttrList(node,"name","rel","content","http-equiv","property"); if(l != null) { data.addMeta(l); } From 4077670acca3f0d2958d926692cdb3a6b29428ca Mon Sep 17 00:00:00 2001 From: Lauren Ko Date: Tue, 2 May 2017 15:15:06 -0500 Subject: [PATCH 029/211] Fix HTTP-Response-Metadata for wget WARCs. Changes came from https://github.com/commoncrawl/ia-web-commons/commit/58e85a60d75707da55ed499f836e57d49347484a --- .../org/archive/extract/ExtractingResourceFactoryMapper.java | 5 ++++- src/main/java/org/archive/format/warc/WARCConstants.java | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java index ad10be40..0afe16fb 100644 --- a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java +++ b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java @@ -153,7 +153,10 @@ private boolean isWARCInfoResource(MetaData envelope) { private boolean isHTTPResponseWARCResource(MetaData envelope) { return childFieldEquals(envelope,WARC_HEADER_METADATA, WARCConstants.CONTENT_TYPE, - WARCConstants.HTTP_RESPONSE_MIMETYPE); + WARCConstants.HTTP_RESPONSE_MIMETYPE) + || childFieldEquals(envelope,WARC_HEADER_METADATA, + WARCConstants.CONTENT_TYPE, + WARCConstants.HTTP_RESPONSE_MIMETYPE_NS); } private boolean isWARCJSONResource(MetaData envelope) { return childFieldEquals(envelope,WARC_HEADER_METADATA, diff --git a/src/main/java/org/archive/format/warc/WARCConstants.java b/src/main/java/org/archive/format/warc/WARCConstants.java index 93a81f96..504dc380 100644 --- a/src/main/java/org/archive/format/warc/WARCConstants.java +++ b/src/main/java/org/archive/format/warc/WARCConstants.java @@ -209,7 +209,9 @@ enum WARCRecordType { "application/http; msgtype=request"; public static final String HTTP_RESPONSE_MIMETYPE = "application/http; msgtype=response"; - + public static final String HTTP_RESPONSE_MIMETYPE_NS = + "application/http;msgtype=response"; // wget does this + public static final String FTP_CONTROL_CONVERSATION_MIMETYPE = "text/x-ftp-control-conversation"; From 3bba7e489b7d946eea83344e2150faebe0b35ed2 Mon Sep 17 00:00:00 2001 From: Lauren Ko Date: Tue, 2 May 2017 15:41:23 -0500 Subject: [PATCH 030/211] Update with fixes for 1.1.9 --- CHANGES.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index ccdc1ce7..1ba5c1de 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,9 @@ +1.1.9 +----- +* [Extract `property` attributes of HTML meta elements](https://github.com/iipc/webarchive-commons/pull/75) +* [Do not add value of preceding HTTP header field if there is no value](https://github.com/iipc/webarchive-commons/pull/74) +* [Fix WAT records corresponding to response records of Wget generated WARCs](https://github.com/iipc/webarchive-commons/pull/74) + 1.1.8 ----- * [Improve HTML link extraction](https://github.com/iipc/webarchive-commons/pull/72) From 4101f7e39cbdcc508a936faf8b519e68258b9639 Mon Sep 17 00:00:00 2001 From: Naomi Dushay Date: Tue, 8 Aug 2017 16:08:43 -0700 Subject: [PATCH 031/211] use commons-collections v3.2.2 to avoid v3.2.1 vulnerability --- pom.xml | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 23953c06..8373cdad 100644 --- a/pom.xml +++ b/pom.xml @@ -72,7 +72,7 @@ guava 17.0 - + org.json json @@ -89,12 +89,12 @@ juniversalchardet 1.0.3 - + commons-httpclient commons-httpclient 3.1 - + org.apache.hadoop @@ -128,12 +128,12 @@ tomcat jasper-compiler - + hsqldb hsqldb - - + + @@ -160,7 +160,7 @@ libidn 1.15 - + it.unimi.dsi dsiutils 2.0.12 @@ -170,13 +170,26 @@ ch.qos.logback logback-classic + + + commons-collections + commons-collections + + + + + commons-collections + commons-collections + 3.2.2 + + org.apache.httpcomponents httpcore 4.3 - + joda-time joda-time From 988bec707c27a01333becfc3bd502af4441ea1e1 Mon Sep 17 00:00:00 2001 From: Lauren Ko Date: Wed, 9 Aug 2017 10:57:28 -0500 Subject: [PATCH 032/211] Update CHANGES.md for PR 77 --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 1ba5c1de..dcb598d9 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,6 @@ 1.1.9 ----- +* [Use commons-collections v3.2.2 to avoid v3.2.1 vulnerability](https://github.com/iipc/webarchive-commons/pull/77) * [Extract `property` attributes of HTML meta elements](https://github.com/iipc/webarchive-commons/pull/75) * [Do not add value of preceding HTTP header field if there is no value](https://github.com/iipc/webarchive-commons/pull/74) * [Fix WAT records corresponding to response records of Wget generated WARCs](https://github.com/iipc/webarchive-commons/pull/74) From 2e8cdea3d245c11e1ea3a2a6153c0038479aef12 Mon Sep 17 00:00:00 2001 From: nruest Date: Tue, 7 May 2019 13:23:28 -0400 Subject: [PATCH 033/211] [maven-release-plugin] prepare release webarchive-commons-1.1.9 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 8373cdad..833f42c3 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.9-SNAPSHOT + 1.1.9 jar webarchive-commons From da029db2ba89205b93a5291ed18b9c69155271bb Mon Sep 17 00:00:00 2001 From: nruest Date: Tue, 7 May 2019 13:23:34 -0400 Subject: [PATCH 034/211] [maven-release-plugin] prepare for next development iteration --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 833f42c3..1cbeb99a 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.netpreserve.commons webarchive-commons - 1.1.9 + 1.1.10-SNAPSHOT jar webarchive-commons From 723b18a35f8be786cb073282b5ea88b5d8c643ce Mon Sep 17 00:00:00 2001 From: nruest Date: Tue, 7 May 2019 13:56:18 -0400 Subject: [PATCH 035/211] Update TravisCI config; resolves #82. - Test Oracle Java 8 - Test OpenJDK Java 8 - Use trusty - Require sudo for OpenJDK7 - Remove Oracle Java 7 (it's gone!) - Remove mvn site from the build process since there is no javadoc site (at least that I can tell) --- .travis.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 0dfd3f7f..54daf83b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,13 @@ +dist: trusty language: java +# sudo required for OpenJDK7 support per: +# https://github.com/travis-ci/travis-ci/issues/7884#issuecomment-309689557 +sudo: required jdk: - - oraclejdk7 + - openjdk7 + - oraclejdk8 + - openjdk8 before_install: - "git clone https://github.com/iipc/travis.git target/travis" @@ -11,8 +17,8 @@ before_script: - "export MAVEN_OPTS=-Xmx512m" - "ulimit -u 2048" -script: - - "target/travis/deploy-if.sh" +script: + - mvn install -B -V # whitelist in the master branch only branches: @@ -23,4 +29,3 @@ env: global: - secure: "qDKjVdoe4Qcz4WfXiQydU7tyl51T62FUJrjqu4FUPBcgeQhFQiggwhpaE6xCOzOpxbsuBi2R1c8gMQf5esE5iDL5jZMu+kz++dYbuzMTd13ttvZWMW5wRPH0H8iHk609FP/RDtVKKBr7WO0JvvIAZEhWNHZrLXBrrKgdTey171g=" - secure: "FXGBKJNP9X7ePJfS4eYTZtoFo4RT1sxor34XxncSJr7uV6ggtZb4B4WNd16IlLcDk6E32sx8YoWdltaOGwQ5Vg/kux5Ko/wKZCoccS018Ln1bRT86dD1KoPY34rGoNJVQxe7J/1MPqpBKwmi2XCKfzpsEh3W7bbIqg8w9MEOOZA=" - From 79aed910b44510294367a4acf4f3e6376b1c62c0 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 23 Aug 2017 17:04:52 +0200 Subject: [PATCH 036/211] ExtractingParseObserver: get links from onClick attributes - extract links from JavaScript code snippets in onClick attributes of INPUT and DIV elements --- .../html/ExtractingParseObserver.java | 40 +++++++++++++++++- .../html/ExtractingParseObserverTest.java | 10 +++++ .../resource/html/link-extraction-test.warc | 42 +++++++++++++++++++ 3 files changed, 91 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index 52989455..e4fa83c7 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -39,6 +39,15 @@ public class ExtractingParseObserver implements ParseObserver { protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString); + protected static String jsOnClickUrl1PatString = + "(?i)^(?:javascript:)?(?:(?:window|top|document|self|parent)\\.)?location(?:\\.href)?\\s*=\\s*('|')([^'\"]{3,256})\\1$"; + protected static String jsOnClickUrl2PatString = + "(?i)^(?:javascript:)?(?:window|parent)\\.open\\((['\"]|')([^\"']{3,256}?)\\1[,)]"; + protected static Pattern[] jsOnClickUrlPatterns = { + Pattern.compile(jsOnClickUrl1PatString), + Pattern.compile(jsOnClickUrl2PatString) + }; + private final static int MAX_TEXT_LEN = 100; private static final String PATH = "path"; @@ -51,6 +60,7 @@ public class ExtractingParseObserver implements ParseObserver { extractors.put("APPLET", new AppletTagExtractor()); extractors.put("AREA", new AreaTagExtractor()); extractors.put("BASE", new BaseTagExtractor()); + extractors.put("DIV", new DivTagExtractor()); extractors.put("EMBED", new EmbedTagExtractor()); extractors.put("FORM", new FormTagExtractor()); extractors.put("FRAME", new FrameTagExtractor()); @@ -268,7 +278,20 @@ private static void addHrefWithAttrs(HTMLMetaData data, TagNode node, if(l != null) { data.addHref(l); } - } + } + + private static void addHrefsOnclick(HTMLMetaData data, TagNode node) { + String onclick = node.getAttribute("onclick"); + if (onclick != null) { + String path = makePath(node.getTagName(), "onclick"); + for (Pattern pattern : jsOnClickUrlPatterns) { + String url = patternJSExtract(pattern, onclick); + if (url != null) { + data.addHref(PATH, path, "url", url); + } + } + } + } private interface TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs); @@ -330,6 +353,12 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs } } + private static class DivTagExtractor implements TagExtractor { + public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { + addHrefsOnclick(data,node); + } + } + private static class EmbedTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { addBasicHrefs(data,node,"src"); @@ -386,6 +415,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs private static class InputTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { addBasicHrefs(data,node,"src","formaction"); + addHrefsOnclick(data,node); } } @@ -450,4 +480,12 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten } } } + + private static String patternJSExtract(Pattern pattern, String content) { + Matcher m = pattern.matcher(content); + if (m.find()) { + return m.group(2); + } + return null; + } } diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index 8f690a06..4828ad64 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -263,6 +263,16 @@ public void testLinkExtraction() throws ResourceParseException, IOException { {"http://www.your-domain.com/your-page.html", "DIV@/data-href"} }; checkLinks(extractor.getNext(), fbSocialLinks); + String[][] onClickLinks = { + {"webpage.html", "DIV@/onclick"}, + {"index.html", "INPUT@/onclick"}, + {"http://www.x.com/", "INPUT@/onclick"}, + {"button-child.php", "INPUT@/onclick"}, + {"http://example.com/", "INPUT@/onclick"}, + {"http://example.com/location/href/1.html", "INPUT@/onclick"}, + {"http://example.com/location/href/2.html", "INPUT@/onclick"} + }; + checkLinks(extractor.getNext(), onClickLinks); } } diff --git a/src/test/resources/org/archive/resource/html/link-extraction-test.warc b/src/test/resources/org/archive/resource/html/link-extraction-test.warc index ab0e54c8..1a30598e 100644 --- a/src/test/resources/org/archive/resource/html/link-extraction-test.warc +++ b/src/test/resources/org/archive/resource/html/link-extraction-test.warc @@ -318,3 +318,45 @@ Content-Type: text/html +WARC/1.0 +WARC-Type: response +WARC-Date: 2017-08-23T13:54:59Z +Content-Type: application/http;msgtype=response +Content-Length: 1279 + +HTTP/1.1 200 OK +Date: Wed, 23 Aug 2017 13:54:59 GMT +Server: Apache/2.4.18 (Ubuntu) +Last-Modified: Wed, 23 Aug 2017 13:54:03 GMT +ETag: "3ca-5576c0b718ab3" +Accept-Ranges: bytes +Content-Length: 971 +Vary: Accept-Encoding +Keep-Alive: timeout=5, max=100 +Connection: Keep-Alive +Content-Type: text/html + + + +Test Extraction of URLs from INPUT onClick Attributes + + + + +

Click to load webpage

+ + + + + + + + From 26b1e7af27abec102ab36faf6a786dfedf9436fd Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 23 Aug 2017 14:48:05 +0200 Subject: [PATCH 037/211] ExtractingParseObserver: extract rel, hreflang and type attributes - add "rel" attribute to A and AREA links - add attributes "hreflang" and "type" (MIME type) to A@/href links --- .../html/ExtractingParseObserver.java | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index 52989455..a487fd34 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -284,7 +284,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs l.add(makePath("A","href")); l.add("url"); l.add(url); - for(String a : new String[] {"target","alt","title"}) { + for(String a : new String[] {"target","alt","title","rel","hreflang","type"}) { String v = node.getAttribute(a); if(v != null) { l.add(a); @@ -311,7 +311,22 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs private static class AreaTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { - addBasicHrefs(data,node,"href"); + String url = node.getAttribute("href"); + if(url != null) { + ArrayList l = new ArrayList(); + l.add(PATH); + l.add(makePath("AREA","href")); + l.add("url"); + l.add(url); + for(String a : new String[] {"rel"}) { + String v = node.getAttribute(a); + if(v != null) { + l.add(a); + l.add(v); + } + } + data.addHref(l); + } } } From a2cc42cac2777d06ab40e09811cdc883773775b9 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 11 Jun 2020 14:24:03 +0200 Subject: [PATCH 038/211] WAT extractor: do not fail on missing WARC-Filename in warcinfo record, fixes #88 - do not throw IOException if there is no WARC-Filename in warcinfo record - write metadata record (corresponding to warcinfo) without WARC-Target-URI --- src/main/java/org/archive/extract/WATExtractorOutput.java | 2 +- src/main/java/org/archive/format/warc/WARCRecordWriter.java | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index 3bcfa924..4b5f72ed 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -151,7 +151,7 @@ private void writeWARC(OutputStream recOut, MetaData md) throws IOException { String warcType = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Type"); String targetURI; if(warcType.equals("warcinfo")) { - targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Filename"); + targetURI = JSONUtils.extractSingle(md, "Envelope.WARC-Header-Metadata.WARC-Filename"); } else { targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI"); } diff --git a/src/main/java/org/archive/format/warc/WARCRecordWriter.java b/src/main/java/org/archive/format/warc/WARCRecordWriter.java index 0aab83b7..3278b289 100644 --- a/src/main/java/org/archive/format/warc/WARCRecordWriter.java +++ b/src/main/java/org/archive/format/warc/WARCRecordWriter.java @@ -88,7 +88,10 @@ public void writeJSONMetadataRecord( OutputStream out, { HttpHeaders headers = new HttpHeaders(); headers.add(HEADER_KEY_TYPE, WARCRecordType.metadata.name()); - headers.add(HEADER_KEY_URI, targetURI); + if (targetURI != null) { + // WARC-Target-URI is optional in metadata records + headers.add(HEADER_KEY_URI, targetURI); + } headers.add(HEADER_KEY_DATE, DateUtils.getLog14Date(originalDate)); headers.add(HEADER_KEY_ID, makeRecordId()); headers.add(HEADER_KEY_REFERS_TO, origRecordId); From 04e10397b9137a36812c17276826bc60d1a37ede Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Mon, 15 Jun 2020 13:29:25 +0200 Subject: [PATCH 039/211] Update change log to include #85, #86 and #89 --- CHANGES.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index dcb598d9..bf985ada 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,10 @@ +1.1.10 +------ +* [WAT extractor: do not fail on missing WARC-Filename in warcinfo record](https://github.com/iipc/webarchive-commons/pull/89) +* [ExtractingParseObserver: extract rel, hreflang and type attributes](https://github.com/iipc/webarchive-commons/pull/86) +* [ExtractingParseObserver: extract links from onClick attributes](https://github.com/iipc/webarchive-commons/pull/85) +* [Update TravisCI config](https://github.com/iipc/webarchive-commons/pull/83) + 1.1.9 ----- * [Use commons-collections v3.2.2 to avoid v3.2.1 vulnerability](https://github.com/iipc/webarchive-commons/pull/77) From 9041ff4e96f6554658742affe490223dc0241d06 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 13 Oct 2020 01:28:48 +0000 Subject: [PATCH 040/211] Bump junit from 3.8.1 to 4.13.1 Bumps [junit](https://github.com/junit-team/junit4) from 3.8.1 to 4.13.1. - [Release notes](https://github.com/junit-team/junit4/releases) - [Changelog](https://github.com/junit-team/junit4/blob/main/doc/ReleaseNotes4.13.1.md) - [Commits](https://github.com/junit-team/junit4/commits/r4.13.1) Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 1cbeb99a..5ca7e1a3 100644 --- a/pom.xml +++ b/pom.xml @@ -64,7 +64,7 @@ junit junit - 3.8.1 + 4.13.1 From c2530d77b73838c31f4e83f2be941ec61032ebb2 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 16 Mar 2021 11:58:11 +0100 Subject: [PATCH 041/211] Fix InterruptibleCharSequenceTest (testInterruptibility) to run on JDK 11 - if thread running the regexp matching is already finished after the initial/current sleeping time, rerun the test again with a shorter sleeping time until the expected RuntimeException is hit --- .../util/InterruptibleCharSequenceTest.java | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java b/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java index a3a5f180..8b5c5d1b 100644 --- a/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java +++ b/src/test/java/org/archive/util/InterruptibleCharSequenceTest.java @@ -107,14 +107,24 @@ public void testNoninterruptible() throws InterruptedException { } public void testInterruptibility() throws InterruptedException { - BlockingQueue