From 2e8cdea3d245c11e1ea3a2a6153c0038479aef12 Mon Sep 17 00:00:00 2001
From: nruest
Date: Tue, 7 May 2019 13:23:28 -0400
Subject: [PATCH 001/177] [maven-release-plugin] prepare release
webarchive-commons-1.1.9
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 8373cdad..833f42c3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commons
webarchive-commons
- 1.1.9-SNAPSHOT
+ 1.1.9
jar
webarchive-commons
From da029db2ba89205b93a5291ed18b9c69155271bb Mon Sep 17 00:00:00 2001
From: nruest
Date: Tue, 7 May 2019 13:23:34 -0400
Subject: [PATCH 002/177] [maven-release-plugin] prepare for next development
iteration
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 833f42c3..1cbeb99a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
org.netpreserve.commons
webarchive-commons
- 1.1.9
+ 1.1.10-SNAPSHOT
jar
webarchive-commons
From 723b18a35f8be786cb073282b5ea88b5d8c643ce Mon Sep 17 00:00:00 2001
From: nruest
Date: Tue, 7 May 2019 13:56:18 -0400
Subject: [PATCH 003/177] Update TravisCI config; resolves #82.
- Test Oracle Java 8
- Test OpenJDK Java 8
- Use trusty
- Require sudo for OpenJDK7
- Remove Oracle Java 7 (it's gone!)
- Remove mvn site from the build process since there is no javadoc site
(at least as far as I can tell)
---
.travis.yml | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index 0dfd3f7f..54daf83b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,13 @@
+dist: trusty
language: java
+# sudo required for OpenJDK7 support per:
+# https://github.com/travis-ci/travis-ci/issues/7884#issuecomment-309689557
+sudo: required
jdk:
- - oraclejdk7
+ - openjdk7
+ - oraclejdk8
+ - openjdk8
before_install:
- "git clone https://github.com/iipc/travis.git target/travis"
@@ -11,8 +17,8 @@ before_script:
- "export MAVEN_OPTS=-Xmx512m"
- "ulimit -u 2048"
-script:
- - "target/travis/deploy-if.sh"
+script:
+ - mvn install -B -V
# whitelist in the master branch only
branches:
@@ -23,4 +29,3 @@ env:
global:
- secure: "qDKjVdoe4Qcz4WfXiQydU7tyl51T62FUJrjqu4FUPBcgeQhFQiggwhpaE6xCOzOpxbsuBi2R1c8gMQf5esE5iDL5jZMu+kz++dYbuzMTd13ttvZWMW5wRPH0H8iHk609FP/RDtVKKBr7WO0JvvIAZEhWNHZrLXBrrKgdTey171g="
- secure: "FXGBKJNP9X7ePJfS4eYTZtoFo4RT1sxor34XxncSJr7uV6ggtZb4B4WNd16IlLcDk6E32sx8YoWdltaOGwQ5Vg/kux5Ko/wKZCoccS018Ln1bRT86dD1KoPY34rGoNJVQxe7J/1MPqpBKwmi2XCKfzpsEh3W7bbIqg8w9MEOOZA="
-
From 79aed910b44510294367a4acf4f3e6376b1c62c0 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 23 Aug 2017 17:04:52 +0200
Subject: [PATCH 004/177] ExtractingParseObserver: get links from onClick
attributes - extract links from JavaScript code snippets in onClick
attributes of INPUT and DIV elements
---
.../html/ExtractingParseObserver.java | 40 +++++++++++++++++-
.../html/ExtractingParseObserverTest.java | 10 +++++
.../resource/html/link-extraction-test.warc | 42 +++++++++++++++++++
3 files changed, 91 insertions(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index 52989455..e4fa83c7 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -39,6 +39,15 @@ public class ExtractingParseObserver implements ParseObserver {
protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString);
+ protected static String jsOnClickUrl1PatString = // case-insensitive regex, anchored at both ends: optional javascript: prefix, optional window/top/document/self/parent prefix, then an assignment of a quoted value to location or location.href
+ "(?i)^(?:javascript:)?(?:(?:window|top|document|self|parent)\\.)?location(?:\\.href)?\\s*=\\s*('|')([^'\"]{3,256})\\1$"; // group 1 = opening quote (closed via backreference), group 2 = 3-256 quote-free characters; NOTE(review): the group 1 alternation repeats the same quote character as shown here - confirm the intended second alternative
+ protected static String jsOnClickUrl2PatString = // case-insensitive regex, anchored at the start only: optional javascript: prefix, then window.open( or parent.open( with a quoted first argument
+ "(?i)^(?:javascript:)?(?:window|parent)\\.open\\((['\"]|')([^\"']{3,256}?)\\1[,)]"; // group 2 = the quoted argument; the closing quote must be followed by a comma or a closing parenthesis; NOTE(review): the alternative after the character class is already covered by that class as shown here - confirm
+ protected static Pattern[] jsOnClickUrlPatterns = { // compiled once; addHrefsOnclick iterates over this array in order
+ Pattern.compile(jsOnClickUrl1PatString),
+ Pattern.compile(jsOnClickUrl2PatString)
+ };
+
private final static int MAX_TEXT_LEN = 100;
private static final String PATH = "path";
@@ -51,6 +60,7 @@ public class ExtractingParseObserver implements ParseObserver {
extractors.put("APPLET", new AppletTagExtractor());
extractors.put("AREA", new AreaTagExtractor());
extractors.put("BASE", new BaseTagExtractor());
+ extractors.put("DIV", new DivTagExtractor());
extractors.put("EMBED", new EmbedTagExtractor());
extractors.put("FORM", new FormTagExtractor());
extractors.put("FRAME", new FrameTagExtractor());
@@ -268,7 +278,20 @@ private static void addHrefWithAttrs(HTMLMetaData data, TagNode node,
if(l != null) {
data.addHref(l);
}
- }
+ }
+
+ private static void addHrefsOnclick(HTMLMetaData data, TagNode node) { // adds to data every URL that the jsOnClickUrlPatterns find in the onclick attribute of node
+ String onclick = node.getAttribute("onclick");
+ if (onclick != null) { // nothing is added when the node has no onclick attribute
+ String path = makePath(node.getTagName(), "onclick"); // passed under the PATH key in the addHref call below
+ for (Pattern pattern : jsOnClickUrlPatterns) { // every pattern is tried, so one attribute value can yield more than one href
+ String url = patternJSExtract(pattern, onclick);
+ if (url != null) { // null means this pattern did not match
+ data.addHref(PATH, path, "url", url);
+ }
+ }
+ }
+ }
private interface TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs);
@@ -330,6 +353,12 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
+ private static class DivTagExtractor implements TagExtractor { // TagExtractor that this patch registers in the extractors map under the key "DIV"
+ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+ addHrefsOnclick(data,node); // only the onclick attribute is examined here; obs is not used
+ }
+ }
+
private static class EmbedTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src");
@@ -386,6 +415,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
private static class InputTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src","formaction");
+ addHrefsOnclick(data,node);
}
}
@@ -450,4 +480,12 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten
}
}
}
+
+ private static String patternJSExtract(Pattern pattern, String content) { // returns capture group 2 of the first match of pattern in content, or null when there is no match
+ Matcher m = pattern.matcher(content);
+ if (m.find()) { // find() rather than matches(): the match need not span the whole string unless the pattern anchors itself
+ return m.group(2); // the pattern must define at least two groups; in jsOnClickUrlPatterns group 1 is the opening quote and group 2 the quoted value
+ }
+ return null;
+ }
}
diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
index 8f690a06..4828ad64 100644
--- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
+++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -263,6 +263,16 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
{"http://www.your-domain.com/your-page.html", "DIV@/data-href"}
};
checkLinks(extractor.getNext(), fbSocialLinks);
+ String[][] onClickLinks = {
+ {"webpage.html", "DIV@/onclick"},
+ {"index.html", "INPUT@/onclick"},
+ {"http://www.x.com/", "INPUT@/onclick"},
+ {"button-child.php", "INPUT@/onclick"},
+ {"http://example.com/", "INPUT@/onclick"},
+ {"http://example.com/location/href/1.html", "INPUT@/onclick"},
+ {"http://example.com/location/href/2.html", "INPUT@/onclick"}
+ };
+ checkLinks(extractor.getNext(), onClickLinks);
}
}
diff --git a/src/test/resources/org/archive/resource/html/link-extraction-test.warc b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
index ab0e54c8..1a30598e 100644
--- a/src/test/resources/org/archive/resource/html/link-extraction-test.warc
+++ b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
@@ -318,3 +318,45 @@ Content-Type: text/html
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2017-08-23T13:54:59Z
+Content-Type: application/http;msgtype=response
+Content-Length: 1279
+
+HTTP/1.1 200 OK
+Date: Wed, 23 Aug 2017 13:54:59 GMT
+Server: Apache/2.4.18 (Ubuntu)
+Last-Modified: Wed, 23 Aug 2017 13:54:03 GMT
+ETag: "3ca-5576c0b718ab3"
+Accept-Ranges: bytes
+Content-Length: 971
+Vary: Accept-Encoding
+Keep-Alive: timeout=5, max=100
+Connection: Keep-Alive
+Content-Type: text/html
+
+
+
+Test Extraction of URLs from INPUT onClick Attributes
+
+
+
+
+ Click to load webpage
+
+
+
+
+
+