From 79aed910b44510294367a4acf4f3e6376b1c62c0 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 23 Aug 2017 17:04:52 +0200
Subject: [PATCH 001/197] ExtractingParseObserver: get links from onClick
attributes - extract links from JavaScript code snippets in onClick
attributes of INPUT and DIV elements
---
.../html/ExtractingParseObserver.java | 40 +++++++++++++++++-
.../html/ExtractingParseObserverTest.java | 10 +++++
.../resource/html/link-extraction-test.warc | 42 +++++++++++++++++++
3 files changed, 91 insertions(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index 52989455..e4fa83c7 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -39,6 +39,15 @@ public class ExtractingParseObserver implements ParseObserver {
protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString);
+ protected static String jsOnClickUrl1PatString = // onclick form 1 (case-insensitive, must span the whole value): optional "javascript:" prefix, optional window/top/document/self/parent qualifier, then location or location.href assigned a quoted string; the 3-256 character quoted value is capture group 2
+ "(?i)^(?:javascript:)?(?:(?:window|top|document|self|parent)\\.)?location(?:\\.href)?\\s*=\\s*('|')([^'\"]{3,256})\\1$"; // NOTE(review): the quote alternation ('|') lists the same literal twice - possibly an HTML character reference for the apostrophe was decoded in transit; confirm against the upstream source
+ protected static String jsOnClickUrl2PatString = // onclick form 2 (case-insensitive, anchored at the start only): optional "javascript:" prefix, then window.open( or parent.open( whose first argument is a quoted string followed by "," or ")"; the reluctantly matched 3-256 character value is capture group 2
+ "(?i)^(?:javascript:)?(?:window|parent)\\.open\\((['\"]|')([^\"']{3,256}?)\\1[,)]"; // NOTE(review): the second alternative (') is already covered by the preceding character class - possibly a decoded character reference as well; confirm against the upstream source
+ protected static Pattern[] jsOnClickUrlPatterns = { // compiled forms of the two pattern strings above; addHrefsOnclick tries each of them against an onclick attribute value
+ Pattern.compile(jsOnClickUrl1PatString),
+ Pattern.compile(jsOnClickUrl2PatString)
+ };
+
private final static int MAX_TEXT_LEN = 100;
private static final String PATH = "path";
@@ -51,6 +60,7 @@ public class ExtractingParseObserver implements ParseObserver {
extractors.put("APPLET", new AppletTagExtractor());
extractors.put("AREA", new AreaTagExtractor());
extractors.put("BASE", new BaseTagExtractor());
+ extractors.put("DIV", new DivTagExtractor());
extractors.put("EMBED", new EmbedTagExtractor());
extractors.put("FORM", new FormTagExtractor());
extractors.put("FRAME", new FrameTagExtractor());
@@ -268,7 +278,20 @@ private static void addHrefWithAttrs(HTMLMetaData data, TagNode node,
if(l != null) {
data.addHref(l);
}
- }
+ }
+
+ /**
+  * Looks for URLs inside the JavaScript snippet held by a node's
+  * {@code onclick} attribute and records every hit as an href.
+  *
+  * Each entry of {@code jsOnClickUrlPatterns} is tried in array order; a URL
+  * returned by {@code patternJSExtract} is added to {@code data} together
+  * with the path built by {@code makePath} from the tag name and "onclick".
+  *
+  * @param data receives the extracted links
+  * @param node element whose {@code onclick} attribute is inspected; nothing
+  *        is added when the attribute is absent
+  */
+ private static void addHrefsOnclick(HTMLMetaData data, TagNode node) {
+     final String script = node.getAttribute("onclick");
+     if (script == null) {
+         // no onclick handler on this element
+         return;
+     }
+     final String attrPath = makePath(node.getTagName(), "onclick");
+     for (int i = 0; i < jsOnClickUrlPatterns.length; i++) {
+         final String target = patternJSExtract(jsOnClickUrlPatterns[i], script);
+         if (target == null) {
+             continue;
+         }
+         data.addHref(PATH, attrPath, "url", target);
+     }
+ }
private interface TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs);
@@ -330,6 +353,12 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
+ private static class DivTagExtractor implements TagExtractor { // extractor registered under the "DIV" key of the extractors map
+ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { // obs is not used by this extractor
+ addHrefsOnclick(data,node); // record any URL found in the element's onclick attribute
+ }
+ }
+
private static class EmbedTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src");
@@ -386,6 +415,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
private static class InputTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src","formaction");
+ addHrefsOnclick(data,node);
}
}
@@ -450,4 +480,12 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten
}
}
}
+
+ /**
+  * Searches {@code content} for the first occurrence of {@code pattern} and
+  * hands back what its second capturing group matched.
+  *
+  * @param pattern expression to search with; capture group 2 holds the value
+  *        of interest
+  * @param content text to search
+  * @return the text of capture group 2 of the first match, or {@code null}
+  *         when the pattern is not found
+  */
+ private static String patternJSExtract(Pattern pattern, String content) {
+     final Matcher matcher = pattern.matcher(content);
+     return matcher.find() ? matcher.group(2) : null;
+ }
}
diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
index 8f690a06..4828ad64 100644
--- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
+++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -263,6 +263,16 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
{"http://www.your-domain.com/your-page.html", "DIV@/data-href"}
};
checkLinks(extractor.getNext(), fbSocialLinks);
+ String[][] onClickLinks = {
+ {"webpage.html", "DIV@/onclick"},
+ {"index.html", "INPUT@/onclick"},
+ {"http://www.x.com/", "INPUT@/onclick"},
+ {"button-child.php", "INPUT@/onclick"},
+ {"http://example.com/", "INPUT@/onclick"},
+ {"http://example.com/location/href/1.html", "INPUT@/onclick"},
+ {"http://example.com/location/href/2.html", "INPUT@/onclick"}
+ };
+ checkLinks(extractor.getNext(), onClickLinks);
}
}
diff --git a/src/test/resources/org/archive/resource/html/link-extraction-test.warc b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
index ab0e54c8..1a30598e 100644
--- a/src/test/resources/org/archive/resource/html/link-extraction-test.warc
+++ b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
@@ -318,3 +318,45 @@ Content-Type: text/html
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2017-08-23T13:54:59Z
+Content-Type: application/http;msgtype=response
+Content-Length: 1279
+
+HTTP/1.1 200 OK
+Date: Wed, 23 Aug 2017 13:54:59 GMT
+Server: Apache/2.4.18 (Ubuntu)
+Last-Modified: Wed, 23 Aug 2017 13:54:03 GMT
+ETag: "3ca-5576c0b718ab3"
+Accept-Ranges: bytes
+Content-Length: 971
+Vary: Accept-Encoding
+Keep-Alive: timeout=5, max=100
+Connection: Keep-Alive
+Content-Type: text/html
+
+
+
+Test Extraction of URLs from INPUT onClick Attributes
+
+
+
+
+ Click to load webpage
+
+
+
+
+
+