diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index 52989455..e4fa83c7 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -39,6 +39,15 @@ public class ExtractingParseObserver implements ParseObserver { protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString); + protected static String jsOnClickUrl1PatString = + "(?i)^(?:javascript:)?(?:(?:window|top|document|self|parent)\\.)?location(?:\\.href)?\\s*=\\s*('|')([^'\"]{3,256})\\1$"; + protected static String jsOnClickUrl2PatString = + "(?i)^(?:javascript:)?(?:window|parent)\\.open\\((['\"]|')([^\"']{3,256}?)\\1[,)]"; + protected static Pattern[] jsOnClickUrlPatterns = { + Pattern.compile(jsOnClickUrl1PatString), + Pattern.compile(jsOnClickUrl2PatString) + }; + private final static int MAX_TEXT_LEN = 100; private static final String PATH = "path"; @@ -51,6 +60,7 @@ public class ExtractingParseObserver implements ParseObserver { extractors.put("APPLET", new AppletTagExtractor()); extractors.put("AREA", new AreaTagExtractor()); extractors.put("BASE", new BaseTagExtractor()); + extractors.put("DIV", new DivTagExtractor()); extractors.put("EMBED", new EmbedTagExtractor()); extractors.put("FORM", new FormTagExtractor()); extractors.put("FRAME", new FrameTagExtractor()); @@ -268,7 +278,20 @@ private static void addHrefWithAttrs(HTMLMetaData data, TagNode node, if(l != null) { data.addHref(l); } - } + } + + private static void addHrefsOnclick(HTMLMetaData data, TagNode node) { + String onclick = node.getAttribute("onclick"); + if (onclick != null) { + String path = makePath(node.getTagName(), "onclick"); + for (Pattern pattern : jsOnClickUrlPatterns) { + String url = patternJSExtract(pattern, onclick); + if (url != null) { + data.addHref(PATH, path, "url", url); + } + } + } + } private interface TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs); @@ -330,6 +353,12 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs } } + private static class DivTagExtractor implements TagExtractor { + public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { + addHrefsOnclick(data,node); + } + } + private static class EmbedTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { addBasicHrefs(data,node,"src"); @@ -386,6 +415,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs private static class InputTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { addBasicHrefs(data,node,"src","formaction"); + addHrefsOnclick(data,node); } } @@ -450,4 +480,12 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten } } } + + private static String patternJSExtract(Pattern pattern, String content) { + Matcher m = pattern.matcher(content); + if (m.find()) { + return m.group(2); + } + return null; + } } diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index 8f690a06..4828ad64 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -263,6 +263,16 @@ public void testLinkExtraction() throws ResourceParseException, IOException { {"http://www.your-domain.com/your-page.html", "DIV@/data-href"} }; checkLinks(extractor.getNext(), fbSocialLinks); + String[][] onClickLinks = { + {"webpage.html", "DIV@/onclick"}, + {"index.html", "INPUT@/onclick"}, + {"http://www.x.com/", "INPUT@/onclick"}, + {"button-child.php", "INPUT@/onclick"}, + {"http://example.com/", "INPUT@/onclick"}, + {"http://example.com/location/href/1.html", "INPUT@/onclick"}, + {"http://example.com/location/href/2.html", "INPUT@/onclick"} + }; + checkLinks(extractor.getNext(), onClickLinks); } } diff --git a/src/test/resources/org/archive/resource/html/link-extraction-test.warc b/src/test/resources/org/archive/resource/html/link-extraction-test.warc index ab0e54c8..1a30598e 100644 --- a/src/test/resources/org/archive/resource/html/link-extraction-test.warc +++ b/src/test/resources/org/archive/resource/html/link-extraction-test.warc @@ -318,3 +318,45 @@ Content-Type: text/html +WARC/1.0 +WARC-Type: response +WARC-Date: 2017-08-23T13:54:59Z +Content-Type: application/http;msgtype=response +Content-Length: 1279 + +HTTP/1.1 200 OK +Date: Wed, 23 Aug 2017 13:54:59 GMT +Server: Apache/2.4.18 (Ubuntu) +Last-Modified: Wed, 23 Aug 2017 13:54:03 GMT +ETag: "3ca-5576c0b718ab3" +Accept-Ranges: bytes +Content-Length: 971 +Vary: Accept-Encoding +Keep-Alive: timeout=5, max=100 +Connection: Keep-Alive +Content-Type: text/html + + + +Test Extraction of URLs from INPUT onClick Attributes + + + + +
Click to load webpage
+ +
+ + + + + + +
+ + + + + +