Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,15 @@ public class ExtractingParseObserver implements ParseObserver {

protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString);

/**
 * Regex source for {@code onclick} values that assign a URL to {@code location}
 * or {@code location.href}, optionally prefixed with {@code javascript:} and one of
 * the qualifiers {@code window}, {@code top}, {@code document}, {@code self} or
 * {@code parent}. Matching is case-insensitive and anchored at both ends.
 * <p>
 * Group 1 is the opening delimiter: a single quote, a double quote, or the literal
 * character reference {@code &#39;}. The same delimiter must close the URL.
 * Group 2 is the URL itself (3 to 256 characters, containing no quote character).
 * <p>
 * NOTE(review): the {@code &#39;} alternative covers attribute values that reach the
 * matcher without entity decoding; whether the HTML parser decodes entities is not
 * visible from this class - confirm against the parser in use.
 */
protected static String jsOnClickUrl1PatString =
		"(?i)^(?:javascript:)?(?:(?:window|top|document|self|parent)\\.)?location(?:\\.href)?\\s*=\\s*(['\"]|&#39;)([^'\"]{3,256})\\1$";
/**
 * Regex source for {@code onclick} values that call {@code window.open(...)} or
 * {@code parent.open(...)}, optionally prefixed with {@code javascript:}.
 * Matching is case-insensitive and anchored at the start only.
 * <p>
 * Group 1 is the delimiter of the first argument (single quote, double quote, or the
 * literal character reference {@code &#39;}); group 2 is the first argument, i.e. the
 * URL (3 to 256 characters, no quote character, matched reluctantly so that it stops
 * at the first closing delimiter followed by {@code ,} or {@code )}).
 */
protected static String jsOnClickUrl2PatString =
		"(?i)^(?:javascript:)?(?:window|parent)\\.open\\((['\"]|&#39;)([^\"']{3,256}?)\\1[,)]";
/**
 * Compiled forms of the onclick URL patterns, in the order they are tried.
 * In every pattern the extracted URL is capture group 2.
 */
protected static Pattern[] jsOnClickUrlPatterns = {
		Pattern.compile(jsOnClickUrl1PatString),
		Pattern.compile(jsOnClickUrl2PatString)
};

// NOTE(review): presumably an upper bound on the length of extracted text;
// its use is not visible in this part of the file - confirm.
private final static int MAX_TEXT_LEN = 100;

// Key passed to HTMLMetaData.addHref(...) immediately before the element path
// (see addHrefsOnclick, which pairs it with the result of makePath).
private static final String PATH = "path";
Expand All @@ -51,6 +60,7 @@ public class ExtractingParseObserver implements ParseObserver {
extractors.put("APPLET", new AppletTagExtractor());
extractors.put("AREA", new AreaTagExtractor());
extractors.put("BASE", new BaseTagExtractor());
extractors.put("DIV", new DivTagExtractor());
extractors.put("EMBED", new EmbedTagExtractor());
extractors.put("FORM", new FormTagExtractor());
extractors.put("FRAME", new FrameTagExtractor());
Expand Down Expand Up @@ -268,7 +278,20 @@ private static void addHrefWithAttrs(HTMLMetaData data, TagNode node,
if(l != null) {
data.addHref(l);
}
}
}

/**
 * Looks for a URL inside the node's {@code onclick} attribute and records it.
 * <p>
 * Each pattern in {@code jsOnClickUrlPatterns} is tried in order against the
 * attribute value; every pattern that yields a URL contributes one href entry
 * whose path is built from the tag name and {@code "onclick"}.
 * Nodes without an {@code onclick} attribute are ignored.
 *
 * @param data receiver of the extracted links
 * @param node element whose {@code onclick} attribute is inspected
 */
private static void addHrefsOnclick(HTMLMetaData data, TagNode node) {
	final String handler = node.getAttribute("onclick");
	if (handler == null) {
		return;
	}

	final String elementPath = makePath(node.getTagName(), "onclick");
	for (int i = 0; i < jsOnClickUrlPatterns.length; i++) {
		final String link = patternJSExtract(jsOnClickUrlPatterns[i], handler);
		if (link == null) {
			continue;
		}
		data.addHref(PATH, elementPath, "url", link);
	}
}

private interface TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs);
Expand Down Expand Up @@ -330,6 +353,12 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}

/**
 * Extractor registered for {@code DIV} elements: the only link source it
 * considers is the element's {@code onclick} attribute.
 */
private static class DivTagExtractor implements TagExtractor {

	/** Delegates to {@code addHrefsOnclick}; {@code obs} is not used. */
	public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
		addHrefsOnclick(data, node);
	}

}

private static class EmbedTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src");
Expand Down Expand Up @@ -386,6 +415,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
/**
 * Extractor for {@code INPUT} elements. Links are taken from the {@code src}
 * and {@code formaction} attributes and, after that, from any URL found in
 * the {@code onclick} attribute.
 */
private static class InputTagExtractor implements TagExtractor {

	/** Runs the attribute-based extraction first, then the onclick one; {@code obs} is not used. */
	public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
		addBasicHrefs(data, node, "src", "formaction");
		addHrefsOnclick(data, node);
	}

}

Expand Down Expand Up @@ -450,4 +480,12 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten
}
}
}

/**
 * Searches {@code content} for the first occurrence of {@code pattern} and
 * returns what its second capture group matched.
 *
 * @param pattern expression with at least two capture groups; group 2 holds the value of interest
 * @param content text to search
 * @return the text of group 2 for the first match, or {@code null} when the pattern is not found
 */
private static String patternJSExtract(Pattern pattern, String content) {
	final Matcher firstHit = pattern.matcher(content);
	return firstHit.find() ? firstHit.group(2) : null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,16 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
{"http://www.your-domain.com/your-page.html", "DIV@/data-href"}
};
checkLinks(extractor.getNext(), fbSocialLinks);
String[][] onClickLinks = {
{"webpage.html", "DIV@/onclick"},
{"index.html", "INPUT@/onclick"},
{"http://www.x.com/", "INPUT@/onclick"},
{"button-child.php", "INPUT@/onclick"},
{"http://example.com/", "INPUT@/onclick"},
{"http://example.com/location/href/1.html", "INPUT@/onclick"},
{"http://example.com/location/href/2.html", "INPUT@/onclick"}
};
checkLinks(extractor.getNext(), onClickLinks);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -318,3 +318,45 @@ Content-Type: text/html
</div>


WARC/1.0
WARC-Type: response
WARC-Date: 2017-08-23T13:54:59Z
Content-Type: application/http;msgtype=response
Content-Length: 1279

HTTP/1.1 200 OK
Date: Wed, 23 Aug 2017 13:54:59 GMT
Server: Apache/2.4.18 (Ubuntu)
Last-Modified: Wed, 23 Aug 2017 13:54:03 GMT
ETag: "3ca-5576c0b718ab3"
Accept-Ranges: bytes
Content-Length: 971
Vary: Accept-Encoding
Keep-Alive: timeout=5, max=100
Connection: Keep-Alive
Content-Type: text/html

<!DOCTYPE html>
<head>
<title>Test Extraction of URLs from INPUT onClick Attributes</title>
<meta charset="utf-8">
</head>

<body>
<div onclick="location.href='webpage.html'">Click to load webpage</div>

<form>
<input type=button onClick="parent.location='index.html'" value='click here'/>
<input type=button onClick="parent.open('http://www.x.com/')" value='new window'/>
<input type=button onClick=window.open("button-child.php","demo","width=550,height=300,left=150,top=200,toolbar=0,status=0,");
value="Open child Window"/>
<input type="button" value="Open Window 2" onclick="window.open(&#39;http://example.com/&#39;, #39;width=500&#39;);"/>
<input type="button" value="Open href 1" onclick="window.location.href='http://example.com/location/href/1.html'"/>
<input type="button" value="Open href 2" onclick="Javascript:document.location.href=&#39;http://example.com/location/href/2.html&#39;"/>
</form>


</body>
</html>