Skip to content

Commit 3d0cdd7

Browse files
authored
Merge pull request iipc#85 from sebastian-nagel/cc-8-wat-extract-onclick-links
ExtractingParseObserver: extract links from onClick attributes
2 parents 4586a94 + 79aed91 commit 3d0cdd7

3 files changed

Lines changed: 91 additions & 1 deletion

File tree

src/main/java/org/archive/resource/html/ExtractingParseObserver.java

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,15 @@ public class ExtractingParseObserver implements ParseObserver {
3939

4040
protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString);
4141

42+
/**
 * Regular expression source for onclick handlers that assign a URL to
 * {@code location} / {@code location.href}, e.g.
 * {@code window.location.href='page.html'}.
 *
 * The expression is case-insensitive and must match the whole attribute
 * value: an optional {@code javascript:} prefix, an optional
 * window/top/document/self/parent qualifier, {@code location} with an
 * optional {@code .href}, an assignment, and a quoted URL. Group 1 is the
 * quote delimiter, either an apostrophe or the still entity-encoded form
 * {@code &#39;} (the test fixture quotes URLs both ways); group 2 is the URL,
 * 3 to 256 characters containing no quote characters. The back-reference
 * requires the closing delimiter to equal the opening one.
 */
protected static String jsOnClickUrl1PatString =
43+
"(?i)^(?:javascript:)?(?:(?:window|top|document|self|parent)\\.)?location(?:\\.href)?\\s*=\\s*('|&#39;)([^'\"]{3,256})\\1$";
44+
/**
 * Regular expression source for onclick handlers that open a URL with
 * {@code window.open(...)} or {@code parent.open(...)}.
 *
 * The expression is case-insensitive and anchored at the start of the
 * attribute value only, so further arguments and statements may follow.
 * Group 1 is the quote delimiter: an apostrophe, a double quote, or the
 * still entity-encoded apostrophe {@code &#39;} (the test fixture uses all
 * three). Group 2 is the URL, 3 to 256 characters containing no quote
 * characters; it is matched reluctantly so that it stops at the first
 * occurrence of the closing delimiter, which must be followed by
 * {@code ,} or {@code )}.
 */
protected static String jsOnClickUrl2PatString =
45+
"(?i)^(?:javascript:)?(?:window|parent)\\.open\\((['\"]|&#39;)([^\"']{3,256}?)\\1[,)]";
46+
/**
 * Compiled forms of the two onclick URL expressions, in the order
 * location-assignment first, open()-call second. The array is iterated
 * when an onclick attribute value is scanned for links.
 */
protected static Pattern[] jsOnClickUrlPatterns = {
47+
Pattern.compile(jsOnClickUrl1PatString),
48+
Pattern.compile(jsOnClickUrl2PatString)
49+
};
50+
4251
private final static int MAX_TEXT_LEN = 100;
4352

4453
private static final String PATH = "path";
@@ -51,6 +60,7 @@ public class ExtractingParseObserver implements ParseObserver {
5160
extractors.put("APPLET", new AppletTagExtractor());
5261
extractors.put("AREA", new AreaTagExtractor());
5362
extractors.put("BASE", new BaseTagExtractor());
63+
extractors.put("DIV", new DivTagExtractor());
5464
extractors.put("EMBED", new EmbedTagExtractor());
5565
extractors.put("FORM", new FormTagExtractor());
5666
extractors.put("FRAME", new FrameTagExtractor());
@@ -268,7 +278,20 @@ private static void addHrefWithAttrs(HTMLMetaData data, TagNode node,
268278
if(l != null) {
269279
data.addHref(l);
270280
}
271-
}
281+
}
282+
283+
/**
 * Extracts link URLs from the {@code onclick} attribute of a tag and
 * records them in the metadata.
 *
 * Does nothing when the tag has no {@code onclick} attribute. Otherwise the
 * attribute value is tried against every pattern in
 * {@code jsOnClickUrlPatterns}; each pattern that yields a URL adds one
 * href entry whose path is built from the tag name and "onclick"
 * (the unit test expects paths such as {@code DIV@/onclick}).
 *
 * @param data metadata collector receiving the extracted links
 * @param node tag whose onclick attribute is inspected
 */
private static void addHrefsOnclick(HTMLMetaData data, TagNode node) {
284+
String onclick = node.getAttribute("onclick");
285+
if (onclick != null) {
286+
// The path is the same for every URL found in this attribute.
String path = makePath(node.getTagName(), "onclick");
287+
for (Pattern pattern : jsOnClickUrlPatterns) {
288+
String url = patternJSExtract(pattern, onclick);
289+
// null means this pattern did not match the handler code.
if (url != null) {
290+
data.addHref(PATH, path, "url", url);
291+
}
292+
}
293+
}
294+
}
272295

273296
private interface TagExtractor {
274297
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs);
@@ -345,6 +368,12 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
345368
}
346369
}
347370

371+
/**
 * Extractor registered for DIV tags; the only link source it inspects is
 * the tag's onclick attribute.
 */
private static class DivTagExtractor implements TagExtractor {
372+
// The observer argument is not used by this extractor.
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
373+
addHrefsOnclick(data,node);
374+
}
375+
}
376+
348377
private static class EmbedTagExtractor implements TagExtractor {
349378
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
350379
addBasicHrefs(data,node,"src");
@@ -401,6 +430,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
401430
/**
 * Extractor for INPUT tags. Hands the "src" and "formaction" attributes to
 * addBasicHrefs (presumably recording their values as links; that helper
 * is not visible here) and additionally scans the onclick attribute for
 * URLs.
 */
private static class InputTagExtractor implements TagExtractor {
402431
// The observer argument is not used by this extractor.
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
403432
addBasicHrefs(data,node,"src","formaction");
433+
addHrefsOnclick(data,node);
404434
}
405435
}
406436

@@ -465,4 +495,12 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten
465495
}
466496
}
467497
}
498+
499+
/**
 * Applies a pattern to a piece of JavaScript and returns the URL it
 * captures.
 *
 * @param pattern expression whose second capturing group holds the URL
 * @param content text to search, e.g. an onclick attribute value
 * @return capturing group 2 of the first match, or {@code null} when the
 *         pattern does not match anywhere in {@code content}
 */
private static String patternJSExtract(Pattern pattern, String content) {
500+
Matcher m = pattern.matcher(content);
501+
// find() rather than matches(): a pattern may cover only a prefix of the content.
if (m.find()) {
502+
return m.group(2);
503+
}
504+
return null;
505+
}
468506
}

src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,16 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
263263
{"http://www.your-domain.com/your-page.html", "DIV@/data-href"}
264264
};
265265
checkLinks(extractor.getNext(), fbSocialLinks);
266+
String[][] onClickLinks = {
267+
{"webpage.html", "DIV@/onclick"},
268+
{"index.html", "INPUT@/onclick"},
269+
{"http://www.x.com/", "INPUT@/onclick"},
270+
{"button-child.php", "INPUT@/onclick"},
271+
{"http://example.com/", "INPUT@/onclick"},
272+
{"http://example.com/location/href/1.html", "INPUT@/onclick"},
273+
{"http://example.com/location/href/2.html", "INPUT@/onclick"}
274+
};
275+
checkLinks(extractor.getNext(), onClickLinks);
266276
}
267277

268278
}

src/test/resources/org/archive/resource/html/link-extraction-test.warc

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,3 +318,45 @@ Content-Type: text/html
318318
</div>
319319

320320

321+
WARC/1.0
322+
WARC-Type: response
323+
WARC-Date: 2017-08-23T13:54:59Z
324+
Content-Type: application/http;msgtype=response
325+
Content-Length: 1279
326+
327+
HTTP/1.1 200 OK
328+
Date: Wed, 23 Aug 2017 13:54:59 GMT
329+
Server: Apache/2.4.18 (Ubuntu)
330+
Last-Modified: Wed, 23 Aug 2017 13:54:03 GMT
331+
ETag: "3ca-5576c0b718ab3"
332+
Accept-Ranges: bytes
333+
Content-Length: 971
334+
Vary: Accept-Encoding
335+
Keep-Alive: timeout=5, max=100
336+
Connection: Keep-Alive
337+
Content-Type: text/html
338+
339+
<!DOCTYPE html>
340+
<head>
341+
<title>Test Extraction of URLs from INPUT onClick Attributes</title>
342+
<meta charset="utf-8">
343+
</head>
344+
345+
<body>
346+
<div onclick="location.href='webpage.html'">Click to load webpage</div>
347+
348+
<form>
349+
<input type=button onClick="parent.location='index.html'" value='click here'/>
350+
<input type=button onClick="parent.open('http://www.x.com/')" value='new window'/>
351+
<input type=button onClick=window.open("button-child.php","demo","width=550,height=300,left=150,top=200,toolbar=0,status=0,");
352+
value="Open child Window"/>
353+
<input type="button" value="Open Window 2" onclick="window.open(&#39;http://example.com/&#39;, #39;width=500&#39;);"/>
354+
<input type="button" value="Open href 1" onclick="window.location.href='http://example.com/location/href/1.html'"/>
355+
<input type="button" value="Open href 2" onclick="Javascript:document.location.href=&#39;http://example.com/location/href/2.html&#39;"/>
356+
</form>
357+
358+
359+
</body>
360+
</html>
361+
362+

0 commit comments

Comments
 (0)