@@ -39,6 +39,15 @@ public class ExtractingParseObserver implements ParseObserver {
3939
4040 protected static Pattern cssUrlTrimPattern = Pattern .compile (cssUrlTrimPatString );
4141
42+ protected static String jsOnClickUrl1PatString =
43+ "(?i)^(?:javascript:)?(?:(?:window|top|document|self|parent)\\ .)?location(?:\\ .href)?\\ s*=\\ s*('|')([^'\" ]{3,256})\\ 1$" ;
44+ protected static String jsOnClickUrl2PatString =
45+ "(?i)^(?:javascript:)?(?:window|parent)\\ .open\\ ((['\" ]|')([^\" ']{3,256}?)\\ 1[,)]" ;
46+ protected static Pattern [] jsOnClickUrlPatterns = {
47+ Pattern .compile (jsOnClickUrl1PatString ),
48+ Pattern .compile (jsOnClickUrl2PatString )
49+ };
50+
4251 private final static int MAX_TEXT_LEN = 100 ;
4352
4453 private static final String PATH = "path" ;
@@ -51,6 +60,7 @@ public class ExtractingParseObserver implements ParseObserver {
5160 extractors .put ("APPLET" , new AppletTagExtractor ());
5261 extractors .put ("AREA" , new AreaTagExtractor ());
5362 extractors .put ("BASE" , new BaseTagExtractor ());
63+ extractors .put ("DIV" , new DivTagExtractor ());
5464 extractors .put ("EMBED" , new EmbedTagExtractor ());
5565 extractors .put ("FORM" , new FormTagExtractor ());
5666 extractors .put ("FRAME" , new FrameTagExtractor ());
@@ -268,7 +278,20 @@ private static void addHrefWithAttrs(HTMLMetaData data, TagNode node,
268278 if (l != null ) {
269279 data .addHref (l );
270280 }
271- }
281+ }
282+
283+ private static void addHrefsOnclick (HTMLMetaData data , TagNode node ) {
284+ String onclick = node .getAttribute ("onclick" );
285+ if (onclick != null ) {
286+ String path = makePath (node .getTagName (), "onclick" );
287+ for (Pattern pattern : jsOnClickUrlPatterns ) {
288+ String url = patternJSExtract (pattern , onclick );
289+ if (url != null ) {
290+ data .addHref (PATH , path , "url" , url );
291+ }
292+ }
293+ }
294+ }
272295
273296 private interface TagExtractor {
274297 public void extract (HTMLMetaData data , TagNode node , ExtractingParseObserver obs );
@@ -345,6 +368,12 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
345368 }
346369 }
347370
371+ private static class DivTagExtractor implements TagExtractor {
372+ public void extract (HTMLMetaData data , TagNode node , ExtractingParseObserver obs ) {
373+ addHrefsOnclick (data ,node );
374+ }
375+ }
376+
348377 private static class EmbedTagExtractor implements TagExtractor {
349378 public void extract (HTMLMetaData data , TagNode node , ExtractingParseObserver obs ) {
350379 addBasicHrefs (data ,node ,"src" );
@@ -401,6 +430,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
401430 private static class InputTagExtractor implements TagExtractor {
402431 public void extract (HTMLMetaData data , TagNode node , ExtractingParseObserver obs ) {
403432 addBasicHrefs (data ,node ,"src" ,"formaction" );
433+ addHrefsOnclick (data ,node );
404434 }
405435 }
406436
@@ -465,4 +495,12 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten
465495 }
466496 }
467497 }
498+
499+ private static String patternJSExtract (Pattern pattern , String content ) {
500+ Matcher m = pattern .matcher (content );
501+ if (m .find ()) {
502+ return m .group (2 );
503+ }
504+ return null ;
505+ }
468506}
0 commit comments