From a23cfebe24a959c929b1fcf9fbb6fc37eae31c76 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sun, 7 Aug 2016 16:49:47 +0200 Subject: [PATCH 1/3] Make regular expression to extract URLs from CSS more restrictive (allow only `"`, `'`, `\"` or `\'` in front of or after the URL). Avoid long-runners when matching the regex due to heavy back-tracking. --- .../java/org/archive/resource/html/ExtractingParseObserver.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index e1f57b55..df3742fa 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -21,7 +21,7 @@ public class ExtractingParseObserver implements ParseObserver { boolean inTitle = false; protected static String cssUrlPatString = - "url\\s*\\(\\s*([\\\\\"']*.+?[\\\\\"']*)\\s*\\)"; + "url\\s*\\(\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\\)"; protected static String cssImportNoUrlPatString = "@import\\s+(('[^']+')|(\"[^\"]+\")|(\\('[^']+'\\))|(\\(\"[^\"]+\"\\))|(\\([^)]+\\))|([a-z0-9_.:/\\\\-]+))\\s*;"; From b918f7f18e94c58a4a74d97e98f3c19465466595 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 4 Jan 2017 18:21:22 +0100 Subject: [PATCH 2/3] Improve clipping of quotation marks in CSS link extraction - clip multiple quotation marks Fix StringIndexOutOfBoundsException in patternCSSExtract - correct check for min. 
required URL length when stripping 4 characters (2 at each end) - simplified code, use non-capturing groups in regular expression --- .../html/ExtractingParseObserver.java | 79 ++++++++++--------- .../html/ExtractingParseObserverTest.java | 48 ++++++----- 2 files changed, 70 insertions(+), 57 deletions(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index df3742fa..45a48808 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -23,7 +23,7 @@ public class ExtractingParseObserver implements ParseObserver { protected static String cssUrlPatString = "url\\s*\\(\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\\)"; protected static String cssImportNoUrlPatString = - "@import\\s+(('[^']+')|(\"[^\"]+\")|(\\('[^']+'\\))|(\\(\"[^\"]+\"\\))|(\\([^)]+\\))|([a-z0-9_.:/\\\\-]+))\\s*;"; + "@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\\('[^']+'\\))|(?:\\(\"[^\"]+\"\\))|(?:\\([^)]+\\))|(?:[a-z0-9_.:/\\\\-]+))\\s*;"; protected static Pattern cssImportNoUrlPattern = Pattern .compile(cssImportNoUrlPatString); @@ -368,40 +368,45 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs } } } - private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) { - Matcher m = pattern.matcher(content); - int idx = 0; - int contentLen = content.length(); - while((idx < contentLen) && m.find(idx)) { - String url = m.group(1); - int origUrlLength = url.length(); - int urlStart = m.start(1); - int urlEnd = m.end(1); - idx = urlEnd; - if(url.length() < 2) { - continue; - } - if ((url.charAt(0) == '(') - && (url.charAt(origUrlLength-1) == ')')) { - url = url.substring(1, origUrlLength - 1); - urlStart += 1; - origUrlLength -= 2; - } - if (url.charAt(0) == '"') { - url = url.substring(1, origUrlLength - 1); - urlStart += 1; - } else if (url.charAt(0) 
== '\'') { - url = url.substring(1, origUrlLength - 1); - urlStart += 1; - } else if (url.charAt(0) == '\\') { - if(url.length() == 2) - continue; - url = url.substring(2, origUrlLength - 2); - urlStart += 2; - } - int urlLength = url.length(); - data.addHref("path","STYLE/#text","href",url); - idx += urlLength; - } - } + private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) { + Matcher m = pattern.matcher(content); + int idx = 0; + int contentLen = content.length(); + if (contentLen > 100000) + // extract URLs only from the first 100 kB + contentLen = 100000; + FIND: + while((idx < contentLen) && m.find()) { + idx = m.end(); + String url = m.group(1); + if(url.length() < 2) { + continue; + } + if ((url.charAt(0) == '(') + && (url.charAt(url.length()-1) == ')')) { + url = url.substring(1, url.length() - 1); + } + CLIP: + while (url.length() > 1) { + if ((url.charAt(0) == '"' || url.charAt(0) == '\'') + && (url.charAt(url.length() - 1) == '"' + || url.charAt(url.length() - 1) == '\'')) { + if(url.length() <= 2) { + // empty URL + continue FIND; + } + url = url.substring(1, url.length() - 1); + } else if (url.charAt(0) == '\\') { + if(url.length() <= 4) { + // empty URL + continue FIND; + } + url = url.substring(2, url.length() - 2); + } else { + break CLIP; + } + } + data.addHref("path","STYLE/#text","href",url); + } + } } diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index 24b6c18a..236b964b 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -19,7 +19,9 @@ public void testHandleStyleNodeExceptions() throws Exception { "url (' ')", "url('\")", "url(')", - "url('\"')" + "url('\"')", + "url('\\\"\"')", + "url(''''')" }; boolean except = false; HTMLMetaData md = new HTMLMetaData(new MetaData()); @@ -37,6 +39,7 @@ 
public void testHandleStyleNodeExceptions() throws Exception { assertFalse(except); } } + public void testHandleStyleNode() throws Exception { String[][] tests = { {""}, @@ -45,31 +48,35 @@ public void testHandleStyleNode() throws Exception { {"url(\"foo.gif\")","foo.gif"}, {"url(\\\"foo.gif\\\")","foo.gif"}, {"url(\\'foo.gif\\')","foo.gif"}, - - }; + {"url(''foo.gif'')","foo.gif"}, + {"url( foo.gif )","foo.gif"}, + {"url('''')"} + }; for(String[] testa : tests) { checkExtract(testa); } - // boolean except = false; -// HTMLMetaData md = new HTMLMetaData(new MetaData()); -// ExtractingParseObserver epo = new ExtractingParseObserver(md); -// for(String css : tests) { -// try { -// TextNode tn = new TextNode(css); -// epo.handleStyleNode(tn); -// } catch(Exception e) { -// System.err.format("And the winner is....(%s)\n", css); -// e.printStackTrace(); -// except = true; -// throw e; -// } -// assertFalse(except); -// } } + + /** + * Test whether the pattern matcher extracts nothing and also does not + * hang up if an overlong CSS link is truncated. 
+ */ + public void testHandleStyleNodeNoHangupTruncated() throws Exception { + StringBuilder sb = new StringBuilder(); + sb.append("url("); + for (int i = 0; i < 500000; i++) + sb.append('\''); + sb.append("foo.gif"); + for (int i = 0; i < 499000; i++) + sb.append('\''); + String[] test = new String[1]; + test[0] = sb.toString(); + checkExtract(test); + } + private void checkExtract(String[] data) throws JSONException { // System.err.format("CSS(%s) want[0](%s)\n",css,want[0]); String css = data[0]; - boolean except = false; HTMLMetaData md = new HTMLMetaData(new MetaData()); ExtractingParseObserver epo = new ExtractingParseObserver(md); try { @@ -87,7 +94,8 @@ private void checkExtract(String[] data) throws JSONException { assertTrue(o instanceof JSONObject); JSONObject jo = (JSONObject) o; - assertEquals(data[i],jo.getString("href")); + assertEquals("CSS link extraction failed for <" + css + ">", + data[i], jo.getString("href")); } } else { assertNull(a); From 194a1faecf30905c840d71d0bc22b6ea5d6a61fe Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 18 Jan 2017 12:29:43 +0100 Subject: [PATCH 3/3] CSS link extraction: clip also unpaired leading and trailing quotation marks --- .../html/ExtractingParseObserver.java | 64 +++++++------------ .../html/ExtractingParseObserverTest.java | 9 +-- 2 files changed, 27 insertions(+), 46 deletions(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index 45a48808..deb8c8c0 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -22,13 +22,18 @@ public class ExtractingParseObserver implements ParseObserver { protected static String cssUrlPatString = "url\\s*\\(\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\\)"; + protected static String cssUrlTrimPatString = + "^(?:\\\\?[\"'])+|(?:\\\\?[\"'])+$"; protected static 
String cssImportNoUrlPatString = - "@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\\('[^']+'\\))|(?:\\(\"[^\"]+\"\\))|(?:\\([^)]+\\))|(?:[a-z0-9_.:/\\\\-]+))\\s*;"; + "@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\\('[^']+'\\))|(?:\\(\"[^\"]+\"\\))|(?:\\([^)]+\\))|(?:[a-z0-9_.:/\\\\-]+))\\s*;"; protected static Pattern cssImportNoUrlPattern = Pattern .compile(cssImportNoUrlPatString); protected static Pattern cssUrlPattern = Pattern.compile(cssUrlPatString); + + protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString); + private final static int MAX_TEXT_LEN = 100; // private static String GLOBAL_ATTR[] = {"background"}; @@ -368,45 +373,20 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs } } } - private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) { - Matcher m = pattern.matcher(content); - int idx = 0; - int contentLen = content.length(); - if (contentLen > 100000) - // extract URLs only from the first 100 kB - contentLen = 100000; - FIND: - while((idx < contentLen) && m.find()) { - idx = m.end(); - String url = m.group(1); - if(url.length() < 2) { - continue; - } - if ((url.charAt(0) == '(') - && (url.charAt(url.length()-1) == ')')) { - url = url.substring(1, url.length() - 1); - } - CLIP: - while (url.length() > 1) { - if ((url.charAt(0) == '"' || url.charAt(0) == '\'') - && (url.charAt(url.length() - 1) == '"' - || url.charAt(url.length() - 1) == '\'')) { - if(url.length() <= 2) { - // empty URL - continue FIND; - } - url = url.substring(1, url.length() - 1); - } else if (url.charAt(0) == '\\') { - if(url.length() <= 4) { - // empty URL - continue FIND; - } - url = url.substring(2, url.length() - 2); - } else { - break CLIP; - } - } - data.addHref("path","STYLE/#text","href",url); - } - } + private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) { + Matcher m = pattern.matcher(content); + int idx = 0; + int contentLen = content.length(); + if 
(contentLen > 100000) + // extract URLs only from the first 100 kB + contentLen = 100000; + while((idx < contentLen) && m.find()) { + idx = m.end(); + String url = m.group(1); + url = cssUrlTrimPattern.matcher(url).replaceAll(""); + if (!url.isEmpty()) { + data.addHref("path","STYLE/#text","href", url); + } + } + } } diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index 236b964b..bfbd6f02 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -20,8 +20,8 @@ public void testHandleStyleNodeExceptions() throws Exception { "url('\")", "url(')", "url('\"')", - "url('\\\"\"')", - "url(''''')" + "url('\\\"\"')", + "url(''''')" }; boolean except = false; HTMLMetaData md = new HTMLMetaData(new MetaData()); @@ -50,7 +50,8 @@ public void testHandleStyleNode() throws Exception { {"url(\\'foo.gif\\')","foo.gif"}, {"url(''foo.gif'')","foo.gif"}, {"url( foo.gif )","foo.gif"}, - {"url('''')"} + {"url('''')"}, + {"url('foo.gif'')","foo.gif"}, }; for(String[] testa : tests) { checkExtract(testa); @@ -98,7 +99,7 @@ private void checkExtract(String[] data) throws JSONException { data[i], jo.getString("href")); } } else { - assertNull(a); + assertNull("Expected no extracted link for <" + css + ">", a); } }