diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index e1f57b55..deb8c8c0 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -21,14 +21,19 @@ public class ExtractingParseObserver implements ParseObserver { boolean inTitle = false; protected static String cssUrlPatString = - "url\\s*\\(\\s*([\\\\\"']*.+?[\\\\\"']*)\\s*\\)"; + "url\\s*\\(\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\\)"; + protected static String cssUrlTrimPatString = + "^(?:\\\\?[\"'])+|(?:\\\\?[\"'])+$"; protected static String cssImportNoUrlPatString = - "@import\\s+(('[^']+')|(\"[^\"]+\")|(\\('[^']+'\\))|(\\(\"[^\"]+\"\\))|(\\([^)]+\\))|([a-z0-9_.:/\\\\-]+))\\s*;"; + "@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\\('[^']+'\\))|(?:\\(\"[^\"]+\"\\))|(?:\\([^)]+\\))|(?:[a-z0-9_.:/\\\\-]+))\\s*;"; protected static Pattern cssImportNoUrlPattern = Pattern .compile(cssImportNoUrlPatString); protected static Pattern cssUrlPattern = Pattern.compile(cssUrlPatString); + + protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString); + private final static int MAX_TEXT_LEN = 100; // private static String GLOBAL_ATTR[] = {"background"}; @@ -372,36 +377,16 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten Matcher m = pattern.matcher(content); int idx = 0; int contentLen = content.length(); - while((idx < contentLen) && m.find(idx)) { + if (contentLen > 100000) + // extract URLs only from the first 100 kB + contentLen = 100000; + while((idx < contentLen) && m.find()) { + idx = m.end(); String url = m.group(1); - int origUrlLength = url.length(); - int urlStart = m.start(1); - int urlEnd = m.end(1); - idx = urlEnd; - if(url.length() < 2) { - continue; - } - if ((url.charAt(0) == '(') - && (url.charAt(origUrlLength-1) == ')')) { - url = url.substring(1, 
origUrlLength - 1); - urlStart += 1; - origUrlLength -= 2; - } - if (url.charAt(0) == '"') { - url = url.substring(1, origUrlLength - 1); - urlStart += 1; - } else if (url.charAt(0) == '\'') { - url = url.substring(1, origUrlLength - 1); - urlStart += 1; - } else if (url.charAt(0) == '\\') { - if(url.length() == 2) - continue; - url = url.substring(2, origUrlLength - 2); - urlStart += 2; + url = cssUrlTrimPattern.matcher(url).replaceAll(""); + if (!url.isEmpty()) { + data.addHref("path","STYLE/#text","href", url); } - int urlLength = url.length(); - data.addHref("path","STYLE/#text","href",url); - idx += urlLength; } } } diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index 24b6c18a..bfbd6f02 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -19,7 +19,9 @@ public void testHandleStyleNodeExceptions() throws Exception { "url (' ')", "url('\")", "url(')", - "url('\"')" + "url('\"')", + "url('\\\"\"')", + "url(''''')" }; boolean except = false; HTMLMetaData md = new HTMLMetaData(new MetaData()); @@ -37,6 +39,7 @@ public void testHandleStyleNodeExceptions() throws Exception { assertFalse(except); } } + public void testHandleStyleNode() throws Exception { String[][] tests = { {""}, @@ -45,31 +48,36 @@ public void testHandleStyleNode() throws Exception { {"url(\"foo.gif\")","foo.gif"}, {"url(\\\"foo.gif\\\")","foo.gif"}, {"url(\\'foo.gif\\')","foo.gif"}, - - }; + {"url(''foo.gif'')","foo.gif"}, + {"url( foo.gif )","foo.gif"}, + {"url('''')"}, + {"url('foo.gif'')","foo.gif"}, + }; for(String[] testa : tests) { checkExtract(testa); } - // boolean except = false; -// HTMLMetaData md = new HTMLMetaData(new MetaData()); -// ExtractingParseObserver epo = new ExtractingParseObserver(md); -// for(String css : tests) { -// try { -// TextNode tn = new 
TextNode(css); -// epo.handleStyleNode(tn); -// } catch(Exception e) { -// System.err.format("And the winner is....(%s)\n", css); -// e.printStackTrace(); -// except = true; -// throw e; -// } -// assertFalse(except); -// } } + + /** + * Test that the pattern matcher extracts nothing and also does not + * hang if an overlong CSS link is truncated. + */ + public void testHandleStyleNodeNoHangupTruncated() throws Exception { + StringBuilder sb = new StringBuilder(); + sb.append("url("); + for (int i = 0; i < 500000; i++) + sb.append('\''); + sb.append("foo.gif"); + for (int i = 0; i < 499000; i++) + sb.append('\''); + String[] test = new String[1]; + test[0] = sb.toString(); + checkExtract(test); + } + private void checkExtract(String[] data) throws JSONException { // System.err.format("CSS(%s) want[0](%s)\n",css,want[0]); String css = data[0]; - boolean except = false; HTMLMetaData md = new HTMLMetaData(new MetaData()); ExtractingParseObserver epo = new ExtractingParseObserver(md); try { @@ -87,10 +95,11 @@ private void checkExtract(String[] data) throws JSONException { assertTrue(o instanceof JSONObject); JSONObject jo = (JSONObject) o; - assertEquals(data[i],jo.getString("href")); + assertEquals("CSS link extraction failed for <" + css + ">", + data[i], jo.getString("href")); } } else { - assertNull(a); + assertNull("Expected no extracted link for <" + css + ">", a); } }