From bc9543fe181147a2b12f0267a7a3aded3a97e29c Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 5 Jul 2016 15:09:59 +0200 Subject: [PATCH] Fix StringIndexOutOfBoundsException in patternCSSExtract - correct check for min. required URL length when stripping 4 characters (2 at each end) - simplified code, use non-capturing groups in regular expression --- .../html/ExtractingParseObserver.java | 31 ++++++------------- .../html/ExtractingParseObserverTest.java | 3 +- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index e1f57b55..a29744c8 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -23,7 +23,7 @@ public class ExtractingParseObserver implements ParseObserver { protected static String cssUrlPatString = "url\\s*\\(\\s*([\\\\\"']*.+?[\\\\\"']*)\\s*\\)"; protected static String cssImportNoUrlPatString = - "@import\\s+(('[^']+')|(\"[^\"]+\")|(\\('[^']+'\\))|(\\(\"[^\"]+\"\\))|(\\([^)]+\\))|([a-z0-9_.:/\\\\-]+))\\s*;"; + "@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\\('[^']+'\\))|(?:\\(\"[^\"]+\"\\))|(?:\\([^)]+\\))|(?:[a-z0-9_.:/\\\\-]+))\\s*;"; protected static Pattern cssImportNoUrlPattern = Pattern .compile(cssImportNoUrlPatString); @@ -372,36 +372,25 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten Matcher m = pattern.matcher(content); int idx = 0; int contentLen = content.length(); - while((idx < contentLen) && m.find(idx)) { + while((idx < contentLen) && m.find()) { + idx = m.end(); String url = m.group(1); - int origUrlLength = url.length(); - int urlStart = m.start(1); - int urlEnd = m.end(1); - idx = urlEnd; if(url.length() < 2) { continue; } if ((url.charAt(0) == '(') - && (url.charAt(origUrlLength-1) == ')')) { - url = url.substring(1, origUrlLength - 1); - 
urlStart += 1; - origUrlLength -= 2; + && (url.charAt(url.length()-1) == ')')) { + url = url.substring(1, url.length() - 1); } - if (url.charAt(0) == '"') { - url = url.substring(1, origUrlLength - 1); - urlStart += 1; - } else if (url.charAt(0) == '\'') { - url = url.substring(1, origUrlLength - 1); - urlStart += 1; + if (url.charAt(0) == '"' || url.charAt(0) == '\'') { + url = url.substring(1, url.length() - 1); } else if (url.charAt(0) == '\\') { - if(url.length() == 2) + if(url.length() <= 4) { continue; - url = url.substring(2, origUrlLength - 2); - urlStart += 2; + } + url = url.substring(2, url.length() - 2); } - int urlLength = url.length(); data.addHref("path","STYLE/#text","href",url); - idx += urlLength; } } } diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index 24b6c18a..7d8c7ea8 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -19,7 +19,8 @@ public void testHandleStyleNodeExceptions() throws Exception { "url (' ')", "url('\")", "url(')", - "url('\"')" + "url('\"')", + "url('\\\"\"')" }; boolean except = false; HTMLMetaData md = new HTMLMetaData(new MetaData());