Skip to content

Commit 194a1fa

Browse files
CSS link extraction: also clip unpaired leading and trailing quotation marks
1 parent: b918f7f; commit: 194a1fa

2 files changed

Lines changed: 27 additions & 46 deletions

File tree

src/main/java/org/archive/resource/html/ExtractingParseObserver.java

Lines changed: 22 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -22,13 +22,18 @@ public class ExtractingParseObserver implements ParseObserver {
2222

2323
protected static String cssUrlPatString =
2424
"url\\s*\\(\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\\)";
25+
protected static String cssUrlTrimPatString =
26+
"^(?:\\\\?[\"'])+|(?:\\\\?[\"'])+$";
2527
protected static String cssImportNoUrlPatString =
26-
"@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\\('[^']+'\\))|(?:\\(\"[^\"]+\"\\))|(?:\\([^)]+\\))|(?:[a-z0-9_.:/\\\\-]+))\\s*;";
28+
"@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\\('[^']+'\\))|(?:\\(\"[^\"]+\"\\))|(?:\\([^)]+\\))|(?:[a-z0-9_.:/\\\\-]+))\\s*;";
2729

2830
protected static Pattern cssImportNoUrlPattern = Pattern
2931
.compile(cssImportNoUrlPatString);
3032

3133
protected static Pattern cssUrlPattern = Pattern.compile(cssUrlPatString);
34+
35+
protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString);
36+
3237
private final static int MAX_TEXT_LEN = 100;
3338

3439
// private static String GLOBAL_ATTR[] = {"background"};
@@ -368,45 +373,20 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
368373
}
369374
}
370375
}
371-
private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) {
372-
Matcher m = pattern.matcher(content);
373-
int idx = 0;
374-
int contentLen = content.length();
375-
if (contentLen > 100000)
376-
// extract URLs only from the first 100 kB
377-
contentLen = 100000;
378-
FIND:
379-
while((idx < contentLen) && m.find()) {
380-
idx = m.end();
381-
String url = m.group(1);
382-
if(url.length() < 2) {
383-
continue;
384-
}
385-
if ((url.charAt(0) == '(')
386-
&& (url.charAt(url.length()-1) == ')')) {
387-
url = url.substring(1, url.length() - 1);
388-
}
389-
CLIP:
390-
while (url.length() > 1) {
391-
if ((url.charAt(0) == '"' || url.charAt(0) == '\'')
392-
&& (url.charAt(url.length() - 1) == '"'
393-
|| url.charAt(url.length() - 1) == '\'')) {
394-
if(url.length() <= 2) {
395-
// empty URL
396-
continue FIND;
397-
}
398-
url = url.substring(1, url.length() - 1);
399-
} else if (url.charAt(0) == '\\') {
400-
if(url.length() <= 4) {
401-
// empty URL
402-
continue FIND;
403-
}
404-
url = url.substring(2, url.length() - 2);
405-
} else {
406-
break CLIP;
407-
}
408-
}
409-
data.addHref("path","STYLE/#text","href",url);
410-
}
411-
}
376+
private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) {
377+
Matcher m = pattern.matcher(content);
378+
int idx = 0;
379+
int contentLen = content.length();
380+
if (contentLen > 100000)
381+
// extract URLs only from the first 100 kB
382+
contentLen = 100000;
383+
while((idx < contentLen) && m.find()) {
384+
idx = m.end();
385+
String url = m.group(1);
386+
url = cssUrlTrimPattern.matcher(url).replaceAll("");
387+
if (!url.isEmpty()) {
388+
data.addHref("path","STYLE/#text","href", url);
389+
}
390+
}
391+
}
412392
}

src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -20,8 +20,8 @@ public void testHandleStyleNodeExceptions() throws Exception {
2020
"url('\")",
2121
"url(')",
2222
"url('\"')",
23-
"url('\\\"\"')",
24-
"url(''''')"
23+
"url('\\\"\"')",
24+
"url(''''')"
2525
};
2626
boolean except = false;
2727
HTMLMetaData md = new HTMLMetaData(new MetaData());
@@ -50,7 +50,8 @@ public void testHandleStyleNode() throws Exception {
5050
{"url(\\'foo.gif\\')","foo.gif"},
5151
{"url(''foo.gif'')","foo.gif"},
5252
{"url( foo.gif )","foo.gif"},
53-
{"url('''')"}
53+
{"url('''')"},
54+
{"url('foo.gif'')","foo.gif"},
5455
};
5556
for(String[] testa : tests) {
5657
checkExtract(testa);
@@ -98,7 +99,7 @@ private void checkExtract(String[] data) throws JSONException {
9899
data[i], jo.getString("href"));
99100
}
100101
} else {
101-
assertNull(a);
102+
assertNull("Expected no extracted link for <" + css + ">", a);
102103
}
103104
}
104105

0 commit comments

Comments
 (0)