Skip to content

Commit ea00e6d

Browse files
Limit the pattern that matches URLs embedded in CSS to at most 8000 characters;
add a unit test. Fixes commoncrawl#12
1 parent a0dcbc5 commit ea00e6d

2 files changed

Lines changed: 19 additions & 1 deletion

File tree

src/main/java/org/archive/resource/html/ExtractingParseObserver.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ public class ExtractingParseObserver implements ParseObserver {
2929
boolean inPre = false;
3030

3131
protected static String cssUrlPatString =
32-
"url\\s*\\(\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\\)";
32+
"url\\s*\\(\\s*([^)\\s]{1,8000}?)\\s*\\)";
3333
protected static String cssUrlTrimPatString =
3434
"^(?:\\\\?[\"'])+|(?:\\\\?[\"'])+$";
3535
protected static String cssImportNoUrlPatString =

src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,24 @@ public void testHandleStyleNodeNoHangupTruncated() throws Exception {
9393
checkExtract(test);
9494
}
9595

96+
/**
97+
* Test that the pattern matcher does not cause a stack overflow on an overlong
98+
* sequence of quote characters around a CSS link.
99+
*/
100+
public void testHandleStyleNodeNoStackOverflow() throws Exception {
101+
StringBuilder sb = new StringBuilder();
102+
sb.append("url(");
103+
for (int i = 0; i < 20000; i++)
104+
sb.append('\'');
105+
sb.append("foos.gif");
106+
for (int i = 0; i < 20000; i++)
107+
sb.append('\'');
108+
sb.append(");");
109+
String[] test = new String[1];
110+
test[0] = sb.toString();
111+
checkExtract(test);
112+
}
113+
96114
private void checkExtract(String[] data) throws JSONException {
97115
// System.err.format("CSS(%s) want[0](%s)\n",css,want[0]);
98116
String css = data[0];

0 commit comments

Comments
 (0)