Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,19 @@ public class ExtractingParseObserver implements ParseObserver {
boolean inTitle = false;

protected static String cssUrlPatString =
"url\\s*\\(\\s*([\\\\\"']*.+?[\\\\\"']*)\\s*\\)";
"url\\s*\\(\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\\)";
protected static String cssUrlTrimPatString =
"^(?:\\\\?[\"'])+|(?:\\\\?[\"'])+$";
protected static String cssImportNoUrlPatString =
"@import\\s+(('[^']+')|(\"[^\"]+\")|(\\('[^']+'\\))|(\\(\"[^\"]+\"\\))|(\\([^)]+\\))|([a-z0-9_.:/\\\\-]+))\\s*;";
"@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\\('[^']+'\\))|(?:\\(\"[^\"]+\"\\))|(?:\\([^)]+\\))|(?:[a-z0-9_.:/\\\\-]+))\\s*;";

protected static Pattern cssImportNoUrlPattern = Pattern
.compile(cssImportNoUrlPatString);

protected static Pattern cssUrlPattern = Pattern.compile(cssUrlPatString);

protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString);

private final static int MAX_TEXT_LEN = 100;

// private static String GLOBAL_ATTR[] = {"background"};
Expand Down Expand Up @@ -372,36 +377,16 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten
Matcher m = pattern.matcher(content);
int idx = 0;
int contentLen = content.length();
while((idx < contentLen) && m.find(idx)) {
if (contentLen > 100000)
// extract URLs only from the first 100 kB
contentLen = 100000;
while((idx < contentLen) && m.find()) {
idx = m.end();
String url = m.group(1);
int origUrlLength = url.length();
int urlStart = m.start(1);
int urlEnd = m.end(1);
idx = urlEnd;
if(url.length() < 2) {
continue;
}
if ((url.charAt(0) == '(')
&& (url.charAt(origUrlLength-1) == ')')) {
url = url.substring(1, origUrlLength - 1);
urlStart += 1;
origUrlLength -= 2;
}
if (url.charAt(0) == '"') {
url = url.substring(1, origUrlLength - 1);
urlStart += 1;
} else if (url.charAt(0) == '\'') {
url = url.substring(1, origUrlLength - 1);
urlStart += 1;
} else if (url.charAt(0) == '\\') {
if(url.length() == 2)
continue;
url = url.substring(2, origUrlLength - 2);
urlStart += 2;
url = cssUrlTrimPattern.matcher(url).replaceAll("");
if (!url.isEmpty()) {
data.addHref("path","STYLE/#text","href", url);
}
int urlLength = url.length();
data.addHref("path","STYLE/#text","href",url);
idx += urlLength;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ public void testHandleStyleNodeExceptions() throws Exception {
"url (' ')",
"url('\")",
"url(')",
"url('\"')"
"url('\"')",
"url('\\\"\"')",
"url(''''')"
};
boolean except = false;
HTMLMetaData md = new HTMLMetaData(new MetaData());
Expand All @@ -37,6 +39,7 @@ public void testHandleStyleNodeExceptions() throws Exception {
assertFalse(except);
}
}

public void testHandleStyleNode() throws Exception {
String[][] tests = {
{""},
Expand All @@ -45,31 +48,36 @@ public void testHandleStyleNode() throws Exception {
{"url(\"foo.gif\")","foo.gif"},
{"url(\\\"foo.gif\\\")","foo.gif"},
{"url(\\'foo.gif\\')","foo.gif"},

};
{"url(''foo.gif'')","foo.gif"},
{"url( foo.gif )","foo.gif"},
{"url('''')"},
{"url('foo.gif'')","foo.gif"},
};
for(String[] testa : tests) {
checkExtract(testa);
}
// boolean except = false;
// HTMLMetaData md = new HTMLMetaData(new MetaData());
// ExtractingParseObserver epo = new ExtractingParseObserver(md);
// for(String css : tests) {
// try {
// TextNode tn = new TextNode(css);
// epo.handleStyleNode(tn);
// } catch(Exception e) {
// System.err.format("And the winner is....(%s)\n", css);
// e.printStackTrace();
// except = true;
// throw e;
// }
// assertFalse(except);
// }
}

/**
 * Checks that an overlong CSS link, padded with so many quote characters
 * that it is truncated, yields no extracted link and does not cause the
 * pattern matcher to hang.
 */
public void testHandleStyleNodeNoHangupTruncated() throws Exception {
    final int leadingQuotes = 500000;
    final int trailingQuotes = 499000;
    final String prefix = "url(";
    final String target = "foo.gif";

    StringBuilder css = new StringBuilder(
            prefix.length() + leadingQuotes + target.length() + trailingQuotes);
    css.append(prefix);
    for (int remaining = leadingQuotes; remaining > 0; remaining--) {
        css.append('\'');
    }
    css.append(target);
    for (int remaining = trailingQuotes; remaining > 0; remaining--) {
        css.append('\'');
    }

    // Single-element array: only the CSS input, with no expected links after it.
    checkExtract(new String[] { css.toString() });
}

private void checkExtract(String[] data) throws JSONException {
// System.err.format("CSS(%s) want[0](%s)\n",css,want[0]);
String css = data[0];
boolean except = false;
HTMLMetaData md = new HTMLMetaData(new MetaData());
ExtractingParseObserver epo = new ExtractingParseObserver(md);
try {
Expand All @@ -87,10 +95,11 @@ private void checkExtract(String[] data) throws JSONException {

assertTrue(o instanceof JSONObject);
JSONObject jo = (JSONObject) o;
assertEquals(data[i],jo.getString("href"));
assertEquals("CSS link extraction failed for <" + css + ">",
data[i], jo.getString("href"));
}
} else {
assertNull(a);
assertNull("Expected no extracted link for <" + css + ">", a);
}
}

Expand Down