@@ -22,13 +22,18 @@ public class ExtractingParseObserver implements ParseObserver {
2222
// NOTE(review): the regex literals in this excerpt contain spaces (e.g. "\\ s*") that look like
// extraction artifacts rather than real source text; verify against the repository before copying.

// Regex source for CSS url(...) references. Capture group 1 holds the text between the
// parentheses, including an optional (optionally backslash-escaped) quote on either side.
2323 protected static String cssUrlPatString =
2424 "url\\ s*\\ (\\ s*((?:\\ \\ ?[\" '])?.+?(?:\\ \\ ?[\" '])?)\\ s*\\ )" ;
// Regex source matching a run of (optionally backslash-escaped) quote characters anchored at the
// very start or at the very end of the input.
25+ protected static String cssUrlTrimPatString =
26+ "^(?:\\ \\ ?[\" '])+|(?:\\ \\ ?[\" '])+$" ;
// Regex source for "@import ... ;" statements whose target is single-quoted, double-quoted,
// parenthesized (quoted or unquoted) or a bare token. Capture group 1 holds the target
// INCLUDING any surrounding quotes or parentheses.
// NOTE(review): the removed/added literal lines below look identical in this rendering; the
// change is presumably whitespace-only - confirm.
2527 protected static String cssImportNoUrlPatString =
26- "@import\\ s+((?:'[^']+')|(?:\" [^\" ]+\" )|(?:\\ ('[^']+'\\ ))|(?:\\ (\" [^\" ]+\" \\ ))|(?:\\ ([^)]+\\ ))|(?:[a-z0-9_.:/\\ \\ -]+))\\ s*;" ;
28+ "@import\\ s+((?:'[^']+')|(?:\" [^\" ]+\" )|(?:\\ ('[^']+'\\ ))|(?:\\ (\" [^\" ]+\" \\ ))|(?:\\ ([^)]+\\ ))|(?:[a-z0-9_.:/\\ \\ -]+))\\ s*;" ;
2729
// Compiled form of cssImportNoUrlPatString.
2830 protected static Pattern cssImportNoUrlPattern = Pattern
2931 .compile (cssImportNoUrlPatString );
3032
// Compiled form of cssUrlPatString.
3133 protected static Pattern cssUrlPattern = Pattern .compile (cssUrlPatString );
34+
// Compiled form of cssUrlTrimPatString.
35+ protected static Pattern cssUrlTrimPattern = Pattern .compile (cssUrlTrimPatString );
36+
// Length limit of 100; its use is not visible in this excerpt.
3237 private final static int MAX_TEXT_LEN = 100 ;
3338
3439// private static String GLOBAL_ATTR[] = {"background"};
@@ -368,45 +373,20 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
368373 }
369374 }
370375 }
// Removed implementation: scans `content` with `pattern` and records capture group 1 of every
// match as an href on `data` (path "STYLE/#text"), after unwrapping parentheses and quotes.
371- private void patternCSSExtract (HTMLMetaData data , Pattern pattern , String content ) {
372- Matcher m = pattern .matcher (content );
373- int idx = 0 ;
374- int contentLen = content .length ();
375- if (contentLen > 100000 )
376- // extract URLs only from the first 100 kB
377- contentLen = 100000 ;
// The limit is compared with the END of the previous match, so a match that starts after or
// crosses the limit is still processed once before the loop stops.
378- FIND :
379- while ((idx < contentLen ) && m .find ()) {
380- idx = m .end ();
381- String url = m .group (1 );
// Captures shorter than two characters are skipped entirely.
382- if (url .length () < 2 ) {
383- continue ;
384- }
// Unwrap a single enclosing "( ... )" pair when group 1 itself contains the parentheses.
385- if ((url .charAt (0 ) == '(' )
386- && (url .charAt (url .length ()-1 ) == ')' )) {
387- url = url .substring (1 , url .length () - 1 );
388- }
// Repeatedly peel matching wrappers: a quote at both ends removes one character per side;
// a leading backslash removes two characters per side (the trailing end is not inspected).
389- CLIP :
390- while (url .length () > 1 ) {
391- if ((url .charAt (0 ) == '"' || url .charAt (0 ) == '\'' )
392- && (url .charAt (url .length () - 1 ) == '"'
393- || url .charAt (url .length () - 1 ) == '\'' )) {
394- if (url .length () <= 2 ) {
395- // empty URL
396- continue FIND ;
397- }
398- url = url .substring (1 , url .length () - 1 );
399- } else if (url .charAt (0 ) == '\\' ) {
400- if (url .length () <= 4 ) {
401- // empty URL
402- continue FIND ;
403- }
404- url = url .substring (2 , url .length () - 2 );
405- } else {
406- break CLIP ;
407- }
408- }
409- data .addHref ("path" ,"STYLE/#text" ,"href" ,url );
410- }
411- }
376+ private void patternCSSExtract (HTMLMetaData data , Pattern pattern , String content ) {
377+ Matcher m = pattern .matcher (content );
378+ int idx = 0 ;
379+ int contentLen = content .length ();
380+ if (contentLen > 100000 )
381+ // extract URLs only from the first 100 kB
382+ contentLen = 100000 ;
383+ while ((idx < contentLen ) && m .find ()) {
384+ idx = m .end ();
385+ String url = m .group (1 );
386+ url = cssUrlTrimPattern .matcher (url ).replaceAll ("" );
387+ if (!url .isEmpty ()) {
388+ data .addHref ("path" ,"STYLE/#text" ,"href" , url );
389+ }
390+ }
391+ }
412392}
0 commit comments