diff --git a/src/main/java/org/archive/url/IAURLCanonicalizer.java b/src/main/java/org/archive/url/IAURLCanonicalizer.java index 029598f6..1664bc60 100644 --- a/src/main/java/org/archive/url/IAURLCanonicalizer.java +++ b/src/main/java/org/archive/url/IAURLCanonicalizer.java @@ -63,27 +63,27 @@ public void canonicalize(HandyURL url) { String query = url.getQuery(); if(query != null) { - if(query.equals("")) { - if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_EMPTY)) { - query = null; - } - } else { - // we have a query... what to do with it? - - // first remove uneeded: - if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_SESSION_ID)) { - query = URLRegexTransformer.stripQuerySessionID(query); - } - // lower-case: - if(rules.isSet(QUERY_SETTINGS, QUERY_LOWERCASE)) { - query = query.toLowerCase(); - } - // re-order? - if(rules.isSet(QUERY_SETTINGS, QUERY_ALPHA_REORDER)) { - query = alphaReorderQuery(query); - } - } - url.setQuery(query); + // we have a query... what to do with it? + + // first remove unneeded: + if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_SESSION_ID)) { + query = URLRegexTransformer.stripQuerySessionID(query); + } + // lower-case: + if(rules.isSet(QUERY_SETTINGS, QUERY_LOWERCASE)) { + query = query.toLowerCase(); + } + // re-order? 
+ if(rules.isSet(QUERY_SETTINGS, QUERY_ALPHA_REORDER)) { + query = alphaReorderQuery(query); + } + + if(query.equals("")) { + if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_EMPTY)) { + query = null; + } + } + url.setQuery(query); } } diff --git a/src/main/java/org/archive/url/URLRegexTransformer.java b/src/main/java/org/archive/url/URLRegexTransformer.java index c5505a74..617e0225 100644 --- a/src/main/java/org/archive/url/URLRegexTransformer.java +++ b/src/main/java/org/archive/url/URLRegexTransformer.java @@ -16,11 +16,11 @@ public class URLRegexTransformer { private static final OptimizedPattern QUERY_OPTS[] = { - new OptimizedPattern("(?i)^(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), - new OptimizedPattern("(?i)^(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), - new OptimizedPattern("(?i)^(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), - new OptimizedPattern("(?i)^(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), - new OptimizedPattern("(?i)^(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", "cftoken=", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", "cftoken=", 1, 2), }; diff --git a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java index 3263edc7..b3d7fe97 100644 --- a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java +++ b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java @@ -53,5 +53,16 @@ public void testGetDefaultPort() { assertEquals(80,IAURLCanonicalizer.getDefaultPort("http")); 
assertEquals(443,IAURLCanonicalizer.getDefaultPort("https")); } + + public void testStripSessionId() throws URISyntaxException { + IAURLCanonicalizer iaC = new IAURLCanonicalizer(new DefaultIACanonicalizerRules()); + compCan(iaC, + "http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008&jsessionid=f030eacc7e49c4ca0b077922347418418766", + "http://nsf.gov/statistics/sed/2009/sed_2009.zip?jsessionid=f030eacc7e49c4ca0b077922347418418766"); + compCan(iaC, + "http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008", + "http://nsf.gov/statistics/sed/2009/sed_2009.zip"); + + } }