From 0cc72a1c3d1db464d27a3fe8d89e4138e28be171 Mon Sep 17 00:00:00 2001 From: Hunter Stern Date: Mon, 21 Mar 2016 14:27:07 -0700 Subject: [PATCH 1/2] Let the canonicalizer strip session ID params even when they are the first params in the query string, and add a session ID stripping test. --- .../org/archive/url/IAURLCanonicalizer.java | 42 +++++++++---------- .../org/archive/url/URLRegexTransformer.java | 10 ++--- .../archive/url/IAURLCanonicalizerTest.java | 12 ++++++ 3 files changed, 38 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/archive/url/IAURLCanonicalizer.java b/src/main/java/org/archive/url/IAURLCanonicalizer.java index 029598f6..1664bc60 100644 --- a/src/main/java/org/archive/url/IAURLCanonicalizer.java +++ b/src/main/java/org/archive/url/IAURLCanonicalizer.java @@ -63,27 +63,27 @@ public void canonicalize(HandyURL url) { String query = url.getQuery(); if(query != null) { - if(query.equals("")) { - if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_EMPTY)) { - query = null; - } - } else { - // we have a query... what to do with it? - - // first remove uneeded: - if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_SESSION_ID)) { - query = URLRegexTransformer.stripQuerySessionID(query); - } - // lower-case: - if(rules.isSet(QUERY_SETTINGS, QUERY_LOWERCASE)) { - query = query.toLowerCase(); - } - // re-order? - if(rules.isSet(QUERY_SETTINGS, QUERY_ALPHA_REORDER)) { - query = alphaReorderQuery(query); - } - } - url.setQuery(query); + // we have a query... what to do with it? + + // first remove uneeded: + if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_SESSION_ID)) { + query = URLRegexTransformer.stripQuerySessionID(query); + } + // lower-case: + if(rules.isSet(QUERY_SETTINGS, QUERY_LOWERCASE)) { + query = query.toLowerCase(); + } + // re-order? 
+ if(rules.isSet(QUERY_SETTINGS, QUERY_ALPHA_REORDER)) { + query = alphaReorderQuery(query); + } + + if(query.equals("")) { + if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_EMPTY)) { + query = null; + } + } + url.setQuery(query); } } diff --git a/src/main/java/org/archive/url/URLRegexTransformer.java b/src/main/java/org/archive/url/URLRegexTransformer.java index c5505a74..617e0225 100644 --- a/src/main/java/org/archive/url/URLRegexTransformer.java +++ b/src/main/java/org/archive/url/URLRegexTransformer.java @@ -16,11 +16,11 @@ public class URLRegexTransformer { private static final OptimizedPattern QUERY_OPTS[] = { - new OptimizedPattern("(?i)^(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), - new OptimizedPattern("(?i)^(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), - new OptimizedPattern("(?i)^(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), - new OptimizedPattern("(?i)^(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), - new OptimizedPattern("(?i)^(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", "cftoken=", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), + new OptimizedPattern("(?i)^(.*)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", "cftoken=", 1, 2), }; diff --git a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java index 3263edc7..fade9750 100644 --- a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java +++ b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java @@ -53,5 +53,17 @@ public void testGetDefaultPort() { assertEquals(80,IAURLCanonicalizer.getDefaultPort("http")); 
assertEquals(443,IAURLCanonicalizer.getDefaultPort("https")); } + + public void testStripSessionId() throws URISyntaxException { + IAURLCanonicalizer iaC = new IAURLCanonicalizer(new DefaultIACanonicalizerRules()); + compCan(iaC, + "http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008&jsessionid=f030eacc7e49c4ca0b077922347418418766", + "http://nsf.gov/statistics/sed/2009/sed_2009.zip?jsessionid=f030eacc7e49c4ca0b077922347418418766"); + + compCan(iaC, + "http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008", + "http://nsf.gov/statistics/sed/2009/sed_2009.zip"); + + } } From c7fa50dff57329b7df99007be79ed02f4d8223cd Mon Sep 17 00:00:00 2001 From: Hunter Stern Date: Mon, 21 Mar 2016 15:11:21 -0700 Subject: [PATCH 2/2] Format better --- .../org/archive/url/IAURLCanonicalizerTest.java | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java index fade9750..b3d7fe97 100644 --- a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java +++ b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java @@ -56,13 +56,12 @@ public void testGetDefaultPort() { public void testStripSessionId() throws URISyntaxException { IAURLCanonicalizer iaC = new IAURLCanonicalizer(new DefaultIACanonicalizerRules()); - compCan(iaC, - "http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008&jsessionid=f030eacc7e49c4ca0b077922347418418766", - "http://nsf.gov/statistics/sed/2009/sed_2009.zip?jsessionid=f030eacc7e49c4ca0b077922347418418766"); - - compCan(iaC, - "http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008", - "http://nsf.gov/statistics/sed/2009/sed_2009.zip"); + compCan(iaC, + "http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008&jsessionid=f030eacc7e49c4ca0b077922347418418766", + 
"http://nsf.gov/statistics/sed/2009/sed_2009.zip?jsessionid=f030eacc7e49c4ca0b077922347418418766"); + compCan(iaC, + "http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008", + "http://nsf.gov/statistics/sed/2009/sed_2009.zip"); }