Skip to content

Commit 5cfff50

Browse files
author
Hunter Stern
committed
Make the canonicalizer able to strip session ID params even when they are the first params in the query string. Add a session ID strip test. Also change IAURLCanonicalizer.java so that, if the query is empty after all query-string transformations have completed, no trailing ? is appended to the URL.
1 parent 7a7cf08 commit 5cfff50

3 files changed

Lines changed: 32 additions & 23 deletions

File tree

src/main/java/org/archive/url/IAURLCanonicalizer.java

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -63,25 +63,24 @@ public void canonicalize(HandyURL url) {
6363

6464
String query = url.getQuery();
6565
if(query != null) {
66-
if(query.equals("")) {
67-
if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_EMPTY)) {
68-
query = null;
69-
}
70-
} else {
71-
// we have a query... what to do with it?
66+
// we have a query... what to do with it?
7267

73-
// first remove uneeded:
74-
if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_SESSION_ID)) {
75-
query = URLRegexTransformer.stripQuerySessionID(query);
76-
}
77-
// lower-case:
78-
if(rules.isSet(QUERY_SETTINGS, QUERY_LOWERCASE)) {
79-
query = query.toLowerCase();
80-
}
81-
// re-order?
82-
if(rules.isSet(QUERY_SETTINGS, QUERY_ALPHA_REORDER)) {
83-
query = alphaReorderQuery(query);
84-
}
68+
// first remove unneeded:
69+
if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_SESSION_ID)) {
70+
query = URLRegexTransformer.stripQuerySessionID(query);
71+
}
72+
// lower-case:
73+
if(rules.isSet(QUERY_SETTINGS, QUERY_LOWERCASE)) {
74+
query = query.toLowerCase();
75+
}
76+
// re-order?
77+
if(rules.isSet(QUERY_SETTINGS, QUERY_ALPHA_REORDER)) {
78+
query = alphaReorderQuery(query);
79+
}
80+
if(query.equals("")) {
81+
if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_EMPTY)) {
82+
query = null;
83+
}
8584
}
8685
url.setQuery(query);
8786
}

src/main/java/org/archive/url/URLRegexTransformer.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@ public class URLRegexTransformer {
1616

1717
private static final OptimizedPattern QUERY_OPTS[] = {
1818

19-
new OptimizedPattern("(?i)^(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
20-
new OptimizedPattern("(?i)^(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
21-
new OptimizedPattern("(?i)^(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
22-
new OptimizedPattern("(?i)^(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
23-
new OptimizedPattern("(?i)^(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", "cftoken=", 1, 2),
19+
new OptimizedPattern("(?i)^(.*)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
20+
new OptimizedPattern("(?i)^(.*)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
21+
new OptimizedPattern("(?i)^(.*)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
22+
new OptimizedPattern("(?i)^(.*)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
23+
new OptimizedPattern("(?i)^(.*)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", "cftoken=", 1, 2),
2424
};
2525

2626

src/test/java/org/archive/url/IAURLCanonicalizerTest.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,5 +53,15 @@ public void testGetDefaultPort() {
5353
assertEquals(80,IAURLCanonicalizer.getDefaultPort("http"));
5454
assertEquals(443,IAURLCanonicalizer.getDefaultPort("https"));
5555
}
56+
57+
public void testStripSessionId() throws URISyntaxException {
58+
IAURLCanonicalizer iaC = new IAURLCanonicalizer(new DefaultIACanonicalizerRules());
59+
compCan(iaC,
60+
"http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008&jsessionid=f030eacc7e49c4ca0b077922347418418766",
61+
"http://nsf.gov/statistics/sed/2009/sed_2009.zip?jsessionid=f030eacc7e49c4ca0b077922347418418766");
62+
compCan(iaC,
63+
"http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008",
64+
"http://nsf.gov/statistics/sed/2009/sed_2009.zip");
65+
}
5666

5767
}

0 commit comments

Comments
 (0)