Skip to content

Commit 1485fdd

Browse files
author
Kristinn Sigurðsson
committed
Merge pull request iipc#54 from vonrosen/modify-strip-session-params
Make canonicalizer be able to strip session id params even if they ar…
2 parents cf5f600 + 5cfff50 commit 1485fdd

3 files changed

Lines changed: 32 additions & 23 deletions

File tree

src/main/java/org/archive/url/IAURLCanonicalizer.java

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -63,25 +63,24 @@ public void canonicalize(HandyURL url) {
6363

6464
String query = url.getQuery();
6565
if(query != null) {
66-
if(query.equals("")) {
67-
if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_EMPTY)) {
68-
query = null;
69-
}
70-
} else {
71-
// we have a query... what to do with it?
66+
// we have a query... what to do with it?
7267

73-
// first remove unneeded:
74-
if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_SESSION_ID)) {
75-
query = URLRegexTransformer.stripQuerySessionID(query);
76-
}
77-
// lower-case:
78-
if(rules.isSet(QUERY_SETTINGS, QUERY_LOWERCASE)) {
79-
query = query.toLowerCase();
80-
}
81-
// re-order?
82-
if(rules.isSet(QUERY_SETTINGS, QUERY_ALPHA_REORDER)) {
83-
query = alphaReorderQuery(query);
84-
}
68+
// first remove unneeded:
69+
if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_SESSION_ID)) {
70+
query = URLRegexTransformer.stripQuerySessionID(query);
71+
}
72+
// lower-case:
73+
if(rules.isSet(QUERY_SETTINGS, QUERY_LOWERCASE)) {
74+
query = query.toLowerCase();
75+
}
76+
// re-order?
77+
if(rules.isSet(QUERY_SETTINGS, QUERY_ALPHA_REORDER)) {
78+
query = alphaReorderQuery(query);
79+
}
80+
if(query.equals("")) {
81+
if(rules.isSet(QUERY_SETTINGS, QUERY_STRIP_EMPTY)) {
82+
query = null;
83+
}
8584
}
8685
url.setQuery(query);
8786
}

src/main/java/org/archive/url/URLRegexTransformer.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@ public class URLRegexTransformer {
1616

1717
private static final OptimizedPattern QUERY_OPTS[] = {
1818

19-
new OptimizedPattern("(?i)^(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
20-
new OptimizedPattern("(?i)^(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
21-
new OptimizedPattern("(?i)^(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
22-
new OptimizedPattern("(?i)^(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
23-
new OptimizedPattern("(?i)^(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", "cftoken=", 1, 2),
19+
new OptimizedPattern("(?i)^(.*)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
20+
new OptimizedPattern("(?i)^(.*)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
21+
new OptimizedPattern("(?i)^(.*)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
22+
new OptimizedPattern("(?i)^(.*)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
23+
new OptimizedPattern("(?i)^(.*)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", "cftoken=", 1, 2),
2424
};
2525

2626

src/test/java/org/archive/url/IAURLCanonicalizerTest.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,5 +53,15 @@ public void testGetDefaultPort() {
5353
assertEquals(80,IAURLCanonicalizer.getDefaultPort("http"));
5454
assertEquals(443,IAURLCanonicalizer.getDefaultPort("https"));
5555
}
56+
57+
public void testStripSessionId() throws URISyntaxException {
58+
IAURLCanonicalizer iaC = new IAURLCanonicalizer(new DefaultIACanonicalizerRules());
59+
compCan(iaC,
60+
"http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008&jsessionid=f030eacc7e49c4ca0b077922347418418766",
61+
"http://nsf.gov/statistics/sed/2009/sed_2009.zip?jsessionid=f030eacc7e49c4ca0b077922347418418766");
62+
compCan(iaC,
63+
"http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008",
64+
"http://nsf.gov/statistics/sed/2009/sed_2009.zip");
65+
}
5666

5767
}

0 commit comments

Comments
 (0)