Skip to content

Commit fc24be8

Browse files
committed
moving org.archive.net.PublicSuffixes to ia-web-commons
1 parent 329ff22 commit fc24be8

3 files changed

Lines changed: 7601 additions & 0 deletions

File tree

Lines changed: 363 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,363 @@
1+
/*
2+
* This file is part of the Heritrix web crawler (crawler.archive.org).
3+
*
4+
* Licensed to the Internet Archive (IA) by one or more individual
5+
* contributors.
6+
*
7+
* The IA licenses this file to You under the Apache License, Version 2.0
8+
* (the "License"); you may not use this file except in compliance with
9+
* the License. You may obtain a copy of the License at
10+
*
11+
* http://www.apache.org/licenses/LICENSE-2.0
12+
*
13+
* Unless required by applicable law or agreed to in writing, software
14+
* distributed under the License is distributed on an "AS IS" BASIS,
15+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
* See the License for the specific language governing permissions and
17+
* limitations under the License.
18+
*/
19+
20+
package org.archive.net;
21+
22+
import java.io.BufferedReader;
23+
import java.io.BufferedWriter;
24+
import java.io.FileInputStream;
25+
import java.io.FileWriter;
26+
import java.io.IOException;
27+
import java.io.InputStream;
28+
import java.io.InputStreamReader;
29+
import java.io.OutputStreamWriter;
30+
import java.io.PrintWriter;
31+
import java.io.UnsupportedEncodingException;
32+
import java.util.ArrayList;
33+
import java.util.List;
34+
import java.util.regex.Matcher;
35+
import java.util.regex.Pattern;
36+
37+
import org.apache.commons.io.IOUtils;
38+
import org.archive.util.TextUtils;
39+
40+
/**
41+
* Utility class for making use of the information about 'public suffixes' at
42+
* http://publicsuffix.org.
43+
*
44+
* The public suffix list (once known as 'effective TLDs') was motivated by the
45+
* need to decide on which broader domains a subdomain was allowed to set
46+
* cookies. For example, a server at 'www.example.com' can set cookies for
47+
* 'www.example.com' or 'example.com' but not 'com'. 'www.example.co.uk' can set
48+
* cookies for 'www.example.co.uk' or 'example.co.uk' but not 'co.uk' or 'uk'.
49+
* The number of rules for all top-level-domains and 2nd- or 3rd- level domains
50+
* has become quite long; essentially the broadest domain a subdomain may assign
51+
* to is the one that was sold/registered to a specific name registrant.
52+
*
53+
* This concept should be useful in other contexts, too. Grouping URIs (or
54+
* queues of URIs to crawl) together with others sharing the same registered
55+
* suffix may be useful for applying the same rules to all, such as assigning
56+
* them to the same queue or crawler in a multi-machine setup.
57+
*
58+
* As of Heritrix3, we prefer the term 'Assignment Level Domain' (ALD)
59+
* for such domains, by analogy to 'Top Level Domain' (TLD) or '2nd Level
60+
* Domain' (2LD), etc.
61+
*
62+
* @author Gojomo
63+
*
64+
* This version of PublicSuffixes uses a prefix-tree (trie) data structure to generate a less
65+
* redundant regular expression. It may be even possible to write a light-weight,
66+
* thread-safe matcher based on this class.
67+
* @author Kenji Nagahashi
68+
*/
69+
public class PublicSuffixes {
70+
protected static Pattern topmostAssignedSurtPrefixPattern;
71+
protected static String topmostAssignedSurtPrefixRegex;
72+
73+
/**
74+
* prefix tree node. each Node represents sequence of letters (prefix)
75+
* and alternative sequences following it (list of Node's). Nodes in
76+
* {@code branches} are sorted for skip list like lookup and for generating
77+
* effective regular expression (see {@link #compareTo(Node)} and {@link #compareTo(char).)
78+
*
79+
* as is intended for internal use only, there's no access methods. procedures for updating
80+
* prefix tree with new input are defined within this class ({@link #addBranch(CharSequence)}).
81+
*
82+
* terminal node could be represented in two different form: 1) Node with zero branches,
83+
* or 2) Node with zero-length {@code cs}. So, root node must be initialized with empty (not null)
84+
* {@code branches} unless empty string matches the overall pattern.
85+
* {@code cs} must not be null except for root node.
86+
*/
87+
public static class Node implements Comparable<Node> {
88+
protected CharSequence cs;
89+
protected List<Node> branches;
90+
public Node() {
91+
this("", null);
92+
}
93+
protected Node(CharSequence cs) {
94+
this(cs, null);
95+
}
96+
protected Node(CharSequence cs, List<Node> branches) {
97+
this.cs = cs;
98+
this.branches = branches;
99+
}
100+
public void addBranch(CharSequence s) {
101+
if (branches == null) {
102+
branches = new ArrayList<Node>();
103+
branches.add(new Node("", null));
104+
}
105+
for (int i = 0; i < branches.size(); i++) {
106+
Node alt = branches.get(i);
107+
if (alt.add(s)) return;
108+
if (alt.compareTo(s.charAt(0)) > 0) {
109+
Node alt1 = new Node(s, null);
110+
branches.add(i, alt1);
111+
return;
112+
}
113+
}
114+
Node alt2 = new Node(s, null);
115+
branches.add(alt2);
116+
}
117+
public boolean add(CharSequence s) {
118+
int l = Math.min(s.length(), cs.length());
119+
int i = 0;
120+
while (i < l && s.charAt(i) == cs.charAt(i))
121+
i++;
122+
// zero-length match holds only when both cs and s are empty.
123+
if (i == 0) return cs.length() == 0 && s.length() == 0;
124+
if (i < cs.length()) {
125+
CharSequence cs0 = cs.subSequence(0, i);
126+
CharSequence cs1 = cs.subSequence(i, cs.length());
127+
CharSequence cs2 = s.subSequence(i, s.length());
128+
cs = cs0;
129+
Node alt1 = new Node(cs1, branches);
130+
(branches = new ArrayList<Node>()).add(alt1);
131+
addBranch(cs2);
132+
} else {
133+
assert i == cs.length();
134+
addBranch(s.subSequence(i, s.length()));
135+
}
136+
return true;
137+
}
138+
public int compareTo(Node other) {
139+
if (other.cs == null || other.cs.length() == 0)
140+
return (cs == null || cs.length() == 0) ? 0 : -1;
141+
return compareTo(other.cs.charAt(0));
142+
}
143+
public int compareTo(char oc) {
144+
if (cs == null || cs.length() == 0) return 1;
145+
// '!' and '*' must come after ordinary letters, in this order, for regexp
146+
// to work as intended.
147+
char c = cs.charAt(0);
148+
if (c == oc) return 0;
149+
if (c == '!') return oc == '*' ? -1 : 1;
150+
if (c == '*') return 1;
151+
if (oc == '*' || oc == '!') return -1;
152+
return Character.valueOf(c).compareTo(oc);
153+
// for generating the same regexp as previous version.
154+
//return Character.valueOf(oc).compareTo(c);
155+
}
156+
}
157+
158+
/**
159+
* Utility method for dumping a regex String, based on a published public
160+
* suffix list, which matches any SURT-form hostname up through the broadest
161+
* 'private' (assigned/sold) domain-segment. That is, for any of the
162+
* SURT-form hostnames...
163+
*
164+
* com,example, com,example,www, com,example,california,www
165+
*
166+
* ...the regex will match 'com,example,'.
167+
*
168+
* @param args
169+
* @throws IOException
170+
*/
171+
public static void main(String args[]) throws IOException {
172+
InputStream is;
173+
if (args.length == 0 || "=".equals(args[0])) {
174+
// use bundled list
175+
is = PublicSuffixes.class.getClassLoader().getResourceAsStream(
176+
"effective_tld_names.dat");
177+
} else {
178+
is = new FileInputStream(args[0]);
179+
}
180+
BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
181+
String regex = getTopmostAssignedSurtPrefixRegex(reader);
182+
IOUtils.closeQuietly(is);
183+
184+
boolean needsClose = false;
185+
BufferedWriter writer;
186+
if (args.length >= 2) {
187+
// write to specified file
188+
writer = new BufferedWriter(new FileWriter(args[1]));
189+
needsClose = true;
190+
} else {
191+
// write to stdout
192+
writer = new BufferedWriter(new OutputStreamWriter(System.out));
193+
}
194+
writer.append(regex);
195+
writer.flush();
196+
if (needsClose) {
197+
writer.close();
198+
}
199+
}
200+
/**
201+
* Reads a file of the format promulgated by publicsuffix.org, ignoring
202+
* comments and '!' exceptions/notations, converting domain segments to
203+
* SURT-ordering. Leaves glob-style '*' wildcarding in place. Returns root
204+
* node of SURT-ordered prefix tree.
205+
*
206+
* @param reader
207+
* @return root of prefix tree node.
208+
* @throws IOException
209+
*/
210+
protected static Node readPublishedFileToSurtTrie(BufferedReader reader) throws IOException {
211+
// initializing with empty Alt list prevents empty pattern from being
212+
// created for the first addBranch()
213+
Node alt = new Node(null, new ArrayList<Node>());
214+
String line;
215+
while ((line = reader.readLine()) != null) {
216+
// discard whitespace, empty lines, comments, exceptions
217+
line = line.trim();
218+
if (line.length() == 0 || line.startsWith("//")) continue;
219+
// discard utf8 notation after entry
220+
line = line.split("\\s+")[0];
221+
// TODO: maybe we don't need to create lower-cased String
222+
line = line.toLowerCase();
223+
// SURT-order domain segments
224+
String[] segs = line.split("\\.");
225+
StringBuilder sb = new StringBuilder();
226+
for (int i = segs.length - 1; i >= 0; i--) {
227+
if (segs[i].length() == 0) continue;
228+
sb.append(segs[i]).append(',');
229+
}
230+
alt.addBranch(sb.toString());
231+
}
232+
return alt;
233+
}
234+
/**
235+
* utility function for dumping prefix tree structure. intended for debug use.
236+
* @param alt root of prefix tree.
237+
* @param lv indent level. 0 for root (no indent).
238+
* @param out writer to send output to.
239+
*/
240+
public static void dump(Node alt, int lv, PrintWriter out) {
241+
for (int i = 0; i < lv; i++)
242+
out.print(" ");
243+
out.println(alt.cs != null ? ('"'+alt.cs.toString()+'"') : "(null)");
244+
if (alt.branches != null) {
245+
for (Node br : alt.branches) {
246+
dump(br, lv + 1, out);
247+
}
248+
}
249+
}
250+
/**
251+
* bulids regular expression from prefix-tree {@code alt} into buffer {@code sb}.
252+
* @param alt prefix tree root.
253+
* @param sb StringBuffer to store regular expression.
254+
*/
255+
protected static void buildRegex(Node alt, StringBuilder sb) {
256+
String close = null;
257+
if (alt.cs != null) {
258+
// actually '!' always be the first character, because it is
259+
// always used along with '*'.
260+
for (int i = 0; i < alt.cs.length(); i++) {
261+
char c = alt.cs.charAt(i);
262+
if (c == '!') {
263+
if (close != null)
264+
throw new RuntimeException("more than one '!'");
265+
sb.append("(?=");
266+
close = ")";
267+
} else if (c == '*') {
268+
sb.append("[-\\w]+");
269+
} else {
270+
sb.append(c);
271+
}
272+
}
273+
}
274+
if (alt.branches != null) {
275+
// alt.branches.size() should always be > 1
276+
if (alt.branches.size() > 1) {
277+
sb.append("(?:");
278+
}
279+
String sep = "";
280+
for (Node alt1 : alt.branches) {
281+
sb.append(sep); sep = "|";
282+
buildRegex(alt1, sb);
283+
}
284+
if (alt.branches.size() > 1) {
285+
sb.append(")");
286+
}
287+
}
288+
if (close != null)
289+
sb.append(close);
290+
}
291+
292+
/**
293+
* Converts SURT-ordered list of public prefixes into a Java regex which
294+
* matches the public-portion "plus one" segment, giving the domain on which
295+
* cookies can be set or other policy grouping should occur. Also adds to
296+
* regex a fallback matcher that for any new/unknown TLDs assumes the
297+
* second-level domain is assignable. (Eg: 'zzz,example,').
298+
*
299+
* @param list
300+
* @return
301+
*/
302+
private static String surtPrefixRegexFromTrie(Node trie) {
303+
StringBuilder regex = new StringBuilder();
304+
regex.append("(?ix)^\n");
305+
trie.addBranch("*,"); // for new/unknown TLDs
306+
buildRegex(trie, regex);
307+
regex.append("\n([-\\w]+,)");
308+
return regex.toString();
309+
}
310+
311+
public static synchronized Pattern getTopmostAssignedSurtPrefixPattern() {
312+
if (topmostAssignedSurtPrefixPattern == null) {
313+
topmostAssignedSurtPrefixPattern = Pattern
314+
.compile(getTopmostAssignedSurtPrefixRegex());
315+
}
316+
return topmostAssignedSurtPrefixPattern;
317+
}
318+
319+
public static synchronized String getTopmostAssignedSurtPrefixRegex() {
320+
if (topmostAssignedSurtPrefixRegex == null) {
321+
// use bundled list
322+
try {
323+
BufferedReader reader = new BufferedReader(new InputStreamReader(
324+
PublicSuffixes.class.getClassLoader().getResourceAsStream(
325+
"effective_tld_names.dat"), "UTF-8"));
326+
topmostAssignedSurtPrefixRegex = getTopmostAssignedSurtPrefixRegex(reader);
327+
IOUtils.closeQuietly(reader);
328+
} catch (UnsupportedEncodingException ex) {
329+
// should never happen
330+
throw new RuntimeException(ex);
331+
}
332+
}
333+
return topmostAssignedSurtPrefixRegex;
334+
}
335+
336+
public static String getTopmostAssignedSurtPrefixRegex(BufferedReader reader) {
337+
try {
338+
Node trie = readPublishedFileToSurtTrie(reader);
339+
return surtPrefixRegexFromTrie(trie);
340+
} catch (IOException e) {
341+
throw new RuntimeException(e);
342+
}
343+
}
344+
345+
/**
346+
* Truncate SURT to its topmost assigned domain segment; that is,
347+
* the public suffix plus one segment, but as a SURT-ordered prefix.
348+
*
349+
* if the pattern doesn't match, the passed-in SURT is returned.
350+
*
351+
* @param surt SURT to truncate
352+
* @return truncated-to-topmost-assigned SURT prefix
353+
*/
354+
public static String reduceSurtToAssignmentLevel(String surt) {
355+
Matcher matcher = TextUtils.getMatcher(
356+
getTopmostAssignedSurtPrefixRegex(), surt);
357+
if (matcher.find()) {
358+
surt = matcher.group();
359+
}
360+
TextUtils.recycleMatcher(matcher);
361+
return surt;
362+
}
363+
}

0 commit comments

Comments
 (0)