Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion src/java/org/commoncrawl/tools/UrlSamplerHost.java
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ public void write(DataOutput out) throws IOException {
public static class SampleMapper
extends Mapper<Text, Text, DomainScorePair, TextCountPair> {

private boolean hostStripWWW = false;

private DomainScorePair outputKey = new DomainScorePair();
private TextCountPair outputValue = new TextCountPair();

Expand All @@ -132,6 +134,10 @@ public static class SampleMapper
* But do not strip if the host name is &quot;www.tld&quot; (e.g.,
* <code>www.com</code>).
*
* Stripping is required for per-host limit configurations before 2026,
* based on Common Crawl web graphs where the leading <code>www.</code> was
* stripped.
*
* @param host
* name
* @return host name with leading www. stripped
Expand All @@ -145,6 +151,12 @@ private static String hostStripWWW(String host) {
return host;
}

/**
 * Initializes the mapper from the configuration of the given context.
 *
 * Reads the boolean property <code>urlsample.host.strip.www</code>
 * (<code>false</code> if not set) and stores it in the
 * <code>hostStripWWW</code> field.
 *
 * @param context
 *          context providing the configuration
 */
@Override
public void setup(Context context) {
  hostStripWWW = context.getConfiguration()
      .getBoolean("urlsample.host.strip.www", false);
}

@Override
public void map(Text key, Text value, Context context)
throws IOException, InterruptedException {
Expand All @@ -162,7 +174,10 @@ public void map(Text key, Text value, Context context)
String host;
try {
URL u = new URL(url);
host = hostStripWWW(u.getHost());
host = u.getHost();
if (hostStripWWW) {
host = hostStripWWW(host);
}
} catch (Exception e) {
LOG.warn("Malformed URL: '{}', skipping ({})", url, e.getMessage());
context.getCounter("UrlSampler", "MALFORMED_URL").increment(1);
Expand Down Expand Up @@ -352,6 +367,20 @@ private void sample(Path[] inputs, Path output) throws Exception {
public void usage() {
  // Help text: command-line synopsis, a short explanation of the limits
  // file, and the supported -D properties. Every entry is printed to
  // standard error with its own println call, in the order listed.
  String[] helpLines = {
      "Usage: UrlSamplerHost [-D...] <host_limits> <input_dir>... <output_dir>\n",
      "\nThe host_limits file defines the maximum number of URLs to sample per host.",
      "\nProperties:",
      "\t-Durlsample.host.strip.www=(true|false)\tstrip leading www. from host names",
      "\t\t\t(depending on whether the limits file uses stripped host names)",
      "Properties to configure defaults, if host is not in the limits file:",
      "\t-Durlsample.urls.per.host\tmax. number of URLs to sample per host",
      "\t\t\t-1 : sample randomly with low probability (default)",
      "\t-Durlsample.default.score\tdefault score for sampled URLs (default: 0.001)"
  };
  for (String helpLine : helpLines) {
    System.err.println(helpLine);
  }
}

@Override
Expand Down