From 61ee0008c2dd1a8d618e202abf61042edbbdd3b4 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 4 Feb 2026 13:18:43 +0100 Subject: [PATCH] UrlSamplerHost: make stripping of leading www. from host name configurable --- .../org/commoncrawl/tools/UrlSamplerHost.java | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/src/java/org/commoncrawl/tools/UrlSamplerHost.java b/src/java/org/commoncrawl/tools/UrlSamplerHost.java index 971f78a2a6..e296ffa90b 100644 --- a/src/java/org/commoncrawl/tools/UrlSamplerHost.java +++ b/src/java/org/commoncrawl/tools/UrlSamplerHost.java @@ -123,6 +123,8 @@ public void write(DataOutput out) throws IOException { public static class SampleMapper extends Mapper { + private boolean hostStripWWW = false; + private DomainScorePair outputKey = new DomainScorePair(); private TextCountPair outputValue = new TextCountPair(); @@ -132,6 +134,10 @@ public static class SampleMapper * But do not strip if the host name is &quot;www.tld&quot; (e.g., * www.com). * + * Stripping is required for per-host limit configurations before 2026, + * based on Common Crawl web graphs where the leading www. was + * stripped. + * * @param host * name * @return host name with leading www. 
stripped @@ -145,6 +151,12 @@ private static String hostStripWWW(String host) { return host; } + @Override + public void setup(Context context) { + Configuration conf = context.getConfiguration(); + hostStripWWW = conf.getBoolean("urlsample.host.strip.www", false); + } + @Override public void map(Text key, Text value, Context context) throws IOException, InterruptedException { @@ -162,7 +174,10 @@ public void map(Text key, Text value, Context context) String host; try { URL u = new URL(url); - host = hostStripWWW(u.getHost()); + host = u.getHost(); + if (hostStripWWW) { + host = hostStripWWW(host); + } } catch (Exception e) { LOG.warn("Malformed URL: '{}', skipping ({})", url, e.getMessage()); context.getCounter("UrlSampler", "MALFORMED_URL").increment(1); @@ -352,6 +367,20 @@ private void sample(Path[] inputs, Path output) throws Exception { public void usage() { System.err .println("Usage: UrlSamplerHost [-D...] ... \n"); + System.err.println( + "\nThe host_limits file defines the maximum number of URLs to sample per host."); + System.err.println("\nProperties:"); + System.err.println( + "\t-Durlsample.host.strip.www=(true|false)\tstrip leading www. from host names"); + System.err.println( + "\t\t\t(depending on whether the limits file uses stripped host names)"); + System.err.println("Properties to configure defaults, if host is not in the limits file:"); + System.err.println( + "\t-Durlsample.urls.per.host\tmax. number of URLs to sample per host"); + System.err.println( + "\t\t\t-1 : sample randomly with low probability (default)"); + System.err.println( + "\t-Durlsample.default.score\tdefault score for sampled URLs (default: 0.001)"); } @Override