From 9f90db1a360d1c7ac907006477fb12ff961a1c10 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 8 Jan 2026 15:02:36 +0100 Subject: [PATCH 1/2] AdaptiveScoringFilter: Delay revisits of non-canonical pages - cf. #36 for lazy canonical link detection - delay revisits on pages with a canonical link pointing to a different URL - the delay is configurable per property scoring.adaptive.penalty.non_canonical as a penalty on the generator sort value - fix typos in documentation --- build.xml | 1 + conf/nutch-default.xml | 17 ++++-- .../apache/nutch/fetcher/FetcherThread.java | 2 +- src/plugin/build.xml | 2 + .../adaptive/AdaptiveScoringFilter.java | 60 ++++++++++++++++--- 5 files changed, 70 insertions(+), 12 deletions(-) diff --git a/build.xml b/build.xml index 5645288931..a4530c40f1 100644 --- a/build.xml +++ b/build.xml @@ -1257,6 +1257,7 @@ + diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index c9a06799ac..cdac434830 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -2216,12 +2216,21 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this + + scoring.adaptive.penalty.non_canonical + .07 + + Penalize non-canonical pages, i.e., pages with a canonical link not equal to the URL. + The default is to delay the revisit up to 7 days (7 * scoring.adaptive.factor.fetchtime). + + + scoring.adaptive.mark.orphan.after 518400 Time span (in minutes) after which a page not seen anymore by inlink or - seed is marked as orpaned. Default = 518400 minutes = one year. + seed is marked as orphaned. Default = 518400 minutes = one year. @@ -2230,7 +2239,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this 172800 Time span (in minutes) after which a gone page not seen anymore - by inlink or seed is marked as orpaned. Also duplicates and unfetched pages + by inlink or seed is marked as orphaned. Also duplicates and unfetched pages with a retry count >= 3 are considered as gone. 
Default = 172800 minutes = four month. @@ -2241,7 +2250,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this 518400 Time span (in minutes) after which a redirect not seen anymore by inlink - or seed is marked as orpaned. Default = 518400 minutes = one year. + or seed is marked as orphaned. Default = 518400 minutes = one year. @@ -2250,7 +2259,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this 518400 Time span (in minutes) after which a still unfetched page not seen anymore - by inlink or seed is marked as orpaned. Default = 518400 minutes = one year. + by inlink or seed is marked as orphaned. Default = 518400 minutes = one year. diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java index a918ec020c..75ae606cb4 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java @@ -84,7 +84,7 @@ public class FetcherThread extends Thread { private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); - private static Writable EMPTY_VALUE = NullWritable.get(); + private static final Writable EMPTY_VALUE = NullWritable.get(); private Configuration conf; private URLFilters urlFilters; diff --git a/src/plugin/build.xml b/src/plugin/build.xml index e3146bee0d..81d3ece682 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -143,6 +143,7 @@ + @@ -229,6 +230,7 @@ + diff --git a/src/plugin/scoring-adaptive/src/java/org/apache/nutch/scoring/adaptive/AdaptiveScoringFilter.java b/src/plugin/scoring-adaptive/src/java/org/apache/nutch/scoring/adaptive/AdaptiveScoringFilter.java index 6bdd319b71..52417dfd0b 100644 --- a/src/plugin/scoring-adaptive/src/java/org/apache/nutch/scoring/adaptive/AdaptiveScoringFilter.java +++ b/src/plugin/scoring-adaptive/src/java/org/apache/nutch/scoring/adaptive/AdaptiveScoringFilter.java @@ -34,10 +34,13 @@ import 
org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; import org.apache.hadoop.util.StringUtils; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Generator; +import org.apache.nutch.metadata.Nutch; import org.apache.nutch.scoring.AbstractScoringFilter; import org.apache.nutch.scoring.ScoringFilterException; @@ -50,6 +53,8 @@ *
  • the page score
  • *
  • the crawl status (fetched, not modified, redirect, gone)
  • *
  • the time elapsed since the scheduled fetch time
  • + *
  • whether or not a canonical link has been detected on the page and the + * link points to a different URL
  • * *

    * @@ -66,16 +71,14 @@ * *

    * The plugin is thought for large crawls where there are far more URLs than can - * be fetched and taking a good sample is mandatory. Sampling is, of course, - * usually based on the page score - relevant pages with a high score are - * fetched with higher probability. However, a dynamic rotation of generated + * be fetched and selecting a representative sample is mandatory. Sampling is, + * of course, usually based on the page score - relevant pages with a high score + * are fetched with higher probability. However, a dynamic rotation of generated * items helps to avoid that the same page with a slightly higher score is - * fetched again while others are still waiting to be queued. It also allows to - * adjust the probabilities that gone or not modified pages are refetched. + * fetched again while others are still waiting to be queued. The plugin also + * allows adjusting when gone or not-modified pages are revisited. *

    * - * [TODO:experimental] - * * The plugin also includes heuristics to "retire" pages to status * db_orphan if they fail to fetch or are duplicates and are not seen in seeds * or via inlinks (cf. the plugin scoring-orphan). @@ -156,6 +159,20 @@ public class AdaptiveScoringFilter extends AbstractScoringFilter { */ public static final String ADAPTIVE_INJECTED_BOOST = "scoring.adaptive.boost.injected"; + /** + * Penalty for pages with a canonical link different than the page URL. + * + * Revisits are delayed by subtracting this penalty from the generator sort + * value. + * + * Note: In order to avoid that pages without a canonical link are preferred, + * the penalty shouldn't be too high. The default is + * 7 * scoring.adaptive.factor.fetchtime, that is a revisit can + * be delayed by up to 7 days, in comparison to a page where the canonical + * link equals the page URL, or a page without a canonical link. + */ + public static final String ADAPTIVE_NON_CANONICAL_PENALTY = "scoring.adaptive.penalty.non_canonical"; + /* * Time span (in minutes) after which a page not seen anymore by inlink or * seed is marked as orphaned. 
@@ -200,6 +217,8 @@ public class AdaptiveScoringFilter extends AbstractScoringFilter { public static final String SUCCESSFUL_FETCH_TIME = "_sft_"; public static final Text WRITABLE_SUCCESSFUL_FETCH_TIME = new Text(SUCCESSFUL_FETCH_TIME); + private static final Writable EMPTY_VALUE = NullWritable.get(); + private Configuration conf; /** @@ -215,6 +234,7 @@ public class AdaptiveScoringFilter extends AbstractScoringFilter { private float adaptiveLastSeenTimeSort; private float adaptiveFetchRetryPenalty; private float adaptiveBoostInjected; + private float nonCanonicalPenalty; private Map statusSortMap = new TreeMap(); private Map contentTypeSortMap = new HashMap(); @@ -279,6 +299,10 @@ public void setConf(Configuration conf) { // is marked as orphaned when it's last seen time is not given orphanTimeLastSeenDefault = nowMinutes; } + + /* Penalize non-canonical pages: default is to delay revisits by 7 days */ + nonCanonicalPenalty = conf.getFloat(ADAPTIVE_NON_CANONICAL_PENALTY, + 7 * adaptiveFetchTimeSort); } private void readSortFile(Reader sortFileReader) throws IOException { @@ -390,6 +414,10 @@ public float generatorSortValue(Text url, CrawlDatum datum, float initSort) initSort -= adaptiveLastSeenTimeSort * daysSinceLastSeen; } } + if (pageIsNotCanonical(url, datum)) { + // penalize for not being the canonical page + initSort -= nonCanonicalPenalty; + } return initSort; } @@ -462,4 +490,22 @@ private static boolean pageIsRedirect(CrawlDatum datum) { return false; } + private static boolean pageIsNotCanonical(Text url, CrawlDatum datum) { + if (datum.getStatus() != CrawlDatum.STATUS_DB_FETCHED) { + // If not successfully fetched, there's no canonical link + return false; + } + Writable canonicalUrl = datum.getMetaData().get(Nutch.CANONICAL_LINK_KEY); + if (canonicalUrl != null && !canonicalUrl.equals(EMPTY_VALUE) + && !url.equals(canonicalUrl)) { + /* + * If there is a canonical link and it's different from the URL, it's not + * the canonical page. 
+ */ + return true; + } + // Otherwise, it's the canonical page or no canonical link was detected. + return false; + } + } From 75d9759707842842e839fb31080b711218b2c52a Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 9 Jan 2026 18:07:29 +0100 Subject: [PATCH 2/2] Generator2: fix NPE when generate.count.mode != domain --- src/java/org/apache/nutch/crawl/Generator2.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/java/org/apache/nutch/crawl/Generator2.java b/src/java/org/apache/nutch/crawl/Generator2.java index abe77d50d7..6de2adab81 100644 --- a/src/java/org/apache/nutch/crawl/Generator2.java +++ b/src/java/org/apache/nutch/crawl/Generator2.java @@ -838,7 +838,8 @@ public void reduce(DomainScorePair key, Iterable values, // log metrics per host/domain LOG.info( "{} :: selected={}, selected_hosts={}, max_urls_overflow={}, max_hosts_overflow={}, max_urls_per_host_overflow={}", - key.getDomain(), hostOrDomainCount, hosts.size(), maxUrlsOverflow, + key.getDomain(), hostOrDomainCount, + (hosts == null ? 0 : hosts.size()), maxUrlsOverflow, maxHostsOverflowCount, maxUrlsPerHostOverflowCount); }