From 9f90db1a360d1c7ac907006477fb12ff961a1c10 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 8 Jan 2026 15:02:36 +0100
Subject: [PATCH 1/2] AdaptiveScoringFilter: Delay revisits of non-canonical
pages - cf. #36 for lazy canonical link detection - delay revisits on pages
with a canonical link pointing to a different URL - the delay is
configurable per property scoring.adaptive.penalty.non_canonical as a
penalty on the generator sort value - fix typos in documentation
---
build.xml | 1 +
conf/nutch-default.xml | 17 ++++--
.../apache/nutch/fetcher/FetcherThread.java | 2 +-
src/plugin/build.xml | 2 +
.../adaptive/AdaptiveScoringFilter.java | 60 ++++++++++++++++---
5 files changed, 70 insertions(+), 12 deletions(-)
diff --git a/build.xml b/build.xml
index 5645288931..a4530c40f1 100644
--- a/build.xml
+++ b/build.xml
@@ -1257,6 +1257,7 @@
+
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index c9a06799ac..cdac434830 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -2216,12 +2216,21 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
+
+ scoring.adaptive.penalty.non_canonical
+ .07
+
+ Penalize non-canonical pages, i.e., pages with a canonical link not equal to the URL.
+ The default is to delay the revisit by up to 7 days (7 * scoring.adaptive.factor.fetchtime).
+
+
+
scoring.adaptive.mark.orphan.after518400
Time span (in minutes) after which a page not seen anymore by inlink or
- seed is marked as orpaned. Default = 518400 minutes = one year.
+ seed is marked as orphaned. Default = 518400 minutes = one year.
@@ -2230,7 +2239,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
172800
Time span (in minutes) after which a gone page not seen anymore
- by inlink or seed is marked as orpaned. Also duplicates and unfetched pages
+ by inlink or seed is marked as orphaned. Also duplicates and unfetched pages
with a retry count >= 3 are considered as gone.
Default = 172800 minutes = four month.
@@ -2241,7 +2250,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
518400
Time span (in minutes) after which a redirect not seen anymore by inlink
- or seed is marked as orpaned. Default = 518400 minutes = one year.
+ or seed is marked as orphaned. Default = 518400 minutes = one year.
@@ -2250,7 +2259,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
518400
Time span (in minutes) after which a still unfetched page not seen anymore
- by inlink or seed is marked as orpaned. Default = 518400 minutes = one year.
+ by inlink or seed is marked as orphaned. Default = 518400 minutes = one year.
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index a918ec020c..75ae606cb4 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -84,7 +84,7 @@ public class FetcherThread extends Thread {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
- private static Writable EMPTY_VALUE = NullWritable.get();
+ private static final Writable EMPTY_VALUE = NullWritable.get();
private Configuration conf;
private URLFilters urlFilters;
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index e3146bee0d..81d3ece682 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -143,6 +143,7 @@
+
@@ -229,6 +230,7 @@
+
diff --git a/src/plugin/scoring-adaptive/src/java/org/apache/nutch/scoring/adaptive/AdaptiveScoringFilter.java b/src/plugin/scoring-adaptive/src/java/org/apache/nutch/scoring/adaptive/AdaptiveScoringFilter.java
index 6bdd319b71..52417dfd0b 100644
--- a/src/plugin/scoring-adaptive/src/java/org/apache/nutch/scoring/adaptive/AdaptiveScoringFilter.java
+++ b/src/plugin/scoring-adaptive/src/java/org/apache/nutch/scoring/adaptive/AdaptiveScoringFilter.java
@@ -34,10 +34,13 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Generator;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.scoring.AbstractScoringFilter;
import org.apache.nutch.scoring.ScoringFilterException;
@@ -50,6 +53,8 @@
*
the page score
*
the crawl status (fetched, not modified, redirect, gone)
*
the time elapsed since the scheduled fetch time
+ *
whether or not a canonical link has been detected on the page and the
+ * link points to a different URL
*
*
*
@@ -66,16 +71,14 @@
*
*
* The plugin is thought for large crawls where there are far more URLs than can
- * be fetched and taking a good sample is mandatory. Sampling is, of course,
- * usually based on the page score - relevant pages with a high score are
- * fetched with higher probability. However, a dynamic rotation of generated
+ * be fetched and selecting a representative sample is mandatory. Sampling is,
+ * of course, usually based on the page score - relevant pages with a high score
+ * are fetched with higher probability. However, a dynamic rotation of generated
* items helps to avoid that the same page with a slightly higher score is
- * fetched again while others are still waiting to be queued. It also allows to
- * adjust the probabilities that gone or not modified pages are refetched.
+ * fetched again while others are still waiting to be queued. The plugin also
+ * allows adjusting when gone or not modified pages are revisited.
*
*
- * [TODO:experimental]
- *
* The plugin also includes heuristics to "retire" pages to status
* db_orphan if they fail to fetch or are duplicates and are not seen in seeds
* or via inlinks (cf. the plugin scoring-orphan).
@@ -156,6 +159,20 @@ public class AdaptiveScoringFilter extends AbstractScoringFilter {
*/
public static final String ADAPTIVE_INJECTED_BOOST = "scoring.adaptive.boost.injected";
+ /**
+ * Penalty for pages with a canonical link different than the page URL.
+ *
+ * Revisits are delayed by subtracting this penalty from the generator sort
+ * value.
+ *
+ * Note: The penalty shouldn't be too high, otherwise pages without a
+ * canonical link would be unduly preferred. The default is
+ * 7 * scoring.adaptive.factor.fetchtime, that is, a revisit can
+ * be delayed by up to 7 days in comparison to a page where the canonical
+ * link equals the page URL, or a page without a canonical link.
+ */
+ public static final String ADAPTIVE_NON_CANONICAL_PENALTY = "scoring.adaptive.penalty.non_canonical";
+
/*
* Time span (in minutes) after which a page not seen anymore by inlink or
* seed is marked as orphaned.
@@ -200,6 +217,8 @@ public class AdaptiveScoringFilter extends AbstractScoringFilter {
public static final String SUCCESSFUL_FETCH_TIME = "_sft_";
public static final Text WRITABLE_SUCCESSFUL_FETCH_TIME = new Text(SUCCESSFUL_FETCH_TIME);
+ private static final Writable EMPTY_VALUE = NullWritable.get();
+
private Configuration conf;
/**
@@ -215,6 +234,7 @@ public class AdaptiveScoringFilter extends AbstractScoringFilter {
private float adaptiveLastSeenTimeSort;
private float adaptiveFetchRetryPenalty;
private float adaptiveBoostInjected;
+ private float nonCanonicalPenalty;
private Map statusSortMap = new TreeMap();
private Map contentTypeSortMap = new HashMap();
@@ -279,6 +299,10 @@ public void setConf(Configuration conf) {
// is marked as orphaned when it's last seen time is not given
orphanTimeLastSeenDefault = nowMinutes;
}
+
+ /* Penalize non-canonical pages: default is to delay revisits by up to 7 days */
+ nonCanonicalPenalty = conf.getFloat(ADAPTIVE_NON_CANONICAL_PENALTY,
+ 7 * adaptiveFetchTimeSort);
}
private void readSortFile(Reader sortFileReader) throws IOException {
@@ -390,6 +414,10 @@ public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
initSort -= adaptiveLastSeenTimeSort * daysSinceLastSeen;
}
}
+ if (pageIsNotCanonical(url, datum)) {
+ // penalize for not being the canonical page
+ initSort -= nonCanonicalPenalty;
+ }
return initSort;
}
@@ -462,4 +490,22 @@ private static boolean pageIsRedirect(CrawlDatum datum) {
return false;
}
+ private static boolean pageIsNotCanonical(Text url, CrawlDatum datum) {
+ if (datum.getStatus() != CrawlDatum.STATUS_DB_FETCHED) {
+ // Any status other than db_fetched: never reported as non-canonical
+ return false;
+ }
+ Writable canonicalUrl = datum.getMetaData().get(Nutch.CANONICAL_LINK_KEY);
+ if (canonicalUrl != null && !canonicalUrl.equals(EMPTY_VALUE)
+ && !url.equals(canonicalUrl)) {
+ /*
+ * A canonical link is stored, it is not the empty placeholder value, and
+ * it does not equal() the page URL: this is not the canonical page.
+ */
+ return true;
+ }
+ // Otherwise: no link stored, only the empty placeholder, or link equals URL.
+ return false;
+ }
+
}
From 75d9759707842842e839fb31080b711218b2c52a Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Fri, 9 Jan 2026 18:07:29 +0100
Subject: [PATCH 2/2] Generator2: fix NPE when generate.count.mode != domain
---
src/java/org/apache/nutch/crawl/Generator2.java | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/src/java/org/apache/nutch/crawl/Generator2.java b/src/java/org/apache/nutch/crawl/Generator2.java
index abe77d50d7..6de2adab81 100644
--- a/src/java/org/apache/nutch/crawl/Generator2.java
+++ b/src/java/org/apache/nutch/crawl/Generator2.java
@@ -838,7 +838,8 @@ public void reduce(DomainScorePair key, Iterable values,
// log metrics per host/domain
LOG.info(
"{} :: selected={}, selected_hosts={}, max_urls_overflow={}, max_hosts_overflow={}, max_urls_per_host_overflow={}",
- key.getDomain(), hostOrDomainCount, hosts.size(), maxUrlsOverflow,
+ key.getDomain(), hostOrDomainCount,
+ (hosts == null ? 0 : hosts.size()), maxUrlsOverflow,
maxHostsOverflowCount, maxUrlsPerHostOverflowCount);
}