commoncrawl · sebastian-nagel · Jan 10, 2026 · Jan 8, 2026 · Jan 9, 2026
diff --git a/build.xml b/build.xml
@@ -1257,6 +1257,7 @@
         <!--source path="${plugins.dir}/protocol-selenium/src/java"/-->
         <!--source path="${plugins.dir}/publish-rabbitmq/src/java"/-->
         <source path="${plugins.dir}/scoring-adaptive/src/java"/>
+        <source path="${plugins.dir}/scoring-adaptive/src/test"/>
         <source path="${plugins.dir}/scoring-depth/src/java/" />
         <source path="${plugins.dir}/scoring-link/src/java/" />
         <source path="${plugins.dir}/scoring-opic/src/java/" />

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
@@ -2216,12 +2216,21 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
   </description>
 </property>
 
+<property>
+  <name>scoring.adaptive.penalty.non_canonical</name>
+  <value>.07</value>
+  <description>
+    Penalize non-canonical pages, i.e., pages with a canonical link not equal to the URL.
+    The default is to delay the revisit up to 7 days (7 * scoring.adaptive.factor.fetchtime).
+  </description>
+</property>
+
 <property>
   <name>scoring.adaptive.mark.orphan.after</name>
    <value>518400</value>
    <description>
      Time span (in minutes) after which a page not seen anymore by inlink or
-     seed is marked as orpaned. Default = 518400 minutes = one year.
+     seed is marked as orphaned. Default = 518400 minutes = one year.
    </description>
 </property>
 
@@ -2230,7 +2239,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
    <value>172800</value>
    <description>
      Time span (in minutes) after which a gone page not seen anymore
-     by inlink or seed is marked as orpaned. Also duplicates and unfetched pages
+     by inlink or seed is marked as orphaned. Also duplicates and unfetched pages
      with a retry count >= 3 are considered as gone.
      Default = 172800 minutes = four month.
    </description>
@@ -2241,7 +2250,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
    <value>518400</value>
    <description>
      Time span (in minutes) after which a redirect not seen anymore by inlink
-     or seed is marked as orpaned. Default = 518400 minutes = one year.
+     or seed is marked as orphaned. Default = 518400 minutes = one year.
    </description>
 </property>
 
@@ -2250,7 +2259,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
    <value>518400</value>
    <description>
      Time span (in minutes) after which a still unfetched page not seen anymore
-     by inlink or seed is marked as orpaned. Default = 518400 minutes = one year.
+     by inlink or seed is marked as orphaned. Default = 518400 minutes = one year.
    </description>
 </property>
 

diff --git a/src/java/org/apache/nutch/crawl/Generator2.java b/src/java/org/apache/nutch/crawl/Generator2.java
@@ -838,7 +838,8 @@ public void reduce(DomainScorePair key, Iterable<SelectorEntry> values,
       // log metrics per host/domain
       LOG.info(
           "{} :: selected={}, selected_hosts={}, max_urls_overflow={}, max_hosts_overflow={}, max_urls_per_host_overflow={}",
-          key.getDomain(), hostOrDomainCount, hosts.size(), maxUrlsOverflow,
+          key.getDomain(), hostOrDomainCount,
+          (hosts == null ? 0 : hosts.size()), maxUrlsOverflow,
           maxHostsOverflowCount, maxUrlsPerHostOverflowCount);
     }
 

diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -84,7 +84,7 @@ public class FetcherThread extends Thread {
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
 
-  private static Writable EMPTY_VALUE = NullWritable.get();
+  private static final Writable EMPTY_VALUE = NullWritable.get();
 
   private Configuration conf;
   private URLFilters urlFilters;

diff --git a/src/plugin/build.xml b/src/plugin/build.xml
@@ -143,6 +143,7 @@
      <ant dir="protocol-http" target="test"/>
      <ant dir="protocol-httpclient" target="test"/>
      <ant dir="protocol-okhttp" target="test"/>
+     <ant dir="scoring-adaptive" target="test"/>
      <ant dir="scoring-orphan" target="test"/>
      <ant dir="scoring-metadata" target="test"/>
      <ant dir="subcollection" target="test"/>
@@ -229,6 +230,7 @@
     <ant dir="protocol-okhttp" target="clean"/>
 <!--<ant dir="protocol-selenium" target="clean" />-->
 <!--<ant dir="publish-rabbitmq" target="clean"/>-->
+    <ant dir="scoring-adaptive" target="clean"/>
     <ant dir="scoring-depth" target="clean"/>
     <ant dir="scoring-link" target="clean"/>
     <ant dir="scoring-opic" target="clean"/>

diff --git a/...in/scoring-adaptive/src/java/org/apache/nutch/scoring/adaptive/AdaptiveScoringFilter.java b/...in/scoring-adaptive/src/java/org/apache/nutch/scoring/adaptive/AdaptiveScoringFilter.java
@@ -34,10 +34,13 @@
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Generator;
+import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.scoring.AbstractScoringFilter;
 import org.apache.nutch.scoring.ScoringFilterException;
 
@@ -50,6 +53,8 @@
  * <li>the page score</li>
  * <li>the crawl status (fetched, not modified, redirect, gone)</li>
  * <li>the time elapsed since the scheduled fetch time</li>
+ * <li>whether or not a canonical link has been detected on the page and the
+ * link points to a different URL</li>
  * </ul>
  * </p>
  * 
@@ -66,16 +71,14 @@
  * 
  * <p>
  * The plugin is thought for large crawls where there are far more URLs than can
- * be fetched and taking a good sample is mandatory. Sampling is, of course,
- * usually based on the page score - relevant pages with a high score are
- * fetched with higher probability. However, a dynamic rotation of generated
+ * be fetched and selecting a representative sample is mandatory. Sampling is,
+ * of course, usually based on the page score - relevant pages with a high score
+ * are fetched with higher probability. However, a dynamic rotation of generated
  * items helps to avoid that the same page with a slightly higher score is
- * fetched again while others are still waiting to be queued. It also allows to
- * adjust the probabilities that gone or not modified pages are refetched.
+ * fetched again while others are still waiting to be queued. The plugin also
+ * allows adjusting when gone or not modified pages are revisited.
  * </p>
  * 
- * [TODO:experimental]
- * 
  * The plugin also includes heuristics to &quot;retire&quot; pages to status
  * db_orphan if they fail to fetch or are duplicates and are not seen in seeds
  * or via inlinks (cf. the plugin scoring-orphan).
@@ -156,6 +159,20 @@ public class AdaptiveScoringFilter extends AbstractScoringFilter {
    */
   public static final String ADAPTIVE_INJECTED_BOOST = "scoring.adaptive.boost.injected";
 
+  /**
+   * Penalty for pages with a canonical link different than the page URL.
+   * 
+   * Revisits are delayed by subtracting this penalty from the generator sort
+   * value.
+   * 
+   * Note: The penalty shouldn't be too high, otherwise pages without a
+   * canonical link would be unduly preferred. The default is
+   * <code>7 * scoring.adaptive.factor.fetchtime</code>, that is a revisit can
+   * be delayed by up to 7 days, in comparison to a page where the canonical
+   * link equals the page URL, or a page without a canonical link.
+   */
+  public static final String ADAPTIVE_NON_CANONICAL_PENALTY = "scoring.adaptive.penalty.non_canonical";
+
   /*
    * Time span (in minutes) after which a page not seen anymore by inlink or
    * seed is marked as orphaned.
@@ -200,6 +217,8 @@ public class AdaptiveScoringFilter extends AbstractScoringFilter {
   public static final String SUCCESSFUL_FETCH_TIME = "_sft_";
   public static final Text WRITABLE_SUCCESSFUL_FETCH_TIME = new Text(SUCCESSFUL_FETCH_TIME);
 
+  private static final Writable EMPTY_VALUE = NullWritable.get();
+
   private Configuration conf;
 
   /**
@@ -215,6 +234,7 @@ public class AdaptiveScoringFilter extends AbstractScoringFilter {
   private float adaptiveLastSeenTimeSort;
   private float adaptiveFetchRetryPenalty;
   private float adaptiveBoostInjected;
+  private float nonCanonicalPenalty;
 
   private Map<Byte, Float> statusSortMap = new TreeMap<Byte, Float>();
   private Map<String, Float> contentTypeSortMap = new HashMap<String, Float>();
@@ -279,6 +299,10 @@ public void setConf(Configuration conf) {
       // is marked as orphaned when it's last seen time is not given
       orphanTimeLastSeenDefault = nowMinutes;
     }
+
+    /* Penalize non-canonical pages: default is to delay revisits by 7 days */
+    nonCanonicalPenalty = conf.getFloat(ADAPTIVE_NON_CANONICAL_PENALTY,
+        7 * adaptiveFetchTimeSort);
   }
 
   private void readSortFile(Reader sortFileReader) throws IOException {
@@ -390,6 +414,10 @@ public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
         initSort -= adaptiveLastSeenTimeSort * daysSinceLastSeen;
       }
     }
+    if (pageIsNotCanonical(url, datum)) {
+      // penalize for not being the canonical page
+      initSort -= nonCanonicalPenalty;
+    }
     return initSort;
   }
 
@@ -462,4 +490,22 @@ private static boolean pageIsRedirect(CrawlDatum datum) {
     return false;
   }
 
+  private static boolean pageIsNotCanonical(Text url, CrawlDatum datum) {
+    if (datum.getStatus() != CrawlDatum.STATUS_DB_FETCHED) {
+      // Not status db_fetched: treated as having no canonical link
+      return false;
+    }
+    Writable canonicalUrl = datum.getMetaData().get(Nutch.CANONICAL_LINK_KEY);
+    if (canonicalUrl != null && !canonicalUrl.equals(EMPTY_VALUE)
+        && !url.equals(canonicalUrl)) {
+      /*
+       * Canonical link present, not the empty placeholder, differs from URL.
+       * NOTE(review): url.equals() presumably needs a Text value - confirm.
+       */
+      return true;
+    }
+    // Otherwise, it's the canonical page or no canonical link was recorded.
+    return false;
+  }
+
 }