commoncrawl · sebastian-nagel · Dec 19, 2025 · Dec 4, 2025 · Nov 28, 2025 · Nov 29, 2025
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
@@ -1179,6 +1179,15 @@
   </description>
 </property>
 
+<property>
+  <name>fetcher.detect.canonical.link</name>
+  <value>false</value>
+  <description>If true, fetcher will detect canonical links in HTML content
+  relying on the class org.commoncrawl.util.CanonicalLinkDetector. Found
+  links are stored in CrawlDatum metadata as &quot;canonical.link&quot;.
+  </description>
+</property>
+
 <property>
   <name>fetcher.timelimit.mins</name>
   <value>-1</value>

diff --git a/ivy/ivy.xml b/ivy/ivy.xml
@@ -101,7 +101,7 @@
 		  crawler-commons, downgraded to commons-io 2.8.0 shipped by Hadoop 3.3.6
 		  https://github.com/commoncrawl/crawler-commons/tree/commons-io-downgrade
 		-->
-		<dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.6-SNAPSHOT" />
+		<dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.7-SNAPSHOT" />
 
 		<dependency org="com.google.code.gson" name="gson" rev="2.13.1"/>
 		<dependency org="com.martinkl.warc" name="warc-hadoop" rev="0.1.0">
@@ -146,6 +146,7 @@
         <!-- Required for JUnit 5 (Jupiter) test execution -->
         <dependency org="org.junit.jupiter" name="junit-jupiter-engine" rev="5.13.4" conf="test->default"/>
         <dependency org="org.junit.jupiter" name="junit-jupiter-api" rev="5.13.4" conf="test->default"/>
+        <dependency org="org.junit.jupiter" name="junit-jupiter-params" rev="5.13.4" conf="test->default"/>
 
 		<!-- Jetty used to serve test pages for unit tests, but is also provided as dependency of Hadoop -->
 		<dependency org="org.eclipse.jetty" name="jetty-server" rev="10.0.25" conf="test->default">

diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -31,7 +31,9 @@
 import java.util.concurrent.atomic.AtomicLong;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.NutchWritable;
@@ -67,6 +69,7 @@
 import org.apache.nutch.service.NutchServer;
 import org.apache.nutch.util.StringUtil;
 import org.apache.nutch.util.URLUtil;
+import org.commoncrawl.util.CanonicalLinkDetector;
 import org.commoncrawl.util.WarcCapture;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -81,6 +84,8 @@ public class FetcherThread extends Thread {
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
 
+  private static final Writable EMPTY_VALUE = NullWritable.get(); // shared constant
+
   private Configuration conf;
   private URLFilters urlFilters;
   private URLExemptionFilters urlExemptionFilters;
@@ -140,6 +145,7 @@ public class FetcherThread extends Thread {
   private boolean storingProtocolVersions;
 
   private boolean signatureWithoutParsing;
+  private boolean detectCanonicalLink;
 
   private AtomicInteger pages;
 
@@ -179,6 +185,8 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ
     this.parseUtil = new ParseUtil(conf);
     this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
     this.signatureWithoutParsing = conf.getBoolean("fetcher.signature", false);
+    this.detectCanonicalLink = conf.getBoolean("fetcher.detect.canonical.link",
+        false);
     this.protocolFactory = new ProtocolFactory(conf);
     this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
     this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
@@ -751,6 +759,14 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content,
               .calculate(content, new ParseStatus().getEmptyParse(conf));
           datum.setSignature(signature);
         }
+
+        if (detectCanonicalLink) {
+          /*
+           * TODO: if parsing, then canonical links should be detected on the
+           * DOM tree.
+           */
+          addCanonicalLink(key, datum, content);
+        }
       }
 
       /*
@@ -936,6 +952,52 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content,
     return null;
   }
 
+  /**
+   * Detect the canonical link of a fetched document and record it in the
+   * metadata of the CrawlDatum under the key {@link Nutch#CANONICAL_LINK_KEY}.
+   * Only the first detected link is used. It is resolved against the URL of
+   * the document unless it starts with "http://" or "https://", and it is
+   * normalized but not filtered.
+   *
+   * If no usable link is found, an empty placeholder value is stored
+   * instead, so that a CrawlDb update overwrites outdated canonical links.
+   *
+   * @param key
+   *          URL of the fetched document, base for resolving relative links
+   * @param datum
+   *          CrawlDatum whose metadata is updated
+   * @param content
+   *          fetched content passed to the canonical link detector
+   */
+  private void addCanonicalLink(Text key, CrawlDatum datum, Content content) {
+    List<String> canonicalLinks = CanonicalLinkDetector
+        .detectCanonicalLinks(content);
+    // placeholder, replaced below if a usable link is found
+    Writable value = EMPTY_VALUE;
+    if (!canonicalLinks.isEmpty() && !canonicalLinks.get(0).isEmpty()) {
+      LOG.debug("Found canonical links: {}", canonicalLinks);
+      String link = canonicalLinks.get(0);
+      try {
+        /*
+         * Test for the scheme including "://": relative links may also start
+         * with "http" (e.g. "http-status.html") and must be resolved.
+         */
+        if (!link.startsWith("http://") && !link.startsWith("https://")) {
+          link = URLUtil.resolveURL(new URL(key.toString()), link).toString();
+        }
+        // do not filter, we are just recording the canonical link
+        link = normalizers.normalize(link, URLNormalizers.SCOPE_FETCHER);
+        if (link != null) {
+          value = new Text(link);
+        }
+      } catch (MalformedURLException e) {
+        LOG.debug("Invalid canonical link {} found in {}: {}", link, key,
+            e.getMessage());
+      }
+    }
+    datum.getMetaData().put(Nutch.CANONICAL_LINK_KEY, value);
+  }
+
   private void outputRobotsTxt(List<Content> robotsTxtContent) throws InterruptedException {
     for (Content robotsTxt : robotsTxtContent) {
       LOG.debug("Fetched and stored robots.txt {}",

diff --git a/src/java/org/apache/nutch/metadata/Nutch.java b/src/java/org/apache/nutch/metadata/Nutch.java
@@ -17,6 +17,7 @@
 package org.apache.nutch.metadata;
 
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
 
 /**
  * A collection of Nutch internal metadata constants.
@@ -114,4 +115,6 @@ public interface Nutch {
 	public static final String FETCH_EVENT_FETCHTIME = "fetchTime";
 	/** Content-lanueage key in the Pub/Sub event metadata for the content-language of the parsed page*/
 	public static final String FETCH_EVENT_CONTENTLANG = "content-language";
+
+  public static final Writable CANONICAL_LINK_KEY = new Text("canonical.link");
 }
diff --git a/src/java/org/commoncrawl/util/ByteArrayCharSequence.java b/src/java/org/commoncrawl/util/ByteArrayCharSequence.java
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.commoncrawl.util;
+
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Wrap a byte array as a {@link CharSequence} in
+ * {@link StandardCharsets#ISO_8859_1} encoding.
+ *
+ * For regular expression matching on ASCII characters only, the wrapper should
+ * be faster than creating a {@link String} from the byte array or a
+ * subsequence, because no bytes are converted to chars and no memory is
+ * allocated for a new String.
+ *
+ * The array is not copied: the sequence is a view on the bytes in the range
+ * [offset, offset + length) and later modifications of the array are visible
+ * through the view. Every byte is mapped to the char with the same unsigned
+ * value.
+ *
+ * Similar wrappers are part of
+ * <a href="https://extjwnl.sourceforge.net/">extJWNL</a>,
+ * <a href="https://github.com/LAW-Unimi/BUbiNG">BUbiNG</a>, and other Java
+ * libraries.
+ */
+public class ByteArrayCharSequence implements CharSequence {
+
+  /** Backing array, shared with the caller (not copied). */
+  private final byte[] data;
+  /** Number of bytes (and chars) in this sequence. */
+  private final int length;
+  /** Index in {@link #data} of the first byte of this sequence. */
+  private final int offset;
+
+  /** Create an empty sequence. */
+  public ByteArrayCharSequence() {
+    this(new byte[0], 0, 0);
+  }
+
+  /**
+   * Wrap the entire array.
+   *
+   * @param data
+   *          bytes to wrap
+   */
+  public ByteArrayCharSequence(final byte[] data) {
+    this(data, 0, data.length);
+  }
+
+  /**
+   * Wrap the first <code>length</code> bytes of the array.
+   *
+   * @param data
+   *          bytes to wrap
+   * @param length
+   *          number of bytes, counted from the start of the array
+   */
+  public ByteArrayCharSequence(final byte[] data, int length) {
+    this(data, 0, length);
+  }
+
+  /**
+   * Wrap <code>length</code> bytes of the array, starting at
+   * <code>offset</code>.
+   *
+   * @param data
+   *          bytes to wrap
+   * @param offset
+   *          index of the first byte
+   * @param length
+   *          number of bytes
+   * @throws ArrayIndexOutOfBoundsException
+   *           if the offset is negative or the range exceeds the array
+   * @throws IllegalArgumentException
+   *           if the length is negative
+   */
+  public ByteArrayCharSequence(final byte[] data, int offset, int length) {
+    this.data = data;
+    if (offset < 0) {
+      throw new ArrayIndexOutOfBoundsException("Negative offset: " + offset);
+    }
+    if (length < 0) {
+      throw new IllegalArgumentException("Negative length:" + length);
+    }
+    // compare without adding offset and length: the sum may overflow
+    if (length > (data.length - offset)) {
+      throw new ArrayIndexOutOfBoundsException(
+          "(Offset + length) > array_length");
+    }
+    this.length = length;
+    this.offset = offset;
+  }
+
+  @Override
+  public int length() {
+    return this.length;
+  }
+
+  /**
+   * {@inheritDoc}
+   *
+   * @throws IndexOutOfBoundsException
+   *           if the index is negative or not less than {@link #length()}
+   */
+  @Override
+  public char charAt(int index) {
+    // a negative index must not reach bytes in front of the offset
+    if (index < 0 || index >= length) {
+      throw new IndexOutOfBoundsException("" + index);
+    }
+    // mask to map the signed byte to a char in the range 0-255
+    return (char) (data[offset + index] & 0xff);
+  }
+
+  /**
+   * Return a view on a part of this sequence. The view shares the byte array,
+   * no bytes are copied.
+   *
+   * @throws IndexOutOfBoundsException
+   *           if start or end are negative, if end is greater than
+   *           {@link #length()}, or if start is greater than end
+   */
+  @Override
+  public CharSequence subSequence(int start, int end) {
+    // the bounds are relative to this sequence, not to the backing array
+    if (start < 0 || end > length || start > end) {
+      throw new IndexOutOfBoundsException(
+          "start: " + start + ", end: " + end + ", length: " + length);
+    }
+    return new ByteArrayCharSequence(data, offset + start, end - start);
+  }
+
+  /** Decode the wrapped bytes as {@link StandardCharsets#ISO_8859_1}. */
+  @Override
+  public String toString() {
+    return new String(data, offset, length, StandardCharsets.ISO_8859_1);
+  }
+}