From 40c7dca0ecce480ee1ce3cd32478775fe0a3c742 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 4 Dec 2025 13:14:59 +0100
Subject: [PATCH 1/5] Upgrade crawler-commons to 1.6 / 1.7-SNAPSHOT
---
ivy/ivy.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 485e94a53d..5955f5e630 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -101,7 +101,7 @@
crawler-commons, downgraded to commons-io 2.8.0 shipped by Hadoop 3.3.6
https://github.com/commoncrawl/crawler-commons/tree/commons-io-downgrade
-->
-
+
From 5dd2215a4f36fa55a85b748cf3934607b8b48dee Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Fri, 28 Nov 2025 21:03:32 +0100
Subject: [PATCH 2/5] Extract canonical links in Fetcher
- Add lazy extractor for canonical links
in HTTP header and HTML
- Stub call in Fetcher
---
ivy/ivy.xml | 1 +
.../apache/nutch/fetcher/FetcherThread.java | 12 ++
.../util/ByteArrayCharSequence.java | 91 +++++++++
.../util/CanonicalLinkDetector.java | 181 ++++++++++++++++++
.../util/TestCanonicalLinkDetector.java | 68 +++++++
5 files changed, 353 insertions(+)
create mode 100644 src/java/org/commoncrawl/util/ByteArrayCharSequence.java
create mode 100644 src/java/org/commoncrawl/util/CanonicalLinkDetector.java
create mode 100644 src/test/org/commoncrawl/util/TestCanonicalLinkDetector.java
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 5955f5e630..a13894110c 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -146,6 +146,7 @@
+
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index eaaf1b2944..7f18166779 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -67,6 +67,7 @@
import org.apache.nutch.service.NutchServer;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.URLUtil;
+import org.commoncrawl.util.CanonicalLinkDetector;
import org.commoncrawl.util.WarcCapture;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -751,6 +752,17 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content,
.calculate(content, new ParseStatus().getEmptyParse(conf));
datum.setSignature(signature);
}
+ boolean extractCanonicalLink = true; // TODO: make configurable
+ if (parseResult == null && !parsing && extractCanonicalLink) {
+ List canonicalLinks = CanonicalLinkDetector.detectCanonicalLinks(content);
+ if (!canonicalLinks.isEmpty()) {
+ LOG.debug("Found canonical links: {}", canonicalLinks);
+ // TODO
+ // - resolve, normalize and filter
+ // - add to metadata of datum
+ // datum.getMetaData().put("", canonicalLinks.get(0));
+ }
+ }
}
/*
diff --git a/src/java/org/commoncrawl/util/ByteArrayCharSequence.java b/src/java/org/commoncrawl/util/ByteArrayCharSequence.java
new file mode 100644
index 0000000000..9946933a30
--- /dev/null
+++ b/src/java/org/commoncrawl/util/ByteArrayCharSequence.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.commoncrawl.util;
+
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Wrap a byte array as a {@link CharSequence} in
+ * {@link StandardCharsets#ISO_8859_1} encoding.
+ *
+ * For regular expression matching on ASCII characters only, the wrapper should
+ * be faster than creating a {@link String} from the byte array or a
+ * subsequence, because no bytes are converted to chars and no memory is
+ * allocated for a new String.
+ *
+ * Similar wrappers are part of
+ * extJWNL,
+ * BUbiNG, and other Java
+ * libraries.
+ */
+public class ByteArrayCharSequence implements CharSequence {
+
+  /** Backing array, shared with the caller (never copied). */
+  private final byte[] data;
+  /** Number of bytes (= chars) exposed by this sequence. */
+  private final int length;
+  /** Index in {@link #data} of the first byte of this sequence. */
+  private final int offset;
+
+  /** Create an empty sequence. */
+  public ByteArrayCharSequence() {
+    this(new byte[0], 0, 0);
+  }
+
+  /**
+   * Wrap the entire array.
+   *
+   * @param data
+   *          bytes to wrap, the array is not copied
+   */
+  public ByteArrayCharSequence(final byte[] data) {
+    this(data, 0, data.length);
+  }
+
+  /**
+   * Wrap the first <code>length</code> bytes of the array.
+   *
+   * @param data
+   *          bytes to wrap, the array is not copied
+   * @param length
+   *          number of bytes to expose, starting at index 0
+   */
+  public ByteArrayCharSequence(final byte[] data, int length) {
+    this(data, 0, length);
+  }
+
+  /**
+   * Wrap <code>length</code> bytes of the array starting at
+   * <code>offset</code>.
+   *
+   * @param data
+   *          bytes to wrap, the array is not copied
+   * @param offset
+   *          index of the first byte to expose
+   * @param length
+   *          number of bytes to expose
+   * @throws ArrayIndexOutOfBoundsException
+   *           if the offset is negative or the range exceeds the array
+   * @throws IllegalArgumentException
+   *           if the length is negative
+   */
+  public ByteArrayCharSequence(final byte[] data, int offset, int length) {
+    if (offset < 0) {
+      throw new ArrayIndexOutOfBoundsException("Negative offset: " + offset);
+    }
+    if (length < 0) {
+      throw new IllegalArgumentException("Negative length:" + length);
+    }
+    // compare by subtraction: (offset + length) may overflow int and would
+    // then wrongly pass a check against the array length
+    if (length > data.length - offset) {
+      throw new ArrayIndexOutOfBoundsException(
+          "(Offset + length) > array_length");
+    }
+    this.data = data;
+    this.length = length;
+    this.offset = offset;
+  }
+
+  @Override
+  public int length() {
+    return this.length;
+  }
+
+  /**
+   * Return the byte at the given position as char.
+   *
+   * @throws IndexOutOfBoundsException
+   *           if the index is negative or not less than {@link #length()}
+   */
+  @Override
+  public char charAt(int index) {
+    // a negative index must be rejected explicitly: with a non-zero offset
+    // it would otherwise read a byte in front of the wrapped range
+    if (index < 0 || index >= length) {
+      throw new IndexOutOfBoundsException("" + index);
+    }
+    // mask the signed byte to get the ISO-8859-1 code point (0..255)
+    return (char) (data[offset + index] & 0xff);
+  }
+
+  /**
+   * Return a view on a part of this sequence, sharing the same backing array.
+   *
+   * @throws IndexOutOfBoundsException
+   *           if start or end are negative, if end is greater than
+   *           {@link #length()}, or if start is greater than end
+   */
+  @Override
+  public CharSequence subSequence(int start, int end) {
+    // validate against the bounds of this sequence, not only against the
+    // backing array, so that a subsequence never exposes bytes outside of
+    // the wrapped range
+    if (start < 0 || end > length || start > end) {
+      throw new IndexOutOfBoundsException(
+          "start=" + start + ", end=" + end + ", length=" + length);
+    }
+    return new ByteArrayCharSequence(data, offset + start, end - start);
+  }
+
+  @Override
+  public String toString() {
+    return new String(data, offset, length, StandardCharsets.ISO_8859_1);
+  }
+}
\ No newline at end of file
diff --git a/src/java/org/commoncrawl/util/CanonicalLinkDetector.java b/src/java/org/commoncrawl/util/CanonicalLinkDetector.java
new file mode 100644
index 0000000000..80b66cd438
--- /dev/null
+++ b/src/java/org/commoncrawl/util/CanonicalLinkDetector.java
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.commoncrawl.util;
+
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.http.HeaderElement;
+import org.apache.http.NameValuePair;
+import org.apache.http.message.BasicHeaderValueParser;
+import org.apache.nutch.protocol.Content;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class CanonicalLinkDetector {
+
+ protected static Set SUPPORTED_CONTENT_TYPES = new HashSet<>();
+ static {
+ SUPPORTED_CONTENT_TYPES.add("text/html");
+ SUPPORTED_CONTENT_TYPES.add("application/xhtml+xml");
+ }
+
+ /**
+ * Pattern to match canonical link elements in HTML. The length of the
+ * canonical link URL inside the element is limited to max. 2048 characters.
+ */
+ private static Pattern canonicalLinkPattern = Pattern.compile(
+ "]{0,2054}rel=(?:'canonical'|\"canonical\"|canonical\\b)[^>]{0,2054}>",
+ Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
+ private static Pattern hrefPattern = Pattern
+ .compile("href=['\"]?([^'\"\\s]{0,2048})", Pattern.CASE_INSENSITIVE);
+
+ private static Pattern canonicalRelValuePattern = Pattern
+ .compile("\\bcanonical\\b", Pattern.CASE_INSENSITIVE);
+ private static final Pattern linkInParentheses = Pattern
+ .compile("^\\s*<\\s*(.*?)\\s*>\\s*$");
+
+ private static final List EMPTY_RESULT = List.of();
+
+ /** top-N bytes of HTML to look for canonical link */
+ private static int CHUNK_SIZE = 65536;
+
+ /** max. number of canonical links to detect */
+ private static int MAX_LINKS = 1;
+
+ /**
+ * Extract canonical link from HTTP header.
+ *
+ * The extraction is delegated to {@link BasicHeaderValueParser} because
+ * parsing multi-valued link attributes is far from trivial, e.g.
+ *
+ *