diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index db264d6ef7..c9a06799ac 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1179,6 +1179,15 @@
+
+ fetcher.detect.canonical.link
+ false
+ If true, fetcher will detect canonical links in HTML content
+ relying on the class org.commoncrawl.util.CanonicalLinkDetector. Found
+ links are stored in CrawlDatum metadata as "canonical.link".
+
+
+
fetcher.timelimit.mins-1
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 485e94a53d..a13894110c 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -101,7 +101,7 @@
crawler-commons, downgraded to commons-io 2.8.0 shipped by Hadoop 3.3.6
https://github.com/commoncrawl/crawler-commons/tree/commons-io-downgrade
-->
-
+
@@ -146,6 +146,7 @@
+
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index eaaf1b2944..a918ec020c 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -31,7 +31,9 @@
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
@@ -67,6 +69,7 @@
import org.apache.nutch.service.NutchServer;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.URLUtil;
+import org.commoncrawl.util.CanonicalLinkDetector;
import org.commoncrawl.util.WarcCapture;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -81,6 +84,8 @@ public class FetcherThread extends Thread {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
+  /**
+   * Placeholder value stored under the canonical link metadata key when no
+   * usable canonical link was found. Declared final because the single
+   * instance is shared by all fetcher threads and must never be reassigned.
+   */
+  private static final Writable EMPTY_VALUE = NullWritable.get();
+
private Configuration conf;
private URLFilters urlFilters;
private URLExemptionFilters urlExemptionFilters;
@@ -140,6 +145,7 @@ public class FetcherThread extends Thread {
private boolean storingProtocolVersions;
private boolean signatureWithoutParsing;
+ private boolean detectCanonicalLink;
private AtomicInteger pages;
@@ -179,6 +185,8 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ
this.parseUtil = new ParseUtil(conf);
this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
this.signatureWithoutParsing = conf.getBoolean("fetcher.signature", false);
+ this.detectCanonicalLink = conf.getBoolean("fetcher.detect.canonical.link",
+ false);
this.protocolFactory = new ProtocolFactory(conf);
this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
@@ -751,6 +759,14 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content,
.calculate(content, new ParseStatus().getEmptyParse(conf));
datum.setSignature(signature);
}
+
+ if (detectCanonicalLink) {
+ /*
+ * TODO: if parsing, then canonical links should be detected on the
+ * DOM tree.
+ */
+ addCanonicalLink(key, datum, content);
+ }
}
/*
@@ -936,6 +952,37 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content,
return null;
}
+  /**
+   * Detect the canonical link in the fetched content and record it in the
+   * metadata of the CrawlDatum under {@link Nutch#CANONICAL_LINK_KEY}.
+   *
+   * Only the first detected link is used. A link which is not an absolute
+   * http(s) URL is resolved against the URL of the page. The link is
+   * normalized (scope: fetcher) but not filtered. If no link is found, or the
+   * link cannot be resolved or normalized, a null value is stored instead.
+   *
+   * @param key
+   *          URL of the fetched page, used as base URL to resolve relative
+   *          links
+   * @param datum
+   *          CrawlDatum whose metadata receives the canonical link
+   * @param content
+   *          fetched content passed to the canonical link detector
+   */
+  private void addCanonicalLink(Text key, CrawlDatum datum, Content content) {
+    List<String> canonicalLinks = CanonicalLinkDetector
+        .detectCanonicalLinks(content);
+    /*
+     * Default to a null value, so that a CrawlDb update overwrites outdated
+     * canonical links.
+     */
+    Writable value = EMPTY_VALUE;
+    if (!canonicalLinks.isEmpty() && !canonicalLinks.get(0).isEmpty()) {
+      LOG.debug("Found canonical links: {}", canonicalLinks);
+      String link = canonicalLinks.get(0);
+      /*
+       * Require the full "http://" or "https://" prefix (ignoring case): a
+       * plain startsWith("http") check would also match relative links such
+       * as "https-guide.html" and leave them unresolved.
+       */
+      boolean absolute = link.regionMatches(true, 0, "http://", 0, 7)
+          || link.regionMatches(true, 0, "https://", 0, 8);
+      try {
+        if (!absolute) {
+          link = URLUtil.resolveURL(new URL(key.toString()), link).toString();
+        }
+        // do not filter, we are just recording the canonical link
+        link = normalizers.normalize(link, URLNormalizers.SCOPE_FETCHER);
+        if (link != null) {
+          value = new Text(link);
+        }
+      } catch (MalformedURLException e) {
+        // keep the null value, but leave a trace of the rejected link
+        LOG.debug("Ignoring invalid canonical link {} found in {}: {}", link,
+            key, e.getMessage());
+      }
+    }
+    datum.getMetaData().put(Nutch.CANONICAL_LINK_KEY, value);
+  }
+
private void outputRobotsTxt(List robotsTxtContent) throws InterruptedException {
for (Content robotsTxt : robotsTxtContent) {
LOG.debug("Fetched and stored robots.txt {}",
diff --git a/src/java/org/apache/nutch/metadata/Nutch.java b/src/java/org/apache/nutch/metadata/Nutch.java
index 0cfb26369b..eea25e8e95 100644
--- a/src/java/org/apache/nutch/metadata/Nutch.java
+++ b/src/java/org/apache/nutch/metadata/Nutch.java
@@ -17,6 +17,7 @@
package org.apache.nutch.metadata;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
/**
* A collection of Nutch internal metadata constants.
@@ -114,4 +115,6 @@ public interface Nutch {
public static final String FETCH_EVENT_FETCHTIME = "fetchTime";
/** Content-lanueage key in the Pub/Sub event metadata for the content-language of the parsed page*/
public static final String FETCH_EVENT_CONTENTLANG = "content-language";
+
+ /**
+ * Key ("canonical.link") under which the fetcher stores the detected
+ * canonical link in the CrawlDatum metadata.
+ */
+ public static final Writable CANONICAL_LINK_KEY = new Text("canonical.link");
}
diff --git a/src/java/org/commoncrawl/util/ByteArrayCharSequence.java b/src/java/org/commoncrawl/util/ByteArrayCharSequence.java
new file mode 100644
index 0000000000..9946933a30
--- /dev/null
+++ b/src/java/org/commoncrawl/util/ByteArrayCharSequence.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.commoncrawl.util;
+
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Wrap a byte array as a {@link CharSequence} in
+ * {@link StandardCharsets#ISO_8859_1} encoding.
+ *
+ * For regular expression matching on ASCII characters only, the wrapper should
+ * be faster than creating a {@link String} from the byte array or a
+ * subsequence, because no bytes are converted to chars and no memory is
+ * allocated for a new String.
+ *
+ * Similar wrappers are part of
+ * extJWNL,
+ * BUbiNG, and other Java
+ * libraries.
+ */
+public class ByteArrayCharSequence implements CharSequence {
+
+  /** Wrapped array, shared with the caller (never copied). */
+  private final byte[] data;
+  /** Number of bytes (chars) in this sequence. */
+  private final int length;
+  /** Index in {@link #data} of the first byte of this sequence. */
+  private final int offset;
+
+  /** Create an empty sequence. */
+  public ByteArrayCharSequence() {
+    this(new byte[0], 0, 0);
+  }
+
+  /**
+   * Wrap the entire byte array.
+   *
+   * @param data
+   *          bytes to wrap, the array is not copied
+   */
+  public ByteArrayCharSequence(final byte[] data) {
+    this(data, 0, data.length);
+  }
+
+  /**
+   * Wrap the first <code>length</code> bytes of the array.
+   *
+   * @param data
+   *          bytes to wrap, the array is not copied
+   * @param length
+   *          number of bytes to expose
+   */
+  public ByteArrayCharSequence(final byte[] data, int length) {
+    this(data, 0, length);
+  }
+
+  /**
+   * Wrap <code>length</code> bytes of the array starting at
+   * <code>offset</code>.
+   *
+   * @param data
+   *          bytes to wrap, the array is not copied
+   * @param offset
+   *          index of the first byte to expose
+   * @param length
+   *          number of bytes to expose
+   * @throws ArrayIndexOutOfBoundsException
+   *           if offset is negative or the range exceeds the array
+   * @throws IllegalArgumentException
+   *           if length is negative
+   */
+  public ByteArrayCharSequence(final byte[] data, int offset, int length) {
+    this.data = data;
+    if (offset < 0) {
+      throw new ArrayIndexOutOfBoundsException("Negative offset: " + offset);
+    }
+    if (length < 0) {
+      throw new IllegalArgumentException("Negative length:" + length);
+    }
+    // compare without adding offset and length: the sum may overflow int
+    if (length > (data.length - offset)) {
+      throw new ArrayIndexOutOfBoundsException(
+          "(Offset + length) > array_length");
+    }
+    this.length = length;
+    this.offset = offset;
+  }
+
+  @Override
+  public int length() {
+    return this.length;
+  }
+
+  /**
+   * Return the byte at the given position as char (ISO-8859-1: the unsigned
+   * byte value is the code point).
+   *
+   * @throws IndexOutOfBoundsException
+   *           if index is negative or not less than {@link #length()}
+   */
+  @Override
+  public char charAt(int index) {
+    // reject negative indexes explicitly: with offset > 0 they would
+    // otherwise address bytes in front of this sequence
+    if (index < 0 || index >= length) {
+      throw new IndexOutOfBoundsException("" + index);
+    }
+    return (char) (data[offset + index] & 0xff);
+  }
+
+  /**
+   * Return a view on a part of this sequence, sharing the same byte array.
+   *
+   * @throws IndexOutOfBoundsException
+   *           if start or end are negative, if end is greater than
+   *           {@link #length()}, or if start is greater than end
+   */
+  @Override
+  public CharSequence subSequence(int start, int end) {
+    // validate against the bounds of this sequence, not those of the
+    // underlying array which may be larger
+    if (start < 0 || end > length || start > end) {
+      throw new IndexOutOfBoundsException(
+          "start " + start + ", end " + end + ", length " + length);
+    }
+    return new ByteArrayCharSequence(data, offset + start, end - start);
+  }
+
+  /** Decode the wrapped bytes as ISO-8859-1 into a new String. */
+  @Override
+  public String toString() {
+    return new String(data, offset, length, StandardCharsets.ISO_8859_1);
+  }
+}
\ No newline at end of file
diff --git a/src/java/org/commoncrawl/util/CanonicalLinkDetector.java b/src/java/org/commoncrawl/util/CanonicalLinkDetector.java
new file mode 100644
index 0000000000..8e40777013
--- /dev/null
+++ b/src/java/org/commoncrawl/util/CanonicalLinkDetector.java
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.commoncrawl.util;
+
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.http.HeaderElement;
+import org.apache.http.NameValuePair;
+import org.apache.http.ParseException;
+import org.apache.http.message.BasicHeaderValueParser;
+import org.apache.nutch.protocol.Content;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class CanonicalLinkDetector {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+
+ protected static Set SUPPORTED_CONTENT_TYPES = new HashSet<>();
+ static {
+ SUPPORTED_CONTENT_TYPES.add("text/html");
+ SUPPORTED_CONTENT_TYPES.add("application/xhtml+xml");
+ }
+
+ /**
+ * Pattern to match canonical link elements in HTML. The length of the
+ * canonical link URL inside the element is limited to max. 2048 characters.
+ */
+ private static Pattern canonicalLinkPattern = Pattern.compile(
+ "]{0,2054}rel=(?:'canonical'|\"canonical\"|canonical\\b)[^>]{0,2054}>",
+ Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
+ private static Pattern hrefPattern = Pattern
+ .compile("href=['\"]?([^'\"\\s]{0,2048})", Pattern.CASE_INSENSITIVE);
+
+ private static Pattern canonicalRelValuePattern = Pattern
+ .compile("\\bcanonical\\b", Pattern.CASE_INSENSITIVE);
+ private static final Pattern linkInParentheses = Pattern
+ .compile("^\\s*<\\s*(.*?)\\s*>\\s*$");
+
+ private static final List EMPTY_RESULT = List.of();
+
+ /** top-N bytes of HTML to look for canonical link */
+ private static int CHUNK_SIZE = 65536;
+
+ /** max. number canonical links to detect */
+ private static int MAX_LINKS = 1;
+
+ /**
+ * Extract canonical link from HTTP header.
+ *
+ * The extraction is delegated to {@link BasicHeaderValueParser} because
+ * parsing multi-valued link attributes is far from trivial, e.g.
+ *
+ *