diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index db264d6ef7..c9a06799ac 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1179,6 +1179,15 @@ + + fetcher.detect.canonical.link + false + If true, fetcher will detect canonical links in HTML content + relying on the class org.commoncrawl.util.CanonicalLinkDetector. Found + links are store in CrawlDatum metadata as "canonical.link". + + + fetcher.timelimit.mins -1 diff --git a/ivy/ivy.xml b/ivy/ivy.xml index 485e94a53d..a13894110c 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -101,7 +101,7 @@ crawler-commons, downgraded to commons-io 2.8.0 shipped by Hadoop 3.3.6 https://github.com/commoncrawl/crawler-commons/tree/commons-io-downgrade --> - + @@ -146,6 +146,7 @@ + diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java index eaaf1b2944..a918ec020c 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java @@ -31,7 +31,9 @@ import java.util.concurrent.atomic.AtomicLong; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; import org.apache.hadoop.util.StringUtils; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.NutchWritable; @@ -67,6 +69,7 @@ import org.apache.nutch.service.NutchServer; import org.apache.nutch.util.StringUtil; import org.apache.nutch.util.URLUtil; +import org.commoncrawl.util.CanonicalLinkDetector; import org.commoncrawl.util.WarcCapture; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -81,6 +84,8 @@ public class FetcherThread extends Thread { private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); + private static Writable EMPTY_VALUE = NullWritable.get(); + private Configuration conf; private URLFilters urlFilters; private URLExemptionFilters 
urlExemptionFilters; @@ -140,6 +145,7 @@ public class FetcherThread extends Thread { private boolean storingProtocolVersions; private boolean signatureWithoutParsing; + private boolean detectCanonicalLink; private AtomicInteger pages; @@ -179,6 +185,8 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ this.parseUtil = new ParseUtil(conf); this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true); this.signatureWithoutParsing = conf.getBoolean("fetcher.signature", false); + this.detectCanonicalLink = conf.getBoolean("fetcher.detect.canonical.link", + false); this.protocolFactory = new ProtocolFactory(conf); this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER); this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000; @@ -751,6 +759,14 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content, .calculate(content, new ParseStatus().getEmptyParse(conf)); datum.setSignature(signature); } + + if (detectCanonicalLink) { + /* + * TODO: if parsing, then canonical links should be detected on the + * DOM tree. + */ + addCanonicalLink(key, datum, content); + } } /* @@ -936,6 +952,37 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content, return null; } + private void addCanonicalLink(Text key, CrawlDatum datum, Content content) { + List canonicalLinks = CanonicalLinkDetector + .detectCanonicalLinks(content); + if (canonicalLinks.isEmpty() || canonicalLinks.get(0).isEmpty()) { + /* + * Add a null value, so that a CrawlDb update overwrites outdated + * canonical links. 
+ */ + datum.getMetaData().put(Nutch.CANONICAL_LINK_KEY, EMPTY_VALUE); + } else { + LOG.debug("Found canonical links: {}", canonicalLinks); + String link = canonicalLinks.get(0); + String urlKey = key.toString(); + try { + if (!link.startsWith("http")) { + link = URLUtil.resolveURL(new URL(urlKey), link).toString(); + } + link = normalizers.normalize(link, URLNormalizers.SCOPE_FETCHER); + // do not filter, we just recording the canonical link + } catch (MalformedURLException e) { + link = null; + } + if (link != null) { + Text canonicalLink = new Text(link); + datum.getMetaData().put(Nutch.CANONICAL_LINK_KEY, canonicalLink); + } else { + datum.getMetaData().put(Nutch.CANONICAL_LINK_KEY, EMPTY_VALUE); + } + } + } + private void outputRobotsTxt(List robotsTxtContent) throws InterruptedException { for (Content robotsTxt : robotsTxtContent) { LOG.debug("Fetched and stored robots.txt {}", diff --git a/src/java/org/apache/nutch/metadata/Nutch.java b/src/java/org/apache/nutch/metadata/Nutch.java index 0cfb26369b..eea25e8e95 100644 --- a/src/java/org/apache/nutch/metadata/Nutch.java +++ b/src/java/org/apache/nutch/metadata/Nutch.java @@ -17,6 +17,7 @@ package org.apache.nutch.metadata; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; /** * A collection of Nutch internal metadata constants. 
@@ -114,4 +115,6 @@ public interface Nutch {
   public static final String FETCH_EVENT_FETCHTIME = "fetchTime";
   /** Content-lanueage key in the Pub/Sub event metadata for the content-language of the parsed page*/
   public static final String FETCH_EVENT_CONTENTLANG = "content-language";
+
+  public static final Writable CANONICAL_LINK_KEY = new Text("canonical.link");
 }
diff --git a/src/java/org/commoncrawl/util/ByteArrayCharSequence.java b/src/java/org/commoncrawl/util/ByteArrayCharSequence.java
new file mode 100644
index 0000000000..9946933a30
--- /dev/null
+++ b/src/java/org/commoncrawl/util/ByteArrayCharSequence.java
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.commoncrawl.util;
+
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Read-only {@link CharSequence} view of a byte array in
+ * {@link StandardCharsets#ISO_8859_1} encoding: every byte is one character.
+ *
+ * For regular expression matching on ASCII characters only, the wrapper should
+ * be faster than creating a {@link String} from the byte array or a
+ * subsequence, because no bytes are converted to chars and no memory is
+ * allocated for a new String. The array is wrapped, not copied.
+ *
+ * Similar wrappers are part of extJWNL, BUbiNG, and other Java libraries.
+ */
+public class ByteArrayCharSequence implements CharSequence {
+
+  private final byte[] data;
+  private final int length;
+  private final int offset;
+
+  /** Creates an empty sequence. */
+  public ByteArrayCharSequence() {
+    this(new byte[0], 0, 0);
+  }
+
+  /** Wraps the entire array. */
+  public ByteArrayCharSequence(final byte[] data) {
+    this(data, 0, data.length);
+  }
+
+  /** Wraps the first {@code length} bytes of the array. */
+  public ByteArrayCharSequence(final byte[] data, int length) {
+    this(data, 0, length);
+  }
+
+  /**
+   * Wraps {@code length} bytes of the array starting at {@code offset}.
+   *
+   * @throws ArrayIndexOutOfBoundsException
+   *           if {@code offset} is negative or the range exceeds the array
+   * @throws IllegalArgumentException
+   *           if {@code length} is negative
+   */
+  public ByteArrayCharSequence(final byte[] data, int offset, int length) {
+    this.data = data;
+    if (offset < 0) {
+      throw new ArrayIndexOutOfBoundsException("Negative offset: " + offset);
+    }
+    if (length < 0) {
+      throw new IllegalArgumentException("Negative length:" + length);
+    }
+    // subtract instead of adding offset and length: the sum could overflow
+    if (length > data.length - offset) {
+      throw new ArrayIndexOutOfBoundsException(
+          "(Offset + length) > array_length");
+    }
+    this.length = length;
+    this.offset = offset;
+  }
+
+  @Override
+  public int length() {
+    return this.length;
+  }
+
+  /**
+   * {@inheritDoc}
+   *
+   * @throws IndexOutOfBoundsException
+   *           if {@code index} is negative or not less than {@link #length()}
+   */
+  @Override
+  public char charAt(int index) {
+    // also reject negative indexes: with a non-zero offset they would
+    // silently address a byte in front of the wrapped range
+    if (index < 0 || index >= length) {
+      throw new IndexOutOfBoundsException(
+          "Index " + index + " out of bounds for length " + length);
+    }
+    // mask the sign extension so that bytes map to the chars 0x00 - 0xff
+    return (char) (data[offset + index] & 0xff);
+  }
+
+  /**
+   * {@inheritDoc}
+   *
+   * The returned sequence wraps the same array, no bytes are copied.
+   *
+   * @throws IndexOutOfBoundsException
+   *           if {@code start} or {@code end} is negative, {@code end} is
+   *           greater than {@link #length()}, or {@code start} is greater than
+   *           {@code end}
+   */
+  @Override
+  public CharSequence subSequence(int start, int end) {
+    if (start < 0 || end > length || start > end) {
+      throw new IndexOutOfBoundsException(
+          "start " + start + ", end " + end + ", length " + length);
+    }
+    return new ByteArrayCharSequence(data, offset + start, end - start);
+  }
+
+  @Override
+  public String toString() {
+    return new String(data, offset, length, StandardCharsets.ISO_8859_1);
+  }
+}
\ No newline at end of file
diff --git a/src/java/org/commoncrawl/util/CanonicalLinkDetector.java b/src/java/org/commoncrawl/util/CanonicalLinkDetector.java
new file mode 100644
index 0000000000..8e40777013
--- /dev/null
+++ b/src/java/org/commoncrawl/util/CanonicalLinkDetector.java
@@ -0,0 +1,226 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.commoncrawl.util;
+
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.http.HeaderElement;
+import org.apache.http.NameValuePair;
+import org.apache.http.ParseException;
+import org.apache.http.message.BasicHeaderValueParser;
+import org.apache.nutch.protocol.Content;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Detects canonical links announced in HTTP "Link" headers or in link
+ * elements at the beginning of HTML content.
+ */
+public class CanonicalLinkDetector {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  /** Content types searched for canonical link elements */
+  protected static Set<String> SUPPORTED_CONTENT_TYPES = new HashSet<>();
+  static {
+    SUPPORTED_CONTENT_TYPES.add("text/html");
+    SUPPORTED_CONTENT_TYPES.add("application/xhtml+xml");
+  }
+
+  /**
+   * Pattern to match canonical link elements in HTML. The length of the
+   * canonical link URL inside the element is limited to max. 2048 characters.
+   *
+   * NOTE(review): the "&lt;link\s" prefix of the expression is an assumption,
+   * confirm it against the committed file.
+   */
+  private static final Pattern canonicalLinkPattern = Pattern.compile(
+      "<link\\s[^>]{0,2054}rel=(?:'canonical'|\"canonical\"|canonical\\b)[^>]{0,2054}>",
+      Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
+  private static final Pattern hrefPattern = Pattern
+      .compile("href=['\"]?([^'\"\\s]{0,2048})", Pattern.CASE_INSENSITIVE);
+
+  private static final Pattern canonicalRelValuePattern = Pattern
+      .compile("\\bcanonical\\b", Pattern.CASE_INSENSITIVE);
+  private static final Pattern linkInParentheses = Pattern
+      .compile("^\\s*<\\s*(.*?)\\s*>\\s*$");
+
+  private static final List<String> EMPTY_RESULT = List.of();
+
+  /** top-N bytes of HTML to look for canonical link */
+  private static final int CHUNK_SIZE = 65536;
+
+  /** max. number canonical links to detect */
+  private static final int MAX_LINKS = 1;
+
+  /**
+   * Extract canonical links from the values of HTTP "Link" headers.
+   *
+   * The extraction is delegated to {@link BasicHeaderValueParser} because
+   * parsing multi-valued link attributes is far from trivial, e.g.
+   *
+   * <pre>
+   * Link: &lt;/canonical/&gt;; rel="canonical",&lt;/s/1&gt;; rel="shortlink"
+   * </pre>
+   *
+   * @param linkHeaders
+   *          "Link" header values
+   * @param maxResults
+   *          max. number of links to return
+   * @return the canonical links found, or an empty list if no canonical link is
+   *         found
+   */
+  protected static List<String> detectCanonicalLinksHttpHeader(
+      String[] linkHeaders, int maxResults) {
+    List<String> result = EMPTY_RESULT;
+    for (String httpHeaderLink : linkHeaders) {
+      HeaderElement[] elements;
+      try {
+        // parse all comma-separated links of the header, not only the first
+        elements = BasicHeaderValueParser.parseElements(httpHeaderLink,
+            BasicHeaderValueParser.INSTANCE);
+      } catch (ParseException e) {
+        LOG.error("Failed to parse Link HTTP header: {}", httpHeaderLink, e);
+        continue;
+      }
+      for (HeaderElement elem : elements) {
+        String link = getCanonicalLink(elem);
+        if (link == null) {
+          continue;
+        }
+        if (result == EMPTY_RESULT) {
+          result = new ArrayList<>(1);
+        }
+        result.add(link);
+        if (result.size() >= maxResults) {
+          // limit reached: skip the remaining elements and headers
+          return result;
+        }
+      }
+    }
+    return result;
+  }
+
+  /** Return the URL of a header element with a canonical rel, else null. */
+  private static String getCanonicalLink(HeaderElement elem) {
+    for (NameValuePair param : elem.getParameters()) {
+      String rel = param.getValue();
+      // guard against a missing value, e.g. a "rel" parameter without "="
+      if (rel != null && "rel".equalsIgnoreCase(param.getName())
+          && canonicalRelValuePattern.matcher(rel).find()) {
+        // match inside < ... >
+        Matcher urlMatcher = linkInParentheses.matcher(elem.getName());
+        return urlMatcher.matches() ? urlMatcher.group(1) : null;
+      }
+    }
+    return null;
+  }
+
+  /** Check whether content of this type is searched for link elements. */
+  public static boolean isEligibleContentType(String contentType) {
+    return SUPPORTED_CONTENT_TYPES.contains(contentType);
+  }
+
+  /**
+   * Extract canonical links from link elements in the leading bytes of HTML
+   * content. Bytes are matched one by one as characters, the content is not
+   * decoded or parsed.
+   *
+   * @param content
+   *          binary HTML content
+   * @param chunkSize
+   *          max. number of leading bytes to search
+   * @param maxResults
+   *          max. number of links to return
+   * @return the href values of the canonical link elements found, or an empty
+   *         list if no canonical link is found
+   */
+  public static List<String> detectCanonicalLinksHTML(byte[] content,
+      int chunkSize, int maxResults) {
+    List<String> result = EMPTY_RESULT;
+    int length = Math.min(content.length, chunkSize);
+    CharSequence cs = new ByteArrayCharSequence(content, length);
+    Matcher clMatcher = canonicalLinkPattern.matcher(cs);
+    while (clMatcher.find()) {
+      CharSequence cls = cs.subSequence(clMatcher.start(), clMatcher.end());
+      Matcher hrefMatcher = hrefPattern.matcher(cls);
+      // start the search behind the first 5 characters of the element
+      if (hrefMatcher.find(5)) {
+        if (result == EMPTY_RESULT) {
+          result = new ArrayList<>(1);
+        }
+        result.add(hrefMatcher.group(1));
+        if (result.size() >= maxResults) {
+          break;
+        }
+      }
+    }
+    return result;
+  }
+
+  /**
+   * Detect canonical links of fetched content: in the HTTP "Link" headers and,
+   * if the limit is not yet reached and the content type is eligible, in the
+   * leading bytes of the content.
+   *
+   * @param content
+   *          fetched content with its metadata
+   * @param chunkSize
+   *          max. number of leading content bytes to search
+   * @param maxLinks
+   *          max. number of links to return
+   * @return the canonical links found, or an empty list if no canonical link is
+   *         found
+   */
+  public static List<String> detectCanonicalLinks(Content content,
+      int chunkSize, int maxLinks) {
+
+    /*
+     * Note: the HTTP header look-up is case-insensitive if
+     * CaseInsensitiveMetadata or SpellCheckedMetadata is used.
+     */
+    String[] linkHeaders = content.getMetadata().getValues("Link");
+    List<String> canonicalLinks = detectCanonicalLinksHttpHeader(linkHeaders,
+        maxLinks);
+
+    int remaining = maxLinks - canonicalLinks.size();
+    if (remaining > 0 && isEligibleContentType(content.getContentType())) {
+      // request only the missing links to keep the total within maxLinks
+      List<String> linksHtml = detectCanonicalLinksHTML(content.getContent(),
+          chunkSize, remaining);
+      if (canonicalLinks.isEmpty()) {
+        canonicalLinks = linksHtml;
+      } else {
+        canonicalLinks.addAll(linksHtml);
+      }
+    }
+    return canonicalLinks;
+  }
+
+  /** Detect canonical links using CHUNK_SIZE and MAX_LINKS as limits. */
+  public static List<String> detectCanonicalLinks(Content content) {
+    return detectCanonicalLinks(content, CHUNK_SIZE, MAX_LINKS);
+  }
+
+}
diff --git a/src/test/org/commoncrawl/util/TestCanonicalLinkDetector.java b/src/test/org/commoncrawl/util/TestCanonicalLinkDetector.java
new file mode 100644
index 0000000000..60843e5a6f
--- /dev/null
+++ b/src/test/org/commoncrawl/util/TestCanonicalLinkDetector.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.commoncrawl.util; + +import static org.junit.jupiter.api.Assertions.*; + +import java.net.URI; +import java.nio.charset.StandardCharsets; +import java.util.List; + +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +class TestCanonicalLinkDetector { + + @ParameterizedTest + @ValueSource(strings = { + "", // + "", // + "", // + "", // + "", // + "", // + "", // + "", // + "", // + }) + void testDetectHTML(String htmlSnippet) { + String canonicalLink = "https://www.example.org/canonical/"; + List canonicalLinks = CanonicalLinkDetector + .detectCanonicalLinksHTML(htmlSnippet.getBytes(StandardCharsets.UTF_8), + 1024, 1); + assertFalse(canonicalLinks.isEmpty()); + URI baseUri = URI.create("https://www.example.org/"); + assertEquals(canonicalLink, + baseUri.resolve(canonicalLinks.get(0)).toASCIIString()); + } + + @ParameterizedTest + @ValueSource(strings = { + "; rel=\"canonical\"", // + "; rel=\"canonical\",; rel=\"shortlink\"", // + "; rel='canonical'", // + "; rel=\"canonical\",; rel=\"shortlink\",; rel=\"shortcut icon\"", // + }) + void testDetectHTTP(String httpHeader) { + String canonicalLink = "https://www.example.org/canonical/"; + String[] linkHeaderValues = List.of(httpHeader).toArray(new String[0]); + List canonicalLinks = CanonicalLinkDetector + .detectCanonicalLinksHttpHeader(linkHeaderValues, 1); + assertFalse(canonicalLinks.isEmpty()); + assertEquals(canonicalLink, canonicalLinks.get(0)); + } +}