Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
import static com.digitalpebble.stormcrawler.Constants.StatusStreamName;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Calendar;
Expand All @@ -25,7 +28,13 @@
import java.util.List;
import java.util.Map;

import com.digitalpebble.stormcrawler.protocol.Protocol;
import com.digitalpebble.stormcrawler.protocol.ProtocolFactory;
import com.digitalpebble.stormcrawler.util.MetadataTransfer;
import crawlercommons.domains.EffectiveTldFinder;
import crawlercommons.robots.BaseRobotRules;
import org.apache.commons.lang.StringUtils;
import org.apache.storm.Config;
import org.apache.storm.metric.api.MeanReducer;
import org.apache.storm.metric.api.ReducedMetric;
import org.apache.storm.task.OutputCollector;
Expand Down Expand Up @@ -68,6 +77,25 @@
*/
@SuppressWarnings("serial")
public class NewsSiteMapParserBolt extends SiteMapParserBolt {
public ProtocolFactory getProtocolFactory() {
return protocolFactory;
}

public void setProtocolFactory(ProtocolFactory protocolFactory) {
this.protocolFactory = protocolFactory;
}

public MetadataTransfer getMetadataTransfer() {
return metadataTransfer;
}

public void setMetadataTransfer(MetadataTransfer metadataTransfer) {
this.metadataTransfer = metadataTransfer;
}

private MetadataTransfer metadataTransfer;

private ProtocolFactory protocolFactory;
// TODO:
// this is a modified copy of c.d.s.bolt.SiteMapParserBolt
// - make parent class extensible and overridable
Expand Down Expand Up @@ -133,6 +161,9 @@ public static enum SitemapType {
/** Delay in minutes used for scheduling sub-sitemaps **/
private int scheduleSitemapsWithDelay = -1;

private boolean crossSubmitAllowed = false;
private boolean crossSubmitLenient = true;

@Override
public void execute(Tuple tuple) {
Metadata metadata = (Metadata) tuple.getValueByField("metadata");
Expand Down Expand Up @@ -257,6 +288,30 @@ public void execute(Tuple tuple) {

// send outlinks to status stream
for (Outlink ol : outlinks) {
try {
if (!this.crossSubmitAllowed && !crossSubmitCheck(ol, url, metadata)) {

String errorMessage = String.format("Cross Submit check failed for %s in %s", ol.getTargetURL(), url);
LOG.error(errorMessage);
ol.getMetadata().setValue(Constants.STATUS_ERROR_SOURCE,
"cross submit check");
ol.getMetadata().setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
Values v = new Values(ol.getTargetURL(), ol.getMetadata(),
Status.ERROR);
collector.emit(StatusStreamName, tuple, v);

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question, whether URLs failing the cross-submit check should be emitted at all:

  • it unnecessary fills up the status index (it's about URLs which are not considered to be fetched)
  • the error status is not necessarily a permanent one: depending on the configuration, a error page is retried after some time. Then the cross-submit check would be circumvented after a time delay.

continue;
}
} catch (MalformedURLException | URISyntaxException e) {
String errorMessage = String.format("Malformed URL in outlink %s: %s", url, e);
LOG.error(errorMessage);
ol.getMetadata().setValue(Constants.STATUS_ERROR_SOURCE,
"cross submit check");
ol.getMetadata().setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
Values v = new Values(ol.getTargetURL(), ol.getMetadata(),
Status.ERROR);
collector.emit(StatusStreamName, tuple, v);
}

if (isSitemapIndex) {
ol.getMetadata().setValue(isSitemapKey, "true");
if (isSitemapVerified) {
Expand All @@ -278,6 +333,70 @@ public void execute(Tuple tuple) {
collector.ack(tuple);
}

public String getHost(URI url) {
if (this.crossSubmitLenient) {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 Nice. That's a good, lean solution.

/// www.example.com-> "example.com"
/// blog.subdomain.example.co.uk -> "example.co.uk"
/// www.myapp.github.io -> "myapp.github.io" (excludePrivate is false)
return EffectiveTldFinder.getAssignedDomain(url.getHost(), true,false);
}
return url.getHost();
}

/**
* Checks whether a sitemap URL is allowed to submit URLs for another host.
* If the sitemap and target URLs are on the same host, submission is allowed.
* For cross-host submissions, checks robots.txt rules of the target host.
*
* @param ol The outlink containing the target URL to check
* @param sitemap The URL of the sitemap
* @param metadata
* @return true if submission is allowed, false otherwise
* @throws MalformedURLException if URLs are malformed
*/
public boolean crossSubmitCheck(Outlink ol, String sitemap, Metadata metadata) throws URISyntaxException, MalformedURLException {
URI targetURL = new URI(ol.getTargetURL());
URI sitemapURL = new URI(sitemap);

String targetHost = this.getHost(targetURL);
String sitemapHost = this.getHost(sitemapURL);
// Same host - allow
if (targetHost.equals(sitemapHost)) {
return true;
}

// Cross-host checks
Metadata targetMetadata = metadataTransfer.getMetaForOutlink(targetURL.toString(), sitemapURL.toString(), metadata);

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My hint about the MetadataTransfer wasn't about using it for link checking, but to forward the robots.txt URL to the record of a sitemap URL.

This can be done by setting the configuration property metadata.track.path to true. The MetadataTransfer is called on all outlinks and also redirects from the StatusEmitterBolt.

Two caveats:

  • not the robots.txt URL is tracked but the URL which was going to be fetched and triggered the robots.txt to be fetched and checked. But this should be equivalent.
  • in case, there is a chain of redirects and/or a sitemap index record between robots.txt and the news sitemap, all intermediate "hops" are recorded. Then the first element of Metadata.urlPathKeyName is the original URL. Note: better use the string constant, not the literal "url.path".

String[] urlPaths = targetMetadata.getValues("url.path");

// Check url.path metadata first
if (urlPaths != null) {
for (String path : urlPaths) {
if (this.getHost(new URI(path)).equals(targetHost)) {
return true;
}
}
}

// Check robots.txt rules
Protocol protocol = protocolFactory.getProtocol(targetURL.toURL());
BaseRobotRules rules = protocol.getRobotRules(targetURL.toString());
if (rules != null) {
if (rules.getSitemaps().contains(sitemapURL.toString())) {
return true;
}
if (urlPaths != null) {
for (String path : urlPaths) {
if (rules.getSitemaps().contains(path)) {
return true;
}
}
}
}

return false;
}

public SitemapType detectContent(String url, byte[] content) {
// try to detect content based on the first n bytes
// works for XML and non-compressed documents
Expand Down Expand Up @@ -482,11 +601,15 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
public void prepare(Map stormConf, TopologyContext context,
OutputCollector collector) {
super.prepare(stormConf, context, collector);
Config conf = new Config();
conf.putAll(stormConf);
metadataTransfer = MetadataTransfer.getInstance(stormConf);
sniffContent = ConfUtils.getBoolean(stormConf,
"sitemap.sniffContent", false);
filterHoursSinceModified = ConfUtils.getInt(stormConf,
"sitemap.filter.hours.since.modified", -1);
parseFilters = ParseFilters.fromConf(stormConf);
protocolFactory = ProtocolFactory.getInstance(conf);
int maxOffsetGuess = ConfUtils.getInt(stormConf, "sitemap.offset.guess",
1024);
contentDetector = new ContentDetector(
Expand All @@ -498,6 +621,10 @@ public void prepare(Map stormConf, TopologyContext context,
new ReducedMetric(new MeanReducer()), 30);
scheduleSitemapsWithDelay = ConfUtils.getInt(stormConf,
"sitemap.schedule.delay", scheduleSitemapsWithDelay);
crossSubmitAllowed = ConfUtils.getBoolean(stormConf,
"crossSubmit.allowed", crossSubmitAllowed);
crossSubmitLenient = ConfUtils.getBoolean(stormConf,
"crossSubmit.lenient", crossSubmitLenient);;
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package org.commoncrawl.stormcrawler.utils;

import org.apache.http.conn.util.PublicSuffixMatcher;
import org.apache.http.conn.util.PublicSuffixMatcherLoader;

public class DomainChecker {

public static String getPayLevelDomain(String domain) {
PublicSuffixMatcher matcher = PublicSuffixMatcherLoader.getDefault();
return matcher.getDomainRoot(domain); // Returns PLD (registered domain)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,22 @@
*/
package org.commoncrawl.stormcrawler.news;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.*;
import static org.mockito.Mockito.*;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;

import com.digitalpebble.stormcrawler.protocol.Protocol;
import com.digitalpebble.stormcrawler.protocol.ProtocolFactory;
import com.digitalpebble.stormcrawler.util.MetadataTransfer;
import crawlercommons.robots.BaseRobotRules;
import org.apache.commons.io.IOUtils;
import org.commoncrawl.stormcrawler.news.NewsSiteMapParserBolt.SitemapType;
import org.junit.Before;
Expand All @@ -46,6 +49,7 @@ public void setupParserBolt() {
config.put("sitemap.sniffContent", true);
// allow items published during the last week
config.put("sitemap.filter.hours.since.modified", 168);
config.put("http.agent.name", " Mozilla/5.0 (compatible; NewsBot/1.0; +http://example.com/bot)");
prepareParserBolt("test.parsefilters.json", config);
}

Expand Down Expand Up @@ -95,5 +99,125 @@ public void testFeedWithSitemapNamespace() throws IOException, UnknownFormatExce
assertNotEquals("RSS feed with sitemap namespace should not be detected as sitemap",
SitemapType.SITEMAP, type);
}

@Test

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 Good to have a unit test!

public void testCrossHostSitemapVerification() throws IOException, UnknownFormatException, URISyntaxException {
String sitemapURL = "https://www.example.org/sitemap-news.xml";
String articleURL = "http://www.example.org/business/article55.html";
String adSitemapURL = "https://www.example.net/sitemap-ads.xml";

// Mock RobotRules and its dependencies
ProtocolFactory mockProtocolFactory = mock(ProtocolFactory.class);
Protocol mockProtocol = mock(Protocol.class);
when(mockProtocolFactory.getProtocol(any(URL.class))).thenReturn(mockProtocol);

BaseRobotRules mockRules = mock(BaseRobotRules.class);
when(mockProtocol.getRobotRules(articleURL)).thenReturn(mockRules);
when(mockRules.getSitemaps()).thenReturn(Collections.singletonList(sitemapURL));

BaseRobotRules mockRules1 = mock(BaseRobotRules.class);
when(mockProtocol.getRobotRules(any(String.class))).thenReturn(mockRules1);
when(mockRules.getSitemaps()).thenReturn(Collections.singletonList(adSitemapURL));
// Set up test data
byte[] content = readContent("cross-sitemap-news.xml");
String contentType = "";
Metadata parentMetadata = new Metadata();
List<Outlink> links = new ArrayList<>();

// Set recent publication date and cross-host URL
LocalDateTime yesterday = LocalDateTime.now().minusDays(1);
content = (new String(content, StandardCharsets.UTF_8))
.replace("<news:publication_date>2008-12-23</news:publication_date>",
"<news:publication_date>" + yesterday.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + "</news:publication_date>")
.getBytes(StandardCharsets.UTF_8);

// Inject mocked protocol factory
((NewsSiteMapParserBolt) bolt).setProtocolFactory(mockProtocolFactory);

((NewsSiteMapParserBolt) bolt).parseSiteMap(sitemapURL, content, contentType, parentMetadata, links);
// Verify the cross-host link is allowed and included
assertEquals(3, links.size());
assertTrue(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(0), sitemapURL, parentMetadata));
assertFalse(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(1), sitemapURL, parentMetadata));
}

/**
* Tests cross-host sitemap submissions with the following structure:
*
* www.example.org/sitemap-news.xml
* └── www.example.com/sports/news1.html
* └── www.example.org/business/article55.html
* └── www.example.net/ads/sponsored-content.html
*
* www.example.org/robots.txt
* └── www.example.org/sitemap-index.xml
* └── www.example.org/sitemap-news.xml
*
* www.example.com/robots.txt
* └── www.example.org/sitemap-index.xml (shared with www.example.org)
*
* www.example.net/robots.txt
* └── www.example.net/sitemap.xml
* └── www.example.net/ads/sponsored-content.html
*
* URLs from example.org and example.com pass crossSubmitCheck since their robots.txt
* reference the same sitemap index which contains the sitemap from which the link is fetched.
* URLs from example.net fail since their robots reference a different sitemap index.
*/
@Test
public void test_cross_host_submission_sitemaps() throws IOException, UnknownFormatException, URISyntaxException {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a canonical Java method name.

String sitemapURL = "https://www.example.org/sitemap-news.xml";
String sitemapIndexURL = "https://www.example.org/sitemap-index.xml";
String adSitemapURL = "https://www.example.net/sitemap-ads.xml";


// Mock RobotRules and its dependencies
ProtocolFactory mockProtocolFactory = mock(ProtocolFactory.class);
Protocol mockProtocol = mock(Protocol.class);
when(mockProtocolFactory.getProtocol(any(URL.class))).thenReturn(mockProtocol);

BaseRobotRules mockRules = mock(BaseRobotRules.class);
when(mockProtocol.getRobotRules("http://www.example.org/business/article55.html")).thenReturn(mockRules);
when(mockRules.getSitemaps()).thenReturn(Collections.singletonList(sitemapIndexURL));

BaseRobotRules mockRules1 = mock(BaseRobotRules.class);
when(mockProtocol.getRobotRules("http://www.example.com/sports/news1.html")).thenReturn(mockRules1);
when(mockRules1.getSitemaps()).thenReturn(Collections.singletonList(sitemapIndexURL));

BaseRobotRules mockRules2 = mock(BaseRobotRules.class);
when(mockProtocol.getRobotRules("http://www.example.net/ads/sponsored-content.html")).thenReturn(mockRules2);
when(mockRules2.getSitemaps()).thenReturn(Collections.singletonList(adSitemapURL));

// Mocking MetadataTransfer to return specific url.path metadata
MetadataTransfer metadataTransferMock = mock(MetadataTransfer.class);
Metadata targetMetadata = new Metadata();
targetMetadata.addValues("url.path", Arrays.asList(sitemapIndexURL, sitemapURL));
when(metadataTransferMock.getMetaForOutlink(anyString(), anyString(), any(Metadata.class)))
.thenReturn(targetMetadata);

// Injecting the mock into the bolt
((NewsSiteMapParserBolt) bolt).setMetadataTransfer(metadataTransferMock);

// Set up test data
byte[] content = readContent("cross-sitemap-news.xml");
String contentType = "";
Metadata parentMetadata = new Metadata();
List<Outlink> links = new ArrayList<>();

// Set recent publication date and cross-host URL
LocalDateTime yesterday = LocalDateTime.now().minusDays(1);
content = (new String(content, StandardCharsets.UTF_8))
.replace("<news:publication_date>2008-12-23</news:publication_date>",
"<news:publication_date>" + yesterday.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + "</news:publication_date>")
.getBytes(StandardCharsets.UTF_8);

// Inject mocked protocol factory
((NewsSiteMapParserBolt) bolt).setProtocolFactory(mockProtocolFactory);

((NewsSiteMapParserBolt) bolt).parseSiteMap(sitemapURL, content, contentType, parentMetadata, links);
// Verify the cross-host link is allowed and included
assertEquals(3, links.size());
assertTrue(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(0), sitemapURL, parentMetadata));
assertFalse(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(1), sitemapURL, parentMetadata));
assertTrue(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(2), sitemapURL, parentMetadata));
}
}
Loading