From 63dafc185fd3f7daf94ceca00aa4eaff35bb5892 Mon Sep 17 00:00:00 2001 From: Julien Nioche Date: Wed, 13 Dec 2023 10:54:32 +0000 Subject: [PATCH 1/8] Have as many WARCBolt instances as there are workers, fix #64 Signed-off-by: Julien Nioche --- .../java/org/commoncrawl/stormcrawler/news/CrawlTopology.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java index 15f4074..8b7d54c 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java @@ -94,7 +94,7 @@ protected int run(String[] args) { // take it from feed default output so that the feed files themselves // don't get included - unless we want them too of course! - builder.setBolt("warc", warcbolt).localOrShuffleGrouping("feed"); + builder.setBolt("warc", warcbolt, numWorkers).localOrShuffleGrouping("feed"); BoltDeclarer statusBolt = builder.setBolt("status", new StatusUpdaterBolt(), numWorkers) .localOrShuffleGrouping("fetch", Constants.StatusStreamName) From efd0d2407acac5d7970cd915dbf114f76ccb024f Mon Sep 17 00:00:00 2001 From: Julien Nioche Date: Wed, 13 Dec 2023 11:23:15 +0000 Subject: [PATCH 2/8] Route tuples to the status updater bolt based on URLs,fixes #65 Signed-off-by: Julien Nioche --- conf/crawler.flux | 15 ++++++++++----- .../stormcrawler/news/CrawlTopology.java | 14 +++++++++----- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/conf/crawler.flux b/conf/crawler.flux index c116f0a..1d221c2 100644 --- a/conf/crawler.flux +++ b/conf/crawler.flux @@ -172,31 +172,36 @@ streams: - from: "prefilter" to: "status" grouping: - type: LOCAL_OR_SHUFFLE + type: FIELDS + args: ["url"] streamId: "status" - from: "fetcher" to: "status" grouping: - type: LOCAL_OR_SHUFFLE + type: FIELDS + args: ["url"] streamId: "status" - from: "sitemap" to: "status" grouping: - type: LOCAL_OR_SHUFFLE + type: FIELDS + args: ["url"] streamId: "status" - from: "feed" to: "status" grouping: - type: LOCAL_OR_SHUFFLE + type: FIELDS + args: ["url"] streamId: "status" - from: "ssbolt" to: "status" grouping: - type: LOCAL_OR_SHUFFLE + type: FIELDS + args: ["url"] streamId: "status" # part of the topology used to inject seeds diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java index 8b7d54c..bfba4b7 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java @@ -95,16 +95,20 @@ protected int run(String[] args) { // take it from feed default output so that the feed files themselves // don't get included - unless we want them too of course! builder.setBolt("warc", warcbolt, numWorkers).localOrShuffleGrouping("feed"); + + final Fields furl = new Fields("url"); BoltDeclarer statusBolt = builder.setBolt("status", new StatusUpdaterBolt(), numWorkers) - .localOrShuffleGrouping("fetch", Constants.StatusStreamName) - .localOrShuffleGrouping("sitemap", Constants.StatusStreamName) - .localOrShuffleGrouping("feed", Constants.StatusStreamName) - .localOrShuffleGrouping("ssb", Constants.StatusStreamName) - .localOrShuffleGrouping("prefilter", Constants.StatusStreamName).setNumTasks(numShards); + .fieldsGrouping("fetch", Constants.StatusStreamName, furl) + .fieldsGrouping("sitemap", Constants.StatusStreamName, furl) + .fieldsGrouping("feed", Constants.StatusStreamName, furl) + .fieldsGrouping("ssb", Constants.StatusStreamName, furl) + .fieldsGrouping("prefilter", Constants.StatusStreamName, furl); + if (args.length >= 2) { statusBolt.customGrouping("filter", Constants.StatusStreamName, new URLStreamGrouping()); } + statusBolt.setNumTasks(numShards); return submit(conf, builder); } From 4c854e0e2b42bf2a167fe9026458f72bb997d134 Mon Sep 17 00:00:00 2001 From: silentninja Date: Fri, 21 Feb 2025 18:36:15 +0400 Subject: [PATCH 3/8] Add cross submit verification for sitemaps --- .../news/NewsSiteMapParserBolt.java | 74 +++++++++++++++++++ .../news/NewsSiteMapParserTest.java | 52 +++++++++++-- src/test/resources/cross-sitemap-news.xml | 61 +++++++++++++++ 3 files changed, 180 insertions(+), 7 deletions(-) create mode 100644 src/test/resources/cross-sitemap-news.xml diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java index 42f4de3..d2bedc7 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java @@ -16,6 +16,7 @@ import static com.digitalpebble.stormcrawler.Constants.StatusStreamName; import java.io.IOException; +import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Calendar; @@ -25,7 +26,11 @@ import java.util.List; import java.util.Map; +import com.digitalpebble.stormcrawler.protocol.Protocol; +import com.digitalpebble.stormcrawler.protocol.ProtocolFactory; +import crawlercommons.robots.BaseRobotRules; import org.apache.commons.lang.StringUtils; +import org.apache.storm.Config; import org.apache.storm.metric.api.MeanReducer; import org.apache.storm.metric.api.ReducedMetric; import org.apache.storm.task.OutputCollector; @@ -68,6 +73,15 @@ */ @SuppressWarnings("serial") public class NewsSiteMapParserBolt extends SiteMapParserBolt { + public ProtocolFactory getProtocolFactory() { + return protocolFactory; + } + + public void setProtocolFactory(ProtocolFactory protocolFactory) { + this.protocolFactory = protocolFactory; + } + + private ProtocolFactory protocolFactory; // TODO: // this is a modified copy of c.d.s.bolt.SiteMapParserBolt // - make parent class extensible and overridable @@ -257,6 +271,29 @@ public void execute(Tuple tuple) { // send outlinks to status stream for (Outlink ol : outlinks) { + try { + if (!crossSubmitCheck(ol, url)) { + String errorMessage = String.format("Cross Submit check failed for %s in %s", ol.getTargetURL(), url); + LOG.error(errorMessage); + ol.getMetadata().setValue(Constants.STATUS_ERROR_SOURCE, + "cross submit check"); + ol.getMetadata().setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage); + Values v = new Values(ol.getTargetURL(), ol.getMetadata(), + Status.ERROR); + collector.emit(StatusStreamName, tuple, v); + continue; + } + } catch (MalformedURLException e) { + String errorMessage = String.format("Malformed URL in outlink %s: %s", url, e); + LOG.error(errorMessage); + ol.getMetadata().setValue(Constants.STATUS_ERROR_SOURCE, + "cross submit check"); + ol.getMetadata().setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage); + Values v = new Values(ol.getTargetURL(), ol.getMetadata(), + Status.ERROR); + collector.emit(StatusStreamName, tuple, v); + } + if (isSitemapIndex) { ol.getMetadata().setValue(isSitemapKey, "true"); if (isSitemapVerified) { @@ -278,6 +315,40 @@ public void execute(Tuple tuple) { collector.ack(tuple); } + /** + * Checks whether a sitemap URL is allowed to submit URLs for another host. + * If the sitemap and target URLs are on the same host, submission is allowed. + * For cross-host submissions, checks robots.txt rules of the target host. + * + * @param ol The outlink containing the target URL to check + * @param sitemap The URL of the sitemap + * @return true if submission is allowed, false otherwise + * @throws MalformedURLException if URLs are malformed + */ + public boolean crossSubmitCheck(Outlink ol, String sitemap ) throws MalformedURLException{ + + URL targetURL; + URL sitemapURL; + sitemapURL = new URL(sitemap); + + targetURL = new URL(ol.getTargetURL()); + if (targetURL.getHost().equals(sitemapURL.getHost())) { + // same host, no need to check robots.txt + return true; + } + else { + Protocol protocol = protocolFactory.getProtocol(targetURL); + BaseRobotRules rules = protocol.getRobotRules(ol.getTargetURL()); + if (rules != null) { + return rules.getSitemaps().contains(sitemapURL.toString()) && rules.isAllowed(targetURL.toString()); + } + else { + // no robots.txt and a cross submit host, so disallow + return false; + } + } + } + public SitemapType detectContent(String url, byte[] content) { // try to detect content based on the first n bytes // works for XML and non-compressed documents @@ -482,11 +553,14 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content, public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { super.prepare(stormConf, context, collector); + Config conf = new Config(); + conf.putAll(stormConf); sniffContent = ConfUtils.getBoolean(stormConf, "sitemap.sniffContent", false); filterHoursSinceModified = ConfUtils.getInt(stormConf, "sitemap.filter.hours.since.modified", -1); parseFilters = ParseFilters.fromConf(stormConf); + protocolFactory = ProtocolFactory.getInstance(conf); int maxOffsetGuess = ConfUtils.getInt(stormConf, "sitemap.offset.guess", 1024); contentDetector = new ContentDetector( diff --git a/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java b/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java index 7d64720..6e955b3 100644 --- a/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java +++ b/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java @@ -13,19 +13,20 @@ */ package org.commoncrawl.stormcrawler.news; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.*; +import static org.mockito.Mockito.*; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.net.URL; import java.nio.charset.StandardCharsets; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; +import com.digitalpebble.stormcrawler.protocol.Protocol; +import com.digitalpebble.stormcrawler.protocol.ProtocolFactory; +import crawlercommons.robots.BaseRobotRules; import org.apache.commons.io.IOUtils; import org.commoncrawl.stormcrawler.news.NewsSiteMapParserBolt.SitemapType; import org.junit.Before; @@ -46,6 +47,7 @@ public void setupParserBolt() { config.put("sitemap.sniffContent", true); // allow items published during the last week config.put("sitemap.filter.hours.since.modified", 168); + config.put("http.agent.name", " Mozilla/5.0 (compatible; NewsBot/1.0; +http://example.com/bot)"); prepareParserBolt("test.parsefilters.json", config); } @@ -95,5 +97,41 @@ public void testFeedWithSitemapNamespace() throws IOException, UnknownFormatExce assertNotEquals("RSS feed with sitemap namespace should not be detected as sitemap", SitemapType.SITEMAP, type); } - + @Test + public void testCrossHostSitemapVerification() throws IOException, UnknownFormatException { + String sitemapURL = "https://example.org/sitemap-news.xml"; + String articleURL = "http://www.example.org/business/article55.html"; + String crossHostUrl = "http://www.example.net/ads/sponsored-content.html"; + + // Mock RobotRules and its dependencies + ProtocolFactory mockProtocolFactory = mock(ProtocolFactory.class); + Protocol mockProtocol = mock(Protocol.class); + BaseRobotRules mockRules = mock(BaseRobotRules.class); + when(mockProtocolFactory.getProtocol(any(URL.class))).thenReturn(mockProtocol); + when(mockProtocol.getRobotRules(any(String.class))).thenReturn(mockRules); + + when(mockRules.getSitemaps()).thenReturn(Collections.singletonList(sitemapURL)); + when(mockRules.isAllowed(articleURL)).thenReturn(true); + // Set up test data + byte[] content = readContent("cross-sitemap-news.xml"); + String contentType = ""; + Metadata parentMetadata = new Metadata(); + List links = new ArrayList<>(); + + // Set recent publication date and cross-host URL + LocalDateTime yesterday = LocalDateTime.now().minusDays(1); + content = (new String(content, StandardCharsets.UTF_8)) + .replace("2008-12-23", + "" + yesterday.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + "") + .getBytes(StandardCharsets.UTF_8); + + // Inject mocked protocol factory + ((NewsSiteMapParserBolt) bolt).setProtocolFactory(mockProtocolFactory); + + ((NewsSiteMapParserBolt) bolt).parseSiteMap(sitemapURL, content, contentType, parentMetadata, links); + // Verify the cross-host link is allowed and included + assertEquals(2, links.size()); + assertTrue(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(0), sitemapURL)); + assertFalse(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(1), sitemapURL)); + } } diff --git a/src/test/resources/cross-sitemap-news.xml b/src/test/resources/cross-sitemap-news.xml new file mode 100644 index 0000000..5309d85 --- /dev/null +++ b/src/test/resources/cross-sitemap-news.xml @@ -0,0 +1,61 @@ + + + + + + + http://www.example.org/business/article55.html + + + The Example Times + en + + PressRelease, Blog + 2008-12-23 + Companies A, B in Merger Talks + business, merger, acquisition, A, B + NASDAQ:A, NASDAQ:B + + + + http://example.org/image.jpg + + + http://example.org/photo.jpg + This is the caption. + Limerick, Ireland + Example photo shot in Limerick, Ireland + https://creativecommons.org/licenses/by/4.0/legalcode + + + + + http://www.example.net/ads/sponsored-content.html + + + Example News + en + + Advertisement, Sponsored + 2008-12-23 + Special Holiday Deals from Our Partners + sponsored, advertisement, deals, holiday + + + http://www.example.net/ads/promo-banner.jpg + Holiday Season Special Offers + Promotional Banner + + + + From 0a58e3b5ba43730f611ce93d69ca6fb0a1a71f94 Mon Sep 17 00:00:00 2001 From: silentninja Date: Wed, 26 Feb 2025 17:51:45 +0400 Subject: [PATCH 4/8] Add cross submit verification check for sitemap indexes --- .../stormcrawler/news/CrawlTopology.java | 21 ++++ .../news/NewsSiteMapParserBolt.java | 67 ++++++++---- .../news/NewsSiteMapParserTest.java | 101 ++++++++++++++++-- src/test/resources/cross-sitemap-news.xml | 22 +++- 4 files changed, 183 insertions(+), 28 deletions(-) diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java index ce21c7a..77d2e37 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java @@ -21,6 +21,11 @@ import java.util.LinkedHashMap; import java.util.Map; +import com.digitalpebble.stormcrawler.Metadata; +import com.digitalpebble.stormcrawler.persistence.Status; +import org.apache.storm.Config; +import org.apache.storm.LocalCluster; +import org.apache.storm.StormSubmitter; import org.apache.storm.topology.BoltDeclarer; import org.apache.storm.topology.TopologyBuilder; import org.apache.storm.tuple.Fields; @@ -156,4 +161,20 @@ protected WARCHdfsBolt getWarcBolt(String filePrefix) { return warcbolt; } + @Override + protected int submit(String name, Config conf, TopologyBuilder builder) { + // register for serialization with Kryo + Config.registerSerialization(conf, Metadata.class); + Config.registerSerialization(conf, Status.class); + + try { + LocalCluster cluster = new LocalCluster(); + cluster.submitTopology(name, conf, builder.createTopology()); + } catch (Exception e) { + e.printStackTrace(); + return -1; + } + return 0; + + } } diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java index d2bedc7..3bb62bc 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java @@ -28,6 +28,7 @@ import com.digitalpebble.stormcrawler.protocol.Protocol; import com.digitalpebble.stormcrawler.protocol.ProtocolFactory; +import com.digitalpebble.stormcrawler.util.MetadataTransfer; import crawlercommons.robots.BaseRobotRules; import org.apache.commons.lang.StringUtils; import org.apache.storm.Config; @@ -81,6 +82,16 @@ public void setProtocolFactory(ProtocolFactory protocolFactory) { this.protocolFactory = protocolFactory; } + public MetadataTransfer getMetadataTransfer() { + return metadataTransfer; + } + + public void setMetadataTransfer(MetadataTransfer metadataTransfer) { + this.metadataTransfer = metadataTransfer; + } + + private MetadataTransfer metadataTransfer; + private ProtocolFactory protocolFactory; // TODO: // this is a modified copy of c.d.s.bolt.SiteMapParserBolt @@ -272,7 +283,8 @@ public void execute(Tuple tuple) { // send outlinks to status stream for (Outlink ol : outlinks) { try { - if (!crossSubmitCheck(ol, url)) { + if (!crossSubmitCheck(ol, url, metadata)) { + String errorMessage = String.format("Cross Submit check failed for %s in %s", ol.getTargetURL(), url); LOG.error(errorMessage); ol.getMetadata().setValue(Constants.STATUS_ERROR_SOURCE, @@ -320,33 +332,51 @@ public void execute(Tuple tuple) { * If the sitemap and target URLs are on the same host, submission is allowed. * For cross-host submissions, checks robots.txt rules of the target host. * - * @param ol The outlink containing the target URL to check - * @param sitemap The URL of the sitemap + * @param ol The outlink containing the target URL to check + * @param sitemap The URL of the sitemap + * @param metadata * @return true if submission is allowed, false otherwise * @throws MalformedURLException if URLs are malformed */ - public boolean crossSubmitCheck(Outlink ol, String sitemap ) throws MalformedURLException{ - - URL targetURL; - URL sitemapURL; - sitemapURL = new URL(sitemap); + public boolean crossSubmitCheck(Outlink ol, String sitemap, Metadata metadata) throws MalformedURLException { + URL targetURL = new URL(ol.getTargetURL()); + URL sitemapURL = new URL(sitemap); - targetURL = new URL(ol.getTargetURL()); + // Same host - allow if (targetURL.getHost().equals(sitemapURL.getHost())) { - // same host, no need to check robots.txt return true; } - else { - Protocol protocol = protocolFactory.getProtocol(targetURL); - BaseRobotRules rules = protocol.getRobotRules(ol.getTargetURL()); - if (rules != null) { - return rules.getSitemaps().contains(sitemapURL.toString()) && rules.isAllowed(targetURL.toString()); + + // Cross-host checks + Metadata targetMetadata = metadataTransfer.getMetaForOutlink(ol.getTargetURL(), sitemapURL.toString(), metadata); + String[] urlPaths = targetMetadata.getValues("url.path"); + + // Check url.path metadata first + if (urlPaths != null) { + for (String path : urlPaths) { + if (new URL(path).getHost().equals(targetURL.getHost())) { + return true; + } } - else { - // no robots.txt and a cross submit host, so disallow - return false; + } + + // Check robots.txt rules + Protocol protocol = protocolFactory.getProtocol(targetURL); + BaseRobotRules rules = protocol.getRobotRules(ol.getTargetURL()); + if (rules != null) { + if (rules.getSitemaps().contains(sitemapURL.toString())) { + return true; + } + if (urlPaths != null) { + for (String path : urlPaths) { + if (rules.getSitemaps().contains(path)) { + return true; + } + } } } + + return false; } public SitemapType detectContent(String url, byte[] content) { @@ -555,6 +585,7 @@ public void prepare(Map stormConf, TopologyContext context, super.prepare(stormConf, context, collector); Config conf = new Config(); conf.putAll(stormConf); + metadataTransfer = MetadataTransfer.getInstance(stormConf); sniffContent = ConfUtils.getBoolean(stormConf, "sitemap.sniffContent", false); filterHoursSinceModified = ConfUtils.getInt(stormConf, diff --git a/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java b/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java index 6e955b3..7db4f19 100644 --- a/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java +++ b/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java @@ -26,6 +26,7 @@ import com.digitalpebble.stormcrawler.protocol.Protocol; import com.digitalpebble.stormcrawler.protocol.ProtocolFactory; +import com.digitalpebble.stormcrawler.util.MetadataTransfer; import crawlercommons.robots.BaseRobotRules; import org.apache.commons.io.IOUtils; import org.commoncrawl.stormcrawler.news.NewsSiteMapParserBolt.SitemapType; @@ -99,19 +100,22 @@ public void testFeedWithSitemapNamespace() throws IOException, UnknownFormatExce } @Test public void testCrossHostSitemapVerification() throws IOException, UnknownFormatException { - String sitemapURL = "https://example.org/sitemap-news.xml"; + String sitemapURL = "https://www.example.org/sitemap-news.xml"; String articleURL = "http://www.example.org/business/article55.html"; - String crossHostUrl = "http://www.example.net/ads/sponsored-content.html"; + String adSitemapURL = "https://www.example.net/sitemap-ads.xml"; // Mock RobotRules and its dependencies ProtocolFactory mockProtocolFactory = mock(ProtocolFactory.class); Protocol mockProtocol = mock(Protocol.class); - BaseRobotRules mockRules = mock(BaseRobotRules.class); when(mockProtocolFactory.getProtocol(any(URL.class))).thenReturn(mockProtocol); - when(mockProtocol.getRobotRules(any(String.class))).thenReturn(mockRules); + BaseRobotRules mockRules = mock(BaseRobotRules.class); + when(mockProtocol.getRobotRules(articleURL)).thenReturn(mockRules); when(mockRules.getSitemaps()).thenReturn(Collections.singletonList(sitemapURL)); - when(mockRules.isAllowed(articleURL)).thenReturn(true); + + BaseRobotRules mockRules1 = mock(BaseRobotRules.class); + when(mockProtocol.getRobotRules(any(String.class))).thenReturn(mockRules1); + when(mockRules.getSitemaps()).thenReturn(Collections.singletonList(adSitemapURL)); // Set up test data byte[] content = readContent("cross-sitemap-news.xml"); String contentType = ""; @@ -130,8 +134,89 @@ public void testCrossHostSitemapVerification() throws IOException, UnknownFormat ((NewsSiteMapParserBolt) bolt).parseSiteMap(sitemapURL, content, contentType, parentMetadata, links); // Verify the cross-host link is allowed and included - assertEquals(2, links.size()); - assertTrue(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(0), sitemapURL)); - assertFalse(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(1), sitemapURL)); + assertEquals(3, links.size()); + assertTrue(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(0), sitemapURL, parentMetadata)); + assertFalse(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(1), sitemapURL, parentMetadata)); } + + /** + * Tests cross-host sitemap submissions with the following structure: + * + * www.example.org/sitemap-news.xml + * └── www.example.com/sports/news1.html + * └── www.example.org/business/article55.html + * └── www.example.net/ads/sponsored-content.html + * + * www.example.org/robots.txt + * └── www.example.org/sitemap-index.xml + * └── www.example.org/sitemap-news.xml + * + * www.example.com/robots.txt + * └── www.example.org/sitemap-index.xml (shared with www.example.org) + * + * www.example.net/robots.txt + * └── www.example.net/sitemap.xml + * └── www.example.net/ads/sponsored-content.html + * + * URLs from example.org and example.com pass crossSubmitCheck since their robots.txt + * reference the same sitemap index which contains the sitemap from which the link is fetched. + * URLs from example.net fail since their robots reference a different sitemap index. + */ + @Test + public void test_cross_host_submission_sitemaps() throws IOException, UnknownFormatException { + String sitemapURL = "https://www.example.org/sitemap-news.xml"; + String sitemapIndexURL = "https://www.example.org/sitemap-index.xml"; + String adSitemapURL = "https://www.example.net/sitemap-ads.xml"; + + + // Mock RobotRules and its dependencies + ProtocolFactory mockProtocolFactory = mock(ProtocolFactory.class); + Protocol mockProtocol = mock(Protocol.class); + when(mockProtocolFactory.getProtocol(any(URL.class))).thenReturn(mockProtocol); + + BaseRobotRules mockRules = mock(BaseRobotRules.class); + when(mockProtocol.getRobotRules("http://www.example.org/business/article55.html")).thenReturn(mockRules); + when(mockRules.getSitemaps()).thenReturn(Collections.singletonList(sitemapIndexURL)); + + BaseRobotRules mockRules1 = mock(BaseRobotRules.class); + when(mockProtocol.getRobotRules("http://www.example.com/sports/news1.html")).thenReturn(mockRules1); + when(mockRules1.getSitemaps()).thenReturn(Collections.singletonList(sitemapIndexURL)); + + BaseRobotRules mockRules2 = mock(BaseRobotRules.class); + when(mockProtocol.getRobotRules("http://www.example.net/ads/sponsored-content.html")).thenReturn(mockRules2); + when(mockRules2.getSitemaps()).thenReturn(Collections.singletonList(adSitemapURL)); + + // Mocking MetadataTransfer to return specific url.path metadata + MetadataTransfer metadataTransferMock = mock(MetadataTransfer.class); + Metadata targetMetadata = new Metadata(); + targetMetadata.addValues("url.path", Arrays.asList(sitemapIndexURL, sitemapURL)); + when(metadataTransferMock.getMetaForOutlink(anyString(), anyString(), any(Metadata.class))) + .thenReturn(targetMetadata); + + // Injecting the mock into the bolt + ((NewsSiteMapParserBolt) bolt).setMetadataTransfer(metadataTransferMock); + + // Set up test data + byte[] content = readContent("cross-sitemap-news.xml"); + String contentType = ""; + Metadata parentMetadata = new Metadata(); + List links = new ArrayList<>(); + + // Set recent publication date and cross-host URL + LocalDateTime yesterday = LocalDateTime.now().minusDays(1); + content = (new String(content, StandardCharsets.UTF_8)) + .replace("2008-12-23", + "" + yesterday.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + "") + .getBytes(StandardCharsets.UTF_8); + + // Inject mocked protocol factory + ((NewsSiteMapParserBolt) bolt).setProtocolFactory(mockProtocolFactory); + + ((NewsSiteMapParserBolt) bolt).parseSiteMap(sitemapURL, content, contentType, parentMetadata, links); + // Verify the cross-host link is allowed and included + assertEquals(3, links.size()); + assertTrue(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(0), sitemapURL, parentMetadata)); + assertFalse(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(1), sitemapURL, parentMetadata)); + assertTrue(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(2), sitemapURL, parentMetadata)); + } } diff --git a/src/test/resources/cross-sitemap-news.xml b/src/test/resources/cross-sitemap-news.xml index 5309d85..02ef434 100644 --- a/src/test/resources/cross-sitemap-news.xml +++ b/src/test/resources/cross-sitemap-news.xml @@ -28,7 +28,7 @@ - http://example.org/image.jpg + http://www.example.org/image.jpg http://example.org/photo.jpg @@ -39,7 +39,7 @@ - + http://www.example.net/ads/sponsored-content.html @@ -57,5 +57,23 @@ Promotional Banner + + http://www.example.com/sports/news1.html + + + Example News + en + + Advertisement, Sponsored + 2008-12-23 + Special Holiday Deals from Our Partners + sponsored, advertisement, deals, holiday + + + http://www.example.net/ads/promo-banner.jpg + Holiday Season Special Offers + Promotional Banner + + From 8cf52f53ab1f205fe7f06e5f88fd39ba3f49100e Mon Sep 17 00:00:00 2001 From: silentninja Date: Mon, 3 Mar 2025 15:12:20 +0400 Subject: [PATCH 5/8] Use URI instead of URL --- .../stormcrawler/news/NewsSiteMapParserBolt.java | 16 +++++++++------- .../stormcrawler/news/NewsSiteMapParserTest.java | 5 +++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java index 3bb62bc..5ad9e28 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java @@ -17,6 +17,8 @@ import java.io.IOException; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.Calendar; @@ -295,7 +297,7 @@ public void execute(Tuple tuple) { collector.emit(StatusStreamName, tuple, v); continue; } - } catch (MalformedURLException e) { + } catch (MalformedURLException | URISyntaxException e) { String errorMessage = String.format("Malformed URL in outlink %s: %s", url, e); LOG.error(errorMessage); ol.getMetadata().setValue(Constants.STATUS_ERROR_SOURCE, @@ -338,9 +340,9 @@ public void execute(Tuple tuple) { * @return true if submission is allowed, false otherwise * @throws MalformedURLException if URLs are malformed */ - public boolean crossSubmitCheck(Outlink ol, String sitemap, Metadata metadata) throws MalformedURLException { - URL targetURL = new URL(ol.getTargetURL()); - URL sitemapURL = new URL(sitemap); + public boolean crossSubmitCheck(Outlink ol, String sitemap, Metadata metadata) throws URISyntaxException, MalformedURLException { + URI targetURL = new URI(ol.getTargetURL()); + URI sitemapURL = new URI(sitemap); // Same host - allow if (targetURL.getHost().equals(sitemapURL.getHost())) { @@ -348,7 +350,7 @@ public boolean crossSubmitCheck(Outlink ol, String sitemap, Metadata metadata) t } // Cross-host checks - Metadata targetMetadata = metadataTransfer.getMetaForOutlink(ol.getTargetURL(), sitemapURL.toString(), metadata); + Metadata targetMetadata = metadataTransfer.getMetaForOutlink(targetURL.toString(), sitemapURL.toString(), metadata); String[] urlPaths = targetMetadata.getValues("url.path"); // Check url.path metadata first @@ -361,8 +363,8 @@ public boolean crossSubmitCheck(Outlink ol, String sitemap, Metadata metadata) t } // Check robots.txt rules - Protocol protocol = protocolFactory.getProtocol(targetURL); - BaseRobotRules rules = protocol.getRobotRules(ol.getTargetURL()); + Protocol protocol = protocolFactory.getProtocol(targetURL.toURL()); + BaseRobotRules rules = protocol.getRobotRules(targetURL.toString()); if (rules != null) { if (rules.getSitemaps().contains(sitemapURL.toString())) { return true; diff --git a/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java b/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java index 7db4f19..be60942 100644 --- a/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java +++ b/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java @@ -18,6 +18,7 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.net.URISyntaxException; import java.net.URL; import java.nio.charset.StandardCharsets; import java.time.LocalDateTime; @@ -99,7 +100,7 @@ public void testFeedWithSitemapNamespace() throws IOException, UnknownFormatExce SitemapType.SITEMAP, type); } @Test - public void testCrossHostSitemapVerification() throws IOException, UnknownFormatException { + public void testCrossHostSitemapVerification() throws IOException, UnknownFormatException, URISyntaxException { String sitemapURL = "https://www.example.org/sitemap-news.xml"; String articleURL = "http://www.example.org/business/article55.html"; String adSitemapURL = "https://www.example.net/sitemap-ads.xml"; @@ -163,7 +164,7 @@ public void testCrossHostSitemapVerification() throws IOException, UnknownFormat * URLs from example.net fail since their robots reference a different sitemap index. */ @Test - public void test_cross_host_submission_sitemaps() throws IOException, UnknownFormatException { + public void test_cross_host_submission_sitemaps() throws IOException, UnknownFormatException, URISyntaxException { String sitemapURL = "https://www.example.org/sitemap-news.xml"; String sitemapIndexURL = "https://www.example.org/sitemap-index.xml"; String adSitemapURL = "https://www.example.net/sitemap-ads.xml"; From 29e5f3d0e7ab8a2f5cf52fdbc0e6c6c06cc723f1 Mon Sep 17 00:00:00 2001 From: silentninja Date: Mon, 3 Mar 2025 15:29:45 +0400 Subject: [PATCH 6/8] Revert accidental changes --- .../stormcrawler/news/CrawlTopology.java | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java index 77d2e37..ce21c7a 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java @@ -21,11 +21,6 @@ import java.util.LinkedHashMap; import java.util.Map; -import com.digitalpebble.stormcrawler.Metadata; -import com.digitalpebble.stormcrawler.persistence.Status; -import org.apache.storm.Config; -import org.apache.storm.LocalCluster; -import org.apache.storm.StormSubmitter; import org.apache.storm.topology.BoltDeclarer; import org.apache.storm.topology.TopologyBuilder; import org.apache.storm.tuple.Fields; @@ -161,20 +156,4 @@ protected WARCHdfsBolt getWarcBolt(String filePrefix) { return warcbolt; } - @Override - protected int submit(String name, Config conf, TopologyBuilder builder) { - // register for serialization with Kryo - Config.registerSerialization(conf, Metadata.class); - Config.registerSerialization(conf, Status.class); - - try { - LocalCluster cluster = new LocalCluster(); - cluster.submitTopology(name, conf, builder.createTopology()); - } catch (Exception e) { - e.printStackTrace(); - return -1; - } - return 0; - - } } From 3a2bd46ecb575d485b6f65cbd271efc52286e88c Mon Sep 17 00:00:00 2001 From: silentninja Date: Tue, 4 Mar 2025 20:19:13 +0400 Subject: [PATCH 7/8] Allow lenient cross submits as config --- .../news/NewsSiteMapParserBolt.java | 23 ++++++++++++++++--- .../stormcrawler/utils/DomainChecker.java | 12 ++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) create mode 100644 src/main/java/org/commoncrawl/stormcrawler/utils/DomainChecker.java diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java index 5ad9e28..f990b94 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java @@ -40,6 +40,7 @@ import org.apache.storm.task.TopologyContext; import org.apache.storm.tuple.Tuple; import org.apache.storm.tuple.Values; +import org.commoncrawl.stormcrawler.utils.DomainChecker; import org.slf4j.LoggerFactory; import com.digitalpebble.stormcrawler.Constants; @@ -160,6 +161,9 @@ public static enum SitemapType { /** Delay in minutes used for scheduling sub-sitemaps **/ private int scheduleSitemapsWithDelay = -1; + private boolean crossSubmitAllowed; + private boolean crossSubmitLenient; + @Override public void execute(Tuple tuple) { Metadata metadata = (Metadata) tuple.getValueByField("metadata"); @@ -285,7 +289,7 @@ public void execute(Tuple tuple) { // send outlinks to status stream for (Outlink ol : outlinks) { try { - if (!crossSubmitCheck(ol, url, metadata)) { + if (!this.crossSubmitAllowed && !crossSubmitCheck(ol, url, metadata)) { String errorMessage = String.format("Cross Submit check failed for %s in %s", ol.getTargetURL(), url); LOG.error(errorMessage); @@ -329,6 +333,13 @@ public void execute(Tuple tuple) { collector.ack(tuple); } + public String getHost(URI url) { + if (this.crossSubmitLenient) { + return DomainChecker.getPayLevelDomain(url.getHost()); + } + return url.getHost(); + } + /** * Checks whether a sitemap URL is allowed to submit URLs for another host. * If the sitemap and target URLs are on the same host, submission is allowed. @@ -344,8 +355,10 @@ public boolean crossSubmitCheck(Outlink ol, String sitemap, Metadata metadata) t URI targetURL = new URI(ol.getTargetURL()); URI sitemapURL = new URI(sitemap); + String targetHost = this.getHost(targetURL); + String sitemapHost = this.getHost(sitemapURL); // Same host - allow - if (targetURL.getHost().equals(sitemapURL.getHost())) { + if (targetHost.equals(sitemapHost)) { return true; } @@ -356,7 +369,7 @@ public boolean crossSubmitCheck(Outlink ol, String sitemap, Metadata metadata) t // Check url.path metadata first if (urlPaths != null) { for (String path : urlPaths) { - if (new URL(path).getHost().equals(targetURL.getHost())) { + if (this.getHost(new URI(path)).equals(targetHost)) { return true; } } @@ -605,6 +618,10 @@ public void prepare(Map stormConf, TopologyContext context, new ReducedMetric(new MeanReducer()), 30); scheduleSitemapsWithDelay = ConfUtils.getInt(stormConf, "sitemap.schedule.delay", scheduleSitemapsWithDelay); + crossSubmitAllowed = ConfUtils.getBoolean(stormConf, + "crossSubmit.allowed", crossSubmitAllowed); + crossSubmitLenient = ConfUtils.getBoolean(stormConf, + "crossSubmit.lenient", crossSubmitLenient);; } } diff --git a/src/main/java/org/commoncrawl/stormcrawler/utils/DomainChecker.java b/src/main/java/org/commoncrawl/stormcrawler/utils/DomainChecker.java new file mode 100644 index 0000000..a1e2561 --- /dev/null +++ b/src/main/java/org/commoncrawl/stormcrawler/utils/DomainChecker.java @@ -0,0 +1,12 @@ +package org.commoncrawl.stormcrawler.utils; + +import org.apache.http.conn.util.PublicSuffixMatcher; +import org.apache.http.conn.util.PublicSuffixMatcherLoader; + +public class DomainChecker { + + public static String getPayLevelDomain(String domain) { + PublicSuffixMatcher matcher = PublicSuffixMatcherLoader.getDefault(); + return matcher.getDomainRoot(domain); // Returns PLD (registered domain) + } +} \ No newline at end of file From 72a74c35caa88ed481438e7f17d0346d54739f80 Mon Sep 17 00:00:00 2001 From: silentninja Date: Wed, 5 Mar 2025 17:05:48 +0400 Subject: [PATCH 8/8] use crawlercommons.domains.EffectiveTldFinder instead of org.apache.http.conn.util.PublicSuffixMatcher to get the hostnames when checking for cross submit --- .../stormcrawler/news/NewsSiteMapParserBolt.java | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java index f990b94..0500244 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java @@ -31,6 +31,7 @@ import com.digitalpebble.stormcrawler.protocol.Protocol; import com.digitalpebble.stormcrawler.protocol.ProtocolFactory; import com.digitalpebble.stormcrawler.util.MetadataTransfer; +import crawlercommons.domains.EffectiveTldFinder; import crawlercommons.robots.BaseRobotRules; import org.apache.commons.lang.StringUtils; import org.apache.storm.Config; @@ -40,7 +41,6 @@ import org.apache.storm.task.TopologyContext; import org.apache.storm.tuple.Tuple; import org.apache.storm.tuple.Values; -import org.commoncrawl.stormcrawler.utils.DomainChecker; import org.slf4j.LoggerFactory; import com.digitalpebble.stormcrawler.Constants; @@ -161,8 +161,8 @@ public static enum SitemapType { /** Delay in minutes used for scheduling sub-sitemaps **/ private int scheduleSitemapsWithDelay = -1; - private boolean crossSubmitAllowed; - private boolean crossSubmitLenient; + private boolean crossSubmitAllowed = false; + private boolean crossSubmitLenient = true; @Override public void execute(Tuple tuple) { @@ -335,7 +335,10 @@ public void execute(Tuple tuple) { public String getHost(URI url) { if (this.crossSubmitLenient) { - return DomainChecker.getPayLevelDomain(url.getHost()); + /// www.example.com-> "example.com" + /// blog.subdomain.example.co.uk -> "example.co.uk" + /// www.myapp.github.io -> "myapp.github.io" (excludePrivate is false) + return EffectiveTldFinder.getAssignedDomain(url.getHost(), true,false); } return url.getHost(); }