diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java index 42f4de3..0500244 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java @@ -16,6 +16,9 @@ import static com.digitalpebble.stormcrawler.Constants.StatusStreamName; import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.Calendar; @@ -25,7 +28,13 @@ import java.util.List; import java.util.Map; +import com.digitalpebble.stormcrawler.protocol.Protocol; +import com.digitalpebble.stormcrawler.protocol.ProtocolFactory; +import com.digitalpebble.stormcrawler.util.MetadataTransfer; +import crawlercommons.domains.EffectiveTldFinder; +import crawlercommons.robots.BaseRobotRules; import org.apache.commons.lang.StringUtils; +import org.apache.storm.Config; import org.apache.storm.metric.api.MeanReducer; import org.apache.storm.metric.api.ReducedMetric; import org.apache.storm.task.OutputCollector; @@ -68,6 +77,25 @@ */ @SuppressWarnings("serial") public class NewsSiteMapParserBolt extends SiteMapParserBolt { + public ProtocolFactory getProtocolFactory() { + return protocolFactory; + } + + public void setProtocolFactory(ProtocolFactory protocolFactory) { + this.protocolFactory = protocolFactory; + } + + public MetadataTransfer getMetadataTransfer() { + return metadataTransfer; + } + + public void setMetadataTransfer(MetadataTransfer metadataTransfer) { + this.metadataTransfer = metadataTransfer; + } + + private MetadataTransfer metadataTransfer; + + private ProtocolFactory protocolFactory; // TODO: // this is a modified copy of c.d.s.bolt.SiteMapParserBolt // - make parent class extensible and overridable @@ -133,6 +161,9 @@ public static enum SitemapType { /** Delay in minutes used for scheduling sub-sitemaps **/ private int scheduleSitemapsWithDelay = -1; + private boolean crossSubmitAllowed = false; + private boolean crossSubmitLenient = true; + @Override public void execute(Tuple tuple) { Metadata metadata = (Metadata) tuple.getValueByField("metadata"); @@ -257,6 +288,30 @@ public void execute(Tuple tuple) { // send outlinks to status stream for (Outlink ol : outlinks) { + try { + if (!this.crossSubmitAllowed && !crossSubmitCheck(ol, url, metadata)) { + + String errorMessage = String.format("Cross Submit check failed for %s in %s", ol.getTargetURL(), url); + LOG.error(errorMessage); + ol.getMetadata().setValue(Constants.STATUS_ERROR_SOURCE, + "cross submit check"); + ol.getMetadata().setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage); + Values v = new Values(ol.getTargetURL(), ol.getMetadata(), + Status.ERROR); + collector.emit(StatusStreamName, tuple, v); + continue; + } + } catch (MalformedURLException | URISyntaxException e) { + String errorMessage = String.format("Malformed URL in outlink %s: %s", url, e); + LOG.error(errorMessage); + ol.getMetadata().setValue(Constants.STATUS_ERROR_SOURCE, + "cross submit check"); + ol.getMetadata().setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage); + Values v = new Values(ol.getTargetURL(), ol.getMetadata(), + Status.ERROR); + collector.emit(StatusStreamName, tuple, v); + } + if (isSitemapIndex) { ol.getMetadata().setValue(isSitemapKey, "true"); if (isSitemapVerified) { @@ -278,6 +333,70 @@ public void execute(Tuple tuple) { collector.ack(tuple); } + public String getHost(URI url) { + if (this.crossSubmitLenient) { + /// www.example.com-> "example.com" + /// blog.subdomain.example.co.uk -> "example.co.uk" + /// www.myapp.github.io -> "myapp.github.io" (excludePrivate is false) + return EffectiveTldFinder.getAssignedDomain(url.getHost(), true,false); + } + return url.getHost(); + } + + /** + * Checks whether a sitemap URL is allowed to submit URLs for another host. + * If the sitemap and target URLs are on the same host, submission is allowed. + * For cross-host submissions, checks robots.txt rules of the target host. + * + * @param ol The outlink containing the target URL to check + * @param sitemap The URL of the sitemap + * @param metadata + * @return true if submission is allowed, false otherwise + * @throws MalformedURLException if URLs are malformed + */ + public boolean crossSubmitCheck(Outlink ol, String sitemap, Metadata metadata) throws URISyntaxException, MalformedURLException { + URI targetURL = new URI(ol.getTargetURL()); + URI sitemapURL = new URI(sitemap); + + String targetHost = this.getHost(targetURL); + String sitemapHost = this.getHost(sitemapURL); + // Same host - allow + if (targetHost.equals(sitemapHost)) { + return true; + } + + // Cross-host checks + Metadata targetMetadata = metadataTransfer.getMetaForOutlink(targetURL.toString(), sitemapURL.toString(), metadata); + String[] urlPaths = targetMetadata.getValues("url.path"); + + // Check url.path metadata first + if (urlPaths != null) { + for (String path : urlPaths) { + if (this.getHost(new URI(path)).equals(targetHost)) { + return true; + } + } + } + + // Check robots.txt rules + Protocol protocol = protocolFactory.getProtocol(targetURL.toURL()); + BaseRobotRules rules = protocol.getRobotRules(targetURL.toString()); + if (rules != null) { + if (rules.getSitemaps().contains(sitemapURL.toString())) { + return true; + } + if (urlPaths != null) { + for (String path : urlPaths) { + if (rules.getSitemaps().contains(path)) { + return true; + } + } + } + } + + return false; + } + public SitemapType detectContent(String url, byte[] content) { // try to detect content based on the first n bytes // works for XML and non-compressed documents @@ -482,11 +601,15 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content, public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { super.prepare(stormConf, context, collector); + Config conf = new Config(); + conf.putAll(stormConf); + metadataTransfer = MetadataTransfer.getInstance(stormConf); sniffContent = ConfUtils.getBoolean(stormConf, "sitemap.sniffContent", false); filterHoursSinceModified = ConfUtils.getInt(stormConf, "sitemap.filter.hours.since.modified", -1); parseFilters = ParseFilters.fromConf(stormConf); + protocolFactory = ProtocolFactory.getInstance(conf); int maxOffsetGuess = ConfUtils.getInt(stormConf, "sitemap.offset.guess", 1024); contentDetector = new ContentDetector( @@ -498,6 +621,10 @@ public void prepare(Map stormConf, TopologyContext context, new ReducedMetric(new MeanReducer()), 30); scheduleSitemapsWithDelay = ConfUtils.getInt(stormConf, "sitemap.schedule.delay", scheduleSitemapsWithDelay); + crossSubmitAllowed = ConfUtils.getBoolean(stormConf, + "crossSubmit.allowed", crossSubmitAllowed); + crossSubmitLenient = ConfUtils.getBoolean(stormConf, + "crossSubmit.lenient", crossSubmitLenient);; } } diff --git a/src/main/java/org/commoncrawl/stormcrawler/utils/DomainChecker.java b/src/main/java/org/commoncrawl/stormcrawler/utils/DomainChecker.java new file mode 100644 index 0000000..a1e2561 --- /dev/null +++ b/src/main/java/org/commoncrawl/stormcrawler/utils/DomainChecker.java @@ -0,0 +1,12 @@ +package org.commoncrawl.stormcrawler.utils; + +import org.apache.http.conn.util.PublicSuffixMatcher; +import org.apache.http.conn.util.PublicSuffixMatcherLoader; + +public class DomainChecker { + + public static String getPayLevelDomain(String domain) { + PublicSuffixMatcher matcher = PublicSuffixMatcherLoader.getDefault(); + return matcher.getDomainRoot(domain); // Returns PLD (registered domain) + } +} \ No newline at end of file diff --git a/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java b/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java index 7d64720..be60942 100644 --- a/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java +++ b/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java @@ -13,19 +13,22 @@ */ package org.commoncrawl.stormcrawler.news; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.*; +import static org.mockito.Mockito.*; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.net.URISyntaxException; +import java.net.URL; import java.nio.charset.StandardCharsets; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; +import com.digitalpebble.stormcrawler.protocol.Protocol; +import com.digitalpebble.stormcrawler.protocol.ProtocolFactory; +import com.digitalpebble.stormcrawler.util.MetadataTransfer; +import crawlercommons.robots.BaseRobotRules; import org.apache.commons.io.IOUtils; import org.commoncrawl.stormcrawler.news.NewsSiteMapParserBolt.SitemapType; import org.junit.Before; @@ -46,6 +49,7 @@ public void setupParserBolt() { config.put("sitemap.sniffContent", true); // allow items published during the last week config.put("sitemap.filter.hours.since.modified", 168); + config.put("http.agent.name", " Mozilla/5.0 (compatible; NewsBot/1.0; +http://example.com/bot)"); prepareParserBolt("test.parsefilters.json", config); } @@ -95,5 +99,125 @@ public void testFeedWithSitemapNamespace() throws IOException, UnknownFormatExce assertNotEquals("RSS feed with sitemap namespace should not be detected as sitemap", SitemapType.SITEMAP, type); } - + @Test + public void testCrossHostSitemapVerification() throws IOException, UnknownFormatException, URISyntaxException { + String sitemapURL = "https://www.example.org/sitemap-news.xml"; + String articleURL = "http://www.example.org/business/article55.html"; + String adSitemapURL = "https://www.example.net/sitemap-ads.xml"; + + // Mock RobotRules and its dependencies + ProtocolFactory mockProtocolFactory = mock(ProtocolFactory.class); + Protocol mockProtocol = mock(Protocol.class); + when(mockProtocolFactory.getProtocol(any(URL.class))).thenReturn(mockProtocol); + + BaseRobotRules mockRules = mock(BaseRobotRules.class); + when(mockProtocol.getRobotRules(articleURL)).thenReturn(mockRules); + when(mockRules.getSitemaps()).thenReturn(Collections.singletonList(sitemapURL)); + + BaseRobotRules mockRules1 = mock(BaseRobotRules.class); + when(mockProtocol.getRobotRules(any(String.class))).thenReturn(mockRules1); + when(mockRules.getSitemaps()).thenReturn(Collections.singletonList(adSitemapURL)); + // Set up test data + byte[] content = readContent("cross-sitemap-news.xml"); + String contentType = ""; + Metadata parentMetadata = new Metadata(); + List links = new ArrayList<>(); + + // Set recent publication date and cross-host URL + LocalDateTime yesterday = LocalDateTime.now().minusDays(1); + content = (new String(content, StandardCharsets.UTF_8)) + .replace("2008-12-23", + "" + yesterday.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + "") + .getBytes(StandardCharsets.UTF_8); + + // Inject mocked protocol factory + ((NewsSiteMapParserBolt) bolt).setProtocolFactory(mockProtocolFactory); + + ((NewsSiteMapParserBolt) bolt).parseSiteMap(sitemapURL, content, contentType, parentMetadata, links); + // Verify the cross-host link is allowed and included + assertEquals(3, links.size()); + assertTrue(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(0), sitemapURL, parentMetadata)); + assertFalse(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(1), sitemapURL, parentMetadata)); + } + + /** + * Tests cross-host sitemap submissions with the following structure: + * + * www.example.org/sitemap-news.xml + * └── www.example.com/sports/news1.html + * └── www.example.org/business/article55.html + * └── www.example.net/ads/sponsored-content.html + * + * www.example.org/robots.txt + * └── www.example.org/sitemap-index.xml + * └── www.example.org/sitemap-news.xml + * + * www.example.com/robots.txt + * └── www.example.org/sitemap-index.xml (shared with www.example.org) + * + * www.example.net/robots.txt + * └── www.example.net/sitemap.xml + * └── www.example.net/ads/sponsored-content.html + * + * URLs from example.org and example.com pass crossSubmitCheck since their robots.txt + * reference the same sitemap index which contains the sitemap from which the link is fetched. + * URLs from example.net fail since their robots reference a different sitemap index. + */ + @Test + public void test_cross_host_submission_sitemaps() throws IOException, UnknownFormatException, URISyntaxException { + String sitemapURL = "https://www.example.org/sitemap-news.xml"; + String sitemapIndexURL = "https://www.example.org/sitemap-index.xml"; + String adSitemapURL = "https://www.example.net/sitemap-ads.xml"; + + + // Mock RobotRules and its dependencies + ProtocolFactory mockProtocolFactory = mock(ProtocolFactory.class); + Protocol mockProtocol = mock(Protocol.class); + when(mockProtocolFactory.getProtocol(any(URL.class))).thenReturn(mockProtocol); + + BaseRobotRules mockRules = mock(BaseRobotRules.class); + when(mockProtocol.getRobotRules("http://www.example.org/business/article55.html")).thenReturn(mockRules); + when(mockRules.getSitemaps()).thenReturn(Collections.singletonList(sitemapIndexURL)); + + BaseRobotRules mockRules1 = mock(BaseRobotRules.class); + when(mockProtocol.getRobotRules("http://www.example.com/sports/news1.html")).thenReturn(mockRules1); + when(mockRules1.getSitemaps()).thenReturn(Collections.singletonList(sitemapIndexURL)); + + BaseRobotRules mockRules2 = mock(BaseRobotRules.class); + when(mockProtocol.getRobotRules("http://www.example.net/ads/sponsored-content.html")).thenReturn(mockRules2); + when(mockRules2.getSitemaps()).thenReturn(Collections.singletonList(adSitemapURL)); + + // Mocking MetadataTransfer to return specific url.path metadata + MetadataTransfer metadataTransferMock = mock(MetadataTransfer.class); + Metadata targetMetadata = new Metadata(); + targetMetadata.addValues("url.path", Arrays.asList(sitemapIndexURL, sitemapURL)); + when(metadataTransferMock.getMetaForOutlink(anyString(), anyString(), any(Metadata.class))) + .thenReturn(targetMetadata); + + // Injecting the mock into the bolt + ((NewsSiteMapParserBolt) bolt).setMetadataTransfer(metadataTransferMock); + + // Set up test data + byte[] content = readContent("cross-sitemap-news.xml"); + String contentType = ""; + Metadata parentMetadata = new Metadata(); + List links = new ArrayList<>(); + + // Set recent publication date and cross-host URL + LocalDateTime yesterday = LocalDateTime.now().minusDays(1); + content = (new String(content, StandardCharsets.UTF_8)) + .replace("2008-12-23", + "" + yesterday.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + "") + .getBytes(StandardCharsets.UTF_8); + + // Inject mocked protocol factory + ((NewsSiteMapParserBolt) bolt).setProtocolFactory(mockProtocolFactory); + + ((NewsSiteMapParserBolt) bolt).parseSiteMap(sitemapURL, content, contentType, parentMetadata, links); + // Verify the cross-host link is allowed and included + assertEquals(3, links.size()); + assertTrue(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(0), sitemapURL, parentMetadata)); + assertFalse(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(1), sitemapURL, parentMetadata)); + assertTrue(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(2), sitemapURL, parentMetadata)); + } } diff --git a/src/test/resources/cross-sitemap-news.xml b/src/test/resources/cross-sitemap-news.xml new file mode 100644 index 0000000..02ef434 --- /dev/null +++ b/src/test/resources/cross-sitemap-news.xml @@ -0,0 +1,79 @@ + + + + + + + http://www.example.org/business/article55.html + + + The Example Times + en + + PressRelease, Blog + 2008-12-23 + Companies A, B in Merger Talks + business, merger, acquisition, A, B + NASDAQ:A, NASDAQ:B + + + + http://www.example.org/image.jpg + + + http://example.org/photo.jpg + This is the caption. + Limerick, Ireland + Example photo shot in Limerick, Ireland + https://creativecommons.org/licenses/by/4.0/legalcode + + + + + http://www.example.net/ads/sponsored-content.html + + + Example News + en + + Advertisement, Sponsored + 2008-12-23 + Special Holiday Deals from Our Partners + sponsored, advertisement, deals, holiday + + + http://www.example.net/ads/promo-banner.jpg + Holiday Season Special Offers + Promotional Banner + + + + http://www.example.com/sports/news1.html + + + Example News + en + + Advertisement, Sponsored + 2008-12-23 + Special Holiday Deals from Our Partners + sponsored, advertisement, deals, holiday + + + http://www.example.net/ads/promo-banner.jpg + Holiday Season Special Offers + Promotional Banner + + + +