-
Notifications
You must be signed in to change notification settings - Fork 41
Check for cross submits #68
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: 2.x
Are you sure you want to change the base?
Changes from all commits
63dafc1
efd0d24
4c854e0
0a58e3b
8cf52f5
29e5f3d
3a2bd46
72a74c3
7642a87
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,6 +16,9 @@ | |
| import static com.digitalpebble.stormcrawler.Constants.StatusStreamName; | ||
|
|
||
| import java.io.IOException; | ||
| import java.net.MalformedURLException; | ||
| import java.net.URI; | ||
| import java.net.URISyntaxException; | ||
| import java.net.URL; | ||
| import java.util.ArrayList; | ||
| import java.util.Calendar; | ||
|
|
@@ -25,7 +28,13 @@ | |
| import java.util.List; | ||
| import java.util.Map; | ||
|
|
||
| import com.digitalpebble.stormcrawler.protocol.Protocol; | ||
| import com.digitalpebble.stormcrawler.protocol.ProtocolFactory; | ||
| import com.digitalpebble.stormcrawler.util.MetadataTransfer; | ||
| import crawlercommons.domains.EffectiveTldFinder; | ||
| import crawlercommons.robots.BaseRobotRules; | ||
| import org.apache.commons.lang.StringUtils; | ||
| import org.apache.storm.Config; | ||
| import org.apache.storm.metric.api.MeanReducer; | ||
| import org.apache.storm.metric.api.ReducedMetric; | ||
| import org.apache.storm.task.OutputCollector; | ||
|
|
@@ -68,6 +77,25 @@ | |
| */ | ||
| @SuppressWarnings("serial") | ||
| public class NewsSiteMapParserBolt extends SiteMapParserBolt { | ||
| public ProtocolFactory getProtocolFactory() { | ||
| return protocolFactory; | ||
| } | ||
|
|
||
| public void setProtocolFactory(ProtocolFactory protocolFactory) { | ||
| this.protocolFactory = protocolFactory; | ||
| } | ||
|
|
||
| public MetadataTransfer getMetadataTransfer() { | ||
| return metadataTransfer; | ||
| } | ||
|
|
||
| public void setMetadataTransfer(MetadataTransfer metadataTransfer) { | ||
| this.metadataTransfer = metadataTransfer; | ||
| } | ||
|
|
||
| private MetadataTransfer metadataTransfer; | ||
|
|
||
| private ProtocolFactory protocolFactory; | ||
| // TODO: | ||
| // this is a modified copy of c.d.s.bolt.SiteMapParserBolt | ||
| // - make parent class extensible and overridable | ||
|
|
@@ -133,6 +161,9 @@ public static enum SitemapType { | |
| /** Delay in minutes used for scheduling sub-sitemaps **/ | ||
| private int scheduleSitemapsWithDelay = -1; | ||
|
|
||
| private boolean crossSubmitAllowed = false; | ||
| private boolean crossSubmitLenient = true; | ||
|
|
||
| @Override | ||
| public void execute(Tuple tuple) { | ||
| Metadata metadata = (Metadata) tuple.getValueByField("metadata"); | ||
|
|
@@ -257,6 +288,30 @@ public void execute(Tuple tuple) { | |
|
|
||
| // send outlinks to status stream | ||
| for (Outlink ol : outlinks) { | ||
| try { | ||
| if (!this.crossSubmitAllowed && !crossSubmitCheck(ol, url, metadata)) { | ||
|
|
||
| String errorMessage = String.format("Cross Submit check failed for %s in %s", ol.getTargetURL(), url); | ||
| LOG.error(errorMessage); | ||
| ol.getMetadata().setValue(Constants.STATUS_ERROR_SOURCE, | ||
| "cross submit check"); | ||
| ol.getMetadata().setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage); | ||
| Values v = new Values(ol.getTargetURL(), ol.getMetadata(), | ||
| Status.ERROR); | ||
| collector.emit(StatusStreamName, tuple, v); | ||
| continue; | ||
| } | ||
| } catch (MalformedURLException | URISyntaxException e) { | ||
| String errorMessage = String.format("Malformed URL in outlink %s: %s", url, e); | ||
| LOG.error(errorMessage); | ||
| ol.getMetadata().setValue(Constants.STATUS_ERROR_SOURCE, | ||
| "cross submit check"); | ||
| ol.getMetadata().setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage); | ||
| Values v = new Values(ol.getTargetURL(), ol.getMetadata(), | ||
| Status.ERROR); | ||
| collector.emit(StatusStreamName, tuple, v); | ||
| } | ||
|
|
||
| if (isSitemapIndex) { | ||
| ol.getMetadata().setValue(isSitemapKey, "true"); | ||
| if (isSitemapVerified) { | ||
|
|
@@ -278,6 +333,70 @@ public void execute(Tuple tuple) { | |
| collector.ack(tuple); | ||
| } | ||
|
|
||
| public String getHost(URI url) { | ||
| if (this.crossSubmitLenient) { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 Nice. That's a good, lean solution. |
||
| /// www.example.com-> "example.com" | ||
| /// blog.subdomain.example.co.uk -> "example.co.uk" | ||
| /// www.myapp.github.io -> "myapp.github.io" (excludePrivate is false) | ||
| return EffectiveTldFinder.getAssignedDomain(url.getHost(), true,false); | ||
| } | ||
| return url.getHost(); | ||
| } | ||
|
|
||
| /** | ||
| * Checks whether a sitemap URL is allowed to submit URLs for another host. | ||
| * If the sitemap and target URLs are on the same host, submission is allowed. | ||
| * For cross-host submissions, checks robots.txt rules of the target host. | ||
| * | ||
| * @param ol The outlink containing the target URL to check | ||
| * @param sitemap The URL of the sitemap | ||
| * @param metadata | ||
| * @return true if submission is allowed, false otherwise | ||
| * @throws MalformedURLException if URLs are malformed | ||
| */ | ||
| public boolean crossSubmitCheck(Outlink ol, String sitemap, Metadata metadata) throws URISyntaxException, MalformedURLException { | ||
| URI targetURL = new URI(ol.getTargetURL()); | ||
| URI sitemapURL = new URI(sitemap); | ||
|
|
||
| String targetHost = this.getHost(targetURL); | ||
| String sitemapHost = this.getHost(sitemapURL); | ||
| // Same host - allow | ||
| if (targetHost.equals(sitemapHost)) { | ||
| return true; | ||
| } | ||
|
|
||
| // Cross-host checks | ||
| Metadata targetMetadata = metadataTransfer.getMetaForOutlink(targetURL.toString(), sitemapURL.toString(), metadata); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My hint about the MetadataTransfer wasn't about using it for link checking, but to forward the robots.txt URL to the record of a sitemap URL. This can be done by setting the configuration property Two caveats:
|
||
| String[] urlPaths = targetMetadata.getValues("url.path"); | ||
|
|
||
| // Check url.path metadata first | ||
| if (urlPaths != null) { | ||
| for (String path : urlPaths) { | ||
| if (this.getHost(new URI(path)).equals(targetHost)) { | ||
| return true; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Check robots.txt rules | ||
| Protocol protocol = protocolFactory.getProtocol(targetURL.toURL()); | ||
| BaseRobotRules rules = protocol.getRobotRules(targetURL.toString()); | ||
| if (rules != null) { | ||
| if (rules.getSitemaps().contains(sitemapURL.toString())) { | ||
| return true; | ||
| } | ||
| if (urlPaths != null) { | ||
| for (String path : urlPaths) { | ||
| if (rules.getSitemaps().contains(path)) { | ||
| return true; | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return false; | ||
| } | ||
|
|
||
| public SitemapType detectContent(String url, byte[] content) { | ||
| // try to detect content based on the first n bytes | ||
| // works for XML and non-compressed documents | ||
|
|
@@ -482,11 +601,15 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content, | |
| public void prepare(Map stormConf, TopologyContext context, | ||
| OutputCollector collector) { | ||
| super.prepare(stormConf, context, collector); | ||
| Config conf = new Config(); | ||
| conf.putAll(stormConf); | ||
| metadataTransfer = MetadataTransfer.getInstance(stormConf); | ||
| sniffContent = ConfUtils.getBoolean(stormConf, | ||
| "sitemap.sniffContent", false); | ||
| filterHoursSinceModified = ConfUtils.getInt(stormConf, | ||
| "sitemap.filter.hours.since.modified", -1); | ||
| parseFilters = ParseFilters.fromConf(stormConf); | ||
| protocolFactory = ProtocolFactory.getInstance(conf); | ||
| int maxOffsetGuess = ConfUtils.getInt(stormConf, "sitemap.offset.guess", | ||
| 1024); | ||
| contentDetector = new ContentDetector( | ||
|
|
@@ -498,6 +621,10 @@ public void prepare(Map stormConf, TopologyContext context, | |
| new ReducedMetric(new MeanReducer()), 30); | ||
| scheduleSitemapsWithDelay = ConfUtils.getInt(stormConf, | ||
| "sitemap.schedule.delay", scheduleSitemapsWithDelay); | ||
| crossSubmitAllowed = ConfUtils.getBoolean(stormConf, | ||
| "crossSubmit.allowed", crossSubmitAllowed); | ||
| crossSubmitLenient = ConfUtils.getBoolean(stormConf, | ||
| "crossSubmit.lenient", crossSubmitLenient);; | ||
| } | ||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,12 @@ | ||
| package org.commoncrawl.stormcrawler.utils; | ||
|
|
||
| import org.apache.http.conn.util.PublicSuffixMatcher; | ||
| import org.apache.http.conn.util.PublicSuffixMatcherLoader; | ||
|
|
||
| public class DomainChecker { | ||
|
|
||
| public static String getPayLevelDomain(String domain) { | ||
| PublicSuffixMatcher matcher = PublicSuffixMatcherLoader.getDefault(); | ||
| return matcher.getDomainRoot(domain); // Returns PLD (registered domain) | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,19 +13,22 @@ | |
| */ | ||
| package org.commoncrawl.stormcrawler.news; | ||
|
|
||
| import static org.junit.Assert.assertEquals; | ||
| import static org.junit.Assert.assertNotEquals; | ||
| import static org.junit.Assert.*; | ||
| import static org.mockito.Mockito.*; | ||
|
|
||
| import java.io.ByteArrayOutputStream; | ||
| import java.io.IOException; | ||
| import java.net.URISyntaxException; | ||
| import java.net.URL; | ||
| import java.nio.charset.StandardCharsets; | ||
| import java.time.LocalDateTime; | ||
| import java.time.format.DateTimeFormatter; | ||
| import java.util.ArrayList; | ||
| import java.util.HashMap; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.*; | ||
|
|
||
| import com.digitalpebble.stormcrawler.protocol.Protocol; | ||
| import com.digitalpebble.stormcrawler.protocol.ProtocolFactory; | ||
| import com.digitalpebble.stormcrawler.util.MetadataTransfer; | ||
| import crawlercommons.robots.BaseRobotRules; | ||
| import org.apache.commons.io.IOUtils; | ||
| import org.commoncrawl.stormcrawler.news.NewsSiteMapParserBolt.SitemapType; | ||
| import org.junit.Before; | ||
|
|
@@ -46,6 +49,7 @@ public void setupParserBolt() { | |
| config.put("sitemap.sniffContent", true); | ||
| // allow items published during the last week | ||
| config.put("sitemap.filter.hours.since.modified", 168); | ||
| config.put("http.agent.name", " Mozilla/5.0 (compatible; NewsBot/1.0; +http://example.com/bot)"); | ||
| prepareParserBolt("test.parsefilters.json", config); | ||
| } | ||
|
|
||
|
|
@@ -95,5 +99,125 @@ public void testFeedWithSitemapNamespace() throws IOException, UnknownFormatExce | |
| assertNotEquals("RSS feed with sitemap namespace should not be detected as sitemap", | ||
| SitemapType.SITEMAP, type); | ||
| } | ||
|
|
||
| @Test | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 Good to have a unit test! |
||
| public void testCrossHostSitemapVerification() throws IOException, UnknownFormatException, URISyntaxException { | ||
| String sitemapURL = "https://www.example.org/sitemap-news.xml"; | ||
| String articleURL = "http://www.example.org/business/article55.html"; | ||
| String adSitemapURL = "https://www.example.net/sitemap-ads.xml"; | ||
|
|
||
| // Mock RobotRules and its dependencies | ||
| ProtocolFactory mockProtocolFactory = mock(ProtocolFactory.class); | ||
| Protocol mockProtocol = mock(Protocol.class); | ||
| when(mockProtocolFactory.getProtocol(any(URL.class))).thenReturn(mockProtocol); | ||
|
|
||
| BaseRobotRules mockRules = mock(BaseRobotRules.class); | ||
| when(mockProtocol.getRobotRules(articleURL)).thenReturn(mockRules); | ||
| when(mockRules.getSitemaps()).thenReturn(Collections.singletonList(sitemapURL)); | ||
|
|
||
| BaseRobotRules mockRules1 = mock(BaseRobotRules.class); | ||
| when(mockProtocol.getRobotRules(any(String.class))).thenReturn(mockRules1); | ||
| when(mockRules.getSitemaps()).thenReturn(Collections.singletonList(adSitemapURL)); | ||
| // Set up test data | ||
| byte[] content = readContent("cross-sitemap-news.xml"); | ||
| String contentType = ""; | ||
| Metadata parentMetadata = new Metadata(); | ||
| List<Outlink> links = new ArrayList<>(); | ||
|
|
||
| // Set recent publication date and cross-host URL | ||
| LocalDateTime yesterday = LocalDateTime.now().minusDays(1); | ||
| content = (new String(content, StandardCharsets.UTF_8)) | ||
| .replace("<news:publication_date>2008-12-23</news:publication_date>", | ||
| "<news:publication_date>" + yesterday.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + "</news:publication_date>") | ||
| .getBytes(StandardCharsets.UTF_8); | ||
|
|
||
| // Inject mocked protocol factory | ||
| ((NewsSiteMapParserBolt) bolt).setProtocolFactory(mockProtocolFactory); | ||
|
|
||
| ((NewsSiteMapParserBolt) bolt).parseSiteMap(sitemapURL, content, contentType, parentMetadata, links); | ||
| // Verify the cross-host link is allowed and included | ||
| assertEquals(3, links.size()); | ||
| assertTrue(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(0), sitemapURL, parentMetadata)); | ||
| assertFalse(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(1), sitemapURL, parentMetadata)); | ||
| } | ||
|
|
||
| /** | ||
| * Tests cross-host sitemap submissions with the following structure: | ||
| * | ||
| * www.example.org/sitemap-news.xml | ||
| * └── www.example.com/sports/news1.html | ||
| * └── www.example.org/business/article55.html | ||
| * └── www.example.net/ads/sponsored-content.html | ||
| * | ||
| * www.example.org/robots.txt | ||
| * └── www.example.org/sitemap-index.xml | ||
| * └── www.example.org/sitemap-news.xml | ||
| * | ||
| * www.example.com/robots.txt | ||
| * └── www.example.org/sitemap-index.xml (shared with www.example.org) | ||
| * | ||
| * www.example.net/robots.txt | ||
| * └── www.example.net/sitemap.xml | ||
| * └── www.example.net/ads/sponsored-content.html | ||
| * | ||
| * URLs from example.org and example.com pass crossSubmitCheck since their robots.txt | ||
| * reference the same sitemap index which contains the sitemap from which the link is fetched. | ||
| * URLs from example.net fail since their robots reference a different sitemap index. | ||
| */ | ||
| @Test | ||
| public void test_cross_host_submission_sitemaps() throws IOException, UnknownFormatException, URISyntaxException { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not a canonical Java method name. |
||
| String sitemapURL = "https://www.example.org/sitemap-news.xml"; | ||
| String sitemapIndexURL = "https://www.example.org/sitemap-index.xml"; | ||
| String adSitemapURL = "https://www.example.net/sitemap-ads.xml"; | ||
|
|
||
|
|
||
| // Mock RobotRules and its dependencies | ||
| ProtocolFactory mockProtocolFactory = mock(ProtocolFactory.class); | ||
| Protocol mockProtocol = mock(Protocol.class); | ||
| when(mockProtocolFactory.getProtocol(any(URL.class))).thenReturn(mockProtocol); | ||
|
|
||
| BaseRobotRules mockRules = mock(BaseRobotRules.class); | ||
| when(mockProtocol.getRobotRules("http://www.example.org/business/article55.html")).thenReturn(mockRules); | ||
| when(mockRules.getSitemaps()).thenReturn(Collections.singletonList(sitemapIndexURL)); | ||
|
|
||
| BaseRobotRules mockRules1 = mock(BaseRobotRules.class); | ||
| when(mockProtocol.getRobotRules("http://www.example.com/sports/news1.html")).thenReturn(mockRules1); | ||
| when(mockRules1.getSitemaps()).thenReturn(Collections.singletonList(sitemapIndexURL)); | ||
|
|
||
| BaseRobotRules mockRules2 = mock(BaseRobotRules.class); | ||
| when(mockProtocol.getRobotRules("http://www.example.net/ads/sponsored-content.html")).thenReturn(mockRules2); | ||
| when(mockRules2.getSitemaps()).thenReturn(Collections.singletonList(adSitemapURL)); | ||
|
|
||
| // Mocking MetadataTransfer to return specific url.path metadata | ||
| MetadataTransfer metadataTransferMock = mock(MetadataTransfer.class); | ||
| Metadata targetMetadata = new Metadata(); | ||
| targetMetadata.addValues("url.path", Arrays.asList(sitemapIndexURL, sitemapURL)); | ||
| when(metadataTransferMock.getMetaForOutlink(anyString(), anyString(), any(Metadata.class))) | ||
| .thenReturn(targetMetadata); | ||
|
|
||
| // Injecting the mock into the bolt | ||
| ((NewsSiteMapParserBolt) bolt).setMetadataTransfer(metadataTransferMock); | ||
|
|
||
| // Set up test data | ||
| byte[] content = readContent("cross-sitemap-news.xml"); | ||
| String contentType = ""; | ||
| Metadata parentMetadata = new Metadata(); | ||
| List<Outlink> links = new ArrayList<>(); | ||
|
|
||
| // Set recent publication date and cross-host URL | ||
| LocalDateTime yesterday = LocalDateTime.now().minusDays(1); | ||
| content = (new String(content, StandardCharsets.UTF_8)) | ||
| .replace("<news:publication_date>2008-12-23</news:publication_date>", | ||
| "<news:publication_date>" + yesterday.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + "</news:publication_date>") | ||
| .getBytes(StandardCharsets.UTF_8); | ||
|
|
||
| // Inject mocked protocol factory | ||
| ((NewsSiteMapParserBolt) bolt).setProtocolFactory(mockProtocolFactory); | ||
|
|
||
| ((NewsSiteMapParserBolt) bolt).parseSiteMap(sitemapURL, content, contentType, parentMetadata, links); | ||
| // Verify the cross-host link is allowed and included | ||
| assertEquals(3, links.size()); | ||
| assertTrue(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(0), sitemapURL, parentMetadata)); | ||
| assertFalse(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(1), sitemapURL, parentMetadata)); | ||
| assertTrue(((NewsSiteMapParserBolt) bolt).crossSubmitCheck(links.get(2), sitemapURL, parentMetadata)); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good question, whether URLs failing the cross-submit check should be emitted at all: