From c4519b85fb061865a60a83ef634f1169f89028d6 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 26 Mar 2026 21:14:23 +0100 Subject: [PATCH 1/8] Upgrade code to StormCrawler 3.5.1 Upgrade dependencies and Maven plugins to latest available versions Migrate from Elasticsearch to OpenSearch --- bin/{ES_IndexInit.sh => OS_IndexInit.sh} | 16 ++- conf/crawler-conf.yaml | 12 +- conf/crawler.flux | 24 ++-- conf/es-conf.yaml | 2 +- pom.xml | 107 +++++++----------- .../stormcrawler/filter/FastURLFilter.java | 12 +- .../stormcrawler/news/CrawlTopology.java | 35 +++--- .../stormcrawler/news/FeedDetectorBolt.java | 18 +-- .../news/NewsSiteMapParserBolt.java | 29 +++-- .../stormcrawler/news/PreFilterBolt.java | 11 +- .../news/PunycodeURLNormalizer.java | 4 +- .../news/bootstrap/BootstrapTopology.java | 28 ++--- .../news/bootstrap/FeedLinkParseFilter.java | 10 +- .../bootstrap/NewsSiteMapDetectorBolt.java | 16 +-- .../resources/bootstrap-parsefilters.json | 6 +- src/main/resources/bootstrap-urlfilters.json | 12 +- src/main/resources/inject-urlfilters.json | 14 +-- src/main/resources/parsefilters.json | 4 +- src/main/resources/pre-urlfilters.json | 2 +- src/main/resources/urlfilters.json | 14 +-- .../stormcrawler/FastURLFilterTest.java | 4 +- .../news/NewsSiteMapParserTest.java | 6 +- 22 files changed, 188 insertions(+), 198 deletions(-) rename bin/{ES_IndexInit.sh => OS_IndexInit.sh} (76%) diff --git a/bin/ES_IndexInit.sh b/bin/OS_IndexInit.sh similarity index 76% rename from bin/ES_IndexInit.sh rename to bin/OS_IndexInit.sh index 394faf7..60e72ec 100755 --- a/bin/ES_IndexInit.sh +++ b/bin/OS_IndexInit.sh @@ -1,5 +1,17 @@ -# modified version of -# https://github.com/DigitalPebble/storm-crawler/blob/master/external/elasticsearch/ES_IndexInit.sh +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. ESHOST="http://localhost:9200" #ESCREDENTIALS="-u elastic:passwordhere" diff --git a/conf/crawler-conf.yaml b/conf/crawler-conf.yaml index 00032e1..ef0f478 100644 --- a/conf/crawler-conf.yaml +++ b/conf/crawler-conf.yaml @@ -19,8 +19,8 @@ config: # mandatory when using Flux topology.kryo.register: - - com.digitalpebble.stormcrawler.Metadata - - com.digitalpebble.stormcrawler.persistence.Status + - org.apache.stormcrawler.Metadata + - org.apache.stormcrawler.persistence.Status topology.backpressure.enable: false @@ -31,7 +31,7 @@ config: topology.metrics.consumer.register: - class: "org.apache.storm.metric.LoggingMetricsConsumer" parallelism.hint: 1 - - class: "com.digitalpebble.stormcrawler.elasticsearch.metrics.MetricsConsumer" + - class: "org.apache.stormcrawler.elasticsearch.metrics.MetricsConsumer" parallelism.hint: 1 # status index and fetcher queues are partitioned by domain @@ -71,8 +71,8 @@ config: http.timeout: 30000 # use okhttp - http.protocol.implementation: com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol - https.protocol.implementation: com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol + http.protocol.implementation: org.apache.stormcrawler.protocol.okhttp.HttpProtocol + https.protocol.implementation: org.apache.stormcrawler.protocol.okhttp.HttpProtocol # do not fail on unknown SSL certificates http.trust.everything: true @@ -103,7 +103,7 @@ config: 
fetcher.max.urls.in.queues: 6000 # fetch Scheduler implementation - scheduler.class: "com.digitalpebble.stormcrawler.persistence.AdaptiveScheduler" + scheduler.class: "org.apache.stormcrawler.persistence.AdaptiveScheduler" # AdaptiveScheduler properties scheduler.adaptive.setLastModified: true # frequently changing feeds or news sitemaps are refetched after 90 min. diff --git a/conf/crawler.flux b/conf/crawler.flux index 5367184..73af7c7 100644 --- a/conf/crawler.flux +++ b/conf/crawler.flux @@ -21,7 +21,7 @@ config: components: - id: "WARCFileNameFormat" - className: "com.digitalpebble.stormcrawler.warc.WARCFileNameFormat" + className: "org.apache.stormcrawler.warc.WARCFileNameFormat" configMethods: - name: "withPath" args: @@ -30,7 +30,7 @@ components: args: - "CC-NEWS" - id: "WARCFileRotationPolicy" - className: "com.digitalpebble.stormcrawler.warc.FileTimeSizeRotationPolicy" + className: "org.apache.stormcrawler.warc.FileTimeSizeRotationPolicy" constructorArgs: - 1024 - MB @@ -77,10 +77,10 @@ components: spouts: - id: "spout" - className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.AggregationSpout" + className: "org.apache.stormcrawler.elasticsearch.persistence.AggregationSpout" parallelism: 16 - id: "filespout" - className: "com.digitalpebble.stormcrawler.spout.FileSpout" + className: "org.apache.stormcrawler.spout.FileSpout" parallelism: 1 constructorArgs: - "/path/to/seeds/" @@ -89,7 +89,7 @@ spouts: bolts: - id: "filter" - className: "com.digitalpebble.stormcrawler.bolt.URLFilterBolt" + className: "org.apache.stormcrawler.bolt.URLFilterBolt" parallelism: 1 - id: "prefilter" className: "org.commoncrawl.stormcrawler.news.PreFilterBolt" @@ -97,22 +97,22 @@ bolts: constructorArgs: - "pre-urlfilters.json" - id: "partitioner" - className: "com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt" + className: "org.apache.stormcrawler.bolt.URLPartitionerBolt" parallelism: 1 - id: "fetcher" - className: "com.digitalpebble.stormcrawler.bolt.FetcherBolt" 
+ className: "org.apache.stormcrawler.bolt.FetcherBolt" parallelism: 1 - id: "sitemap" className: "org.commoncrawl.stormcrawler.news.NewsSiteMapParserBolt" parallelism: 1 - id: "feed" - className: "com.digitalpebble.stormcrawler.bolt.FeedParserBolt" + className: "org.apache.stormcrawler.bolt.FeedParserBolt" parallelism: 1 - id: "ssbolt" - className: "com.digitalpebble.stormcrawler.indexing.DummyIndexer" + className: "org.apache.stormcrawler.indexing.DummyIndexer" parallelism: 1 - id: "warc" - className: "com.digitalpebble.stormcrawler.warc.WARCHdfsBolt" + className: "org.apache.stormcrawler.warc.WARCHdfsBolt" parallelism: 1 configMethods: - name: "withFileNameFormat" @@ -129,7 +129,7 @@ bolts: args: - "warc" - id: "status" - className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt" + className: "org.apache.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt" parallelism: 1 streams: @@ -219,7 +219,7 @@ streams: streamId: "status" type: CUSTOM customClass: - className: "com.digitalpebble.stormcrawler.util.URLStreamGrouping" + className: "org.apache.stormcrawler.util.URLStreamGrouping" constructorArgs: - "byDomain" diff --git a/conf/es-conf.yaml b/conf/es-conf.yaml index 9b02a56..7ea8b8b 100644 --- a/conf/es-conf.yaml +++ b/conf/es-conf.yaml @@ -74,7 +74,7 @@ config: es.status.recentDate.min.gap: -1 topology.metrics.consumer.register: - - class: "com.digitalpebble.stormcrawler.elasticsearch.metrics.MetricsConsumer" + - class: "org.apache.stormcrawler.elasticsearch.metrics.MetricsConsumer" parallelism.hint: 1 #whitelist: # - "fetcher_counter" diff --git a/pom.xml b/pom.xml index db70413..c470e83 100644 --- a/pom.xml +++ b/pom.xml @@ -1,10 +1,32 @@ - + + + + 4.0.0 org.commoncrawl.stormcrawler.news crawler - 2.10.0 + 3.5.1 jar @@ -17,14 +39,14 @@ UTF-8 - 2.10 - 2.5.0 - 1.12.467 + 3.5.1 + 2.8.4 + 1.12.797 2.11.1 - 1.1 - 5.5.0 - 2.26.3 - 4.13 + 1.6 + 5.23.0 + 3.0.1 + 4.13.2 @@ -32,16 +54,16 @@ org.apache.maven.plugins 
maven-compiler-plugin - 3.11.0 + 3.15.0 - 11 - 11 + 17 + 17 org.codehaus.mojo exec-maven-plugin - 3.1.0 + 3.6.3 @@ -59,7 +81,7 @@ org.apache.maven.plugins maven-shade-plugin - 3.5.0 + 3.6.2 package @@ -105,29 +127,13 @@ - - - org.owasp - dependency-check-maven - 6.1.0 - - true - - - - - aggregate - - - - - com.digitalpebble.stormcrawler - storm-crawler-core + org.apache.stormcrawler + stormcrawler-core ${stormcrawler.version} @@ -144,30 +150,15 @@ - com.digitalpebble.stormcrawler - storm-crawler-elasticsearch + org.apache.stormcrawler + stormcrawler-opensearch ${stormcrawler.version} - com.digitalpebble.stormcrawler - storm-crawler-warc + org.apache.stormcrawler + stormcrawler-warc ${stormcrawler.version} - - - - jdk.tools - jdk.tools - - - - - - - com.fasterxml.jackson.core - jackson-databind - ${jackson-databind.version} - com.github.crawler-commons crawler-commons @@ -182,8 +173,8 @@ - com.digitalpebble.stormcrawler - storm-crawler-core + org.apache.stormcrawler + stormcrawler-core ${stormcrawler.version} test-jar test @@ -210,14 +201,4 @@ test - - - - - commons-io - commons-io - 2.11.0 - - - diff --git a/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java b/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java index 6478ec1..7c52716 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java +++ b/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java @@ -30,7 +30,11 @@ import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.stormcrawler.JSONResource; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.filtering.URLFilter; +import org.apache.stormcrawler.util.ConfUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -39,10 +43,6 @@ import com.amazonaws.services.s3.model.GetObjectRequest; import 
com.amazonaws.services.s3.model.ObjectMetadata; import com.amazonaws.services.s3.model.S3Object; -import com.digitalpebble.stormcrawler.JSONResource; -import com.digitalpebble.stormcrawler.Metadata; -import com.digitalpebble.stormcrawler.filtering.URLFilter; -import com.digitalpebble.stormcrawler.util.ConfUtils; import com.fasterxml.jackson.core.JsonParseException; import com.fasterxml.jackson.databind.JsonMappingException; import com.fasterxml.jackson.databind.JsonNode; @@ -419,4 +419,4 @@ public boolean match(URL url) { return pattern.matcher(haystack).find(); } } -} \ No newline at end of file +} diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java index ce21c7a..54c69eb 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java @@ -24,26 +24,25 @@ import org.apache.storm.topology.BoltDeclarer; import org.apache.storm.topology.TopologyBuilder; import org.apache.storm.tuple.Fields; +import org.apache.stormcrawler.ConfigurableTopology; +import org.apache.stormcrawler.Constants; +import org.apache.stormcrawler.bolt.FeedParserBolt; +import org.apache.stormcrawler.bolt.FetcherBolt; +import org.apache.stormcrawler.bolt.URLFilterBolt; +import org.apache.stormcrawler.bolt.URLPartitionerBolt; +import org.apache.stormcrawler.indexing.DummyIndexer; +import org.apache.stormcrawler.opensearch.persistence.AggregationSpout; +import org.apache.stormcrawler.opensearch.persistence.StatusUpdaterBolt; +import org.apache.stormcrawler.protocol.AbstractHttpProtocol; +import org.apache.stormcrawler.spout.FileSpout; +import org.apache.stormcrawler.util.ConfUtils; +import org.apache.stormcrawler.util.URLStreamGrouping; +import org.apache.stormcrawler.warc.FileTimeSizeRotationPolicy; +import org.apache.stormcrawler.warc.FileTimeSizeRotationPolicy.Units; +import 
org.apache.stormcrawler.warc.WARCFileNameFormat; +import org.apache.stormcrawler.warc.WARCHdfsBolt; import org.slf4j.LoggerFactory; -import com.digitalpebble.stormcrawler.ConfigurableTopology; -import com.digitalpebble.stormcrawler.Constants; -import com.digitalpebble.stormcrawler.bolt.FeedParserBolt; -import com.digitalpebble.stormcrawler.bolt.FetcherBolt; -import com.digitalpebble.stormcrawler.bolt.URLFilterBolt; -import com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt; -import com.digitalpebble.stormcrawler.elasticsearch.persistence.AggregationSpout; -import com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt; -import com.digitalpebble.stormcrawler.indexing.DummyIndexer; -import com.digitalpebble.stormcrawler.protocol.AbstractHttpProtocol; -import com.digitalpebble.stormcrawler.spout.FileSpout; -import com.digitalpebble.stormcrawler.util.ConfUtils; -import com.digitalpebble.stormcrawler.util.URLStreamGrouping; -import com.digitalpebble.stormcrawler.warc.FileTimeSizeRotationPolicy; -import com.digitalpebble.stormcrawler.warc.FileTimeSizeRotationPolicy.Units; -import com.digitalpebble.stormcrawler.warc.WARCFileNameFormat; -import com.digitalpebble.stormcrawler.warc.WARCHdfsBolt; - /** * Dummy topology to play with the spouts and bolts on ElasticSearch */ diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java index bdf5f58..c365525 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java @@ -21,15 +21,15 @@ import org.apache.storm.tuple.Values; import org.slf4j.LoggerFactory; -import com.digitalpebble.stormcrawler.Constants; -import com.digitalpebble.stormcrawler.Metadata; -import com.digitalpebble.stormcrawler.bolt.FeedParserBolt; -import com.digitalpebble.stormcrawler.parse.ParseData; -import 
com.digitalpebble.stormcrawler.parse.ParseFilter; -import com.digitalpebble.stormcrawler.parse.ParseFilters; -import com.digitalpebble.stormcrawler.parse.ParseResult; -import com.digitalpebble.stormcrawler.persistence.Status; -import com.digitalpebble.stormcrawler.protocol.HttpHeaders; +import org.apache.stormcrawler.Constants; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.bolt.FeedParserBolt; +import org.apache.stormcrawler.parse.ParseData; +import org.apache.stormcrawler.parse.ParseFilter; +import org.apache.stormcrawler.parse.ParseFilters; +import org.apache.stormcrawler.parse.ParseResult; +import org.apache.stormcrawler.persistence.Status; +import org.apache.http.HttpHeaders; /** Detect RSS and Atom feeds, but do not parse and extract links */ @SuppressWarnings("serial") diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java index 42f4de3..3c4cf55 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java @@ -13,7 +13,7 @@ */ package org.commoncrawl.stormcrawler.news; -import static com.digitalpebble.stormcrawler.Constants.StatusStreamName; +import static org.apache.stormcrawler.Constants.StatusStreamName; import java.io.IOException; import java.net.URL; @@ -25,28 +25,27 @@ import java.util.List; import java.util.Map; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.http.HttpHeaders; import org.apache.storm.metric.api.MeanReducer; import org.apache.storm.metric.api.ReducedMetric; import org.apache.storm.task.OutputCollector; import org.apache.storm.task.TopologyContext; import org.apache.storm.tuple.Tuple; import org.apache.storm.tuple.Values; +import org.apache.stormcrawler.Constants; +import org.apache.stormcrawler.Metadata; +import 
org.apache.stormcrawler.bolt.SiteMapParserBolt; +import org.apache.stormcrawler.parse.Outlink; +import org.apache.stormcrawler.parse.ParseData; +import org.apache.stormcrawler.parse.ParseFilter; +import org.apache.stormcrawler.parse.ParseFilters; +import org.apache.stormcrawler.parse.ParseResult; +import org.apache.stormcrawler.persistence.DefaultScheduler; +import org.apache.stormcrawler.persistence.Status; +import org.apache.stormcrawler.util.ConfUtils; import org.slf4j.LoggerFactory; -import com.digitalpebble.stormcrawler.Constants; -import com.digitalpebble.stormcrawler.Metadata; -import com.digitalpebble.stormcrawler.bolt.SiteMapParserBolt; -import com.digitalpebble.stormcrawler.parse.Outlink; -import com.digitalpebble.stormcrawler.parse.ParseData; -import com.digitalpebble.stormcrawler.parse.ParseFilter; -import com.digitalpebble.stormcrawler.parse.ParseFilters; -import com.digitalpebble.stormcrawler.parse.ParseResult; -import com.digitalpebble.stormcrawler.persistence.DefaultScheduler; -import com.digitalpebble.stormcrawler.persistence.Status; -import com.digitalpebble.stormcrawler.protocol.HttpHeaders; -import com.digitalpebble.stormcrawler.util.ConfUtils; - import crawlercommons.sitemaps.AbstractSiteMap; import crawlercommons.sitemaps.Namespace; import crawlercommons.sitemaps.SiteMap; diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java index 5a880da..b986506 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java @@ -4,7 +4,7 @@ import java.lang.invoke.MethodHandles; import java.util.Map; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.storm.task.OutputCollector; import org.apache.storm.task.TopologyContext; import org.apache.storm.topology.OutputFieldsDeclarer; @@ -12,13 +12,12 @@ import 
org.apache.storm.tuple.Fields; import org.apache.storm.tuple.Tuple; import org.apache.storm.tuple.Values; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.filtering.URLFilters; +import org.apache.stormcrawler.persistence.Status; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.digitalpebble.stormcrawler.Metadata; -import com.digitalpebble.stormcrawler.filtering.URLFilters; -import com.digitalpebble.stormcrawler.persistence.Status; - /** * Variant of the URLFilterBolt to go upstream of the fetching to catch anything * before it goes further into the topology. If filtered, a URL gets an ERROR @@ -34,7 +33,7 @@ public class PreFilterBolt extends BaseRichBolt { private final String filterConfigFile; - private static final String _s = com.digitalpebble.stormcrawler.Constants.StatusStreamName; + private static final String _s = org.apache.stormcrawler.Constants.StatusStreamName; public PreFilterBolt(String filterConfigFile) { this.filterConfigFile = filterConfigFile; diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java b/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java index a28e2e9..4adf03d 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java @@ -18,8 +18,8 @@ import java.net.URL; import java.util.Map; -import com.digitalpebble.stormcrawler.Metadata; -import com.digitalpebble.stormcrawler.filtering.URLFilter; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.filtering.URLFilter; import com.fasterxml.jackson.databind.JsonNode; public class PunycodeURLNormalizer extends URLFilter { diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java index 51aa116..821551e 100644 --- 
a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java @@ -23,19 +23,19 @@ import org.commoncrawl.stormcrawler.news.FeedDetectorBolt; import org.slf4j.LoggerFactory; -import com.digitalpebble.stormcrawler.ConfigurableTopology; -import com.digitalpebble.stormcrawler.Constants; -import com.digitalpebble.stormcrawler.bolt.FetcherBolt; -import com.digitalpebble.stormcrawler.bolt.JSoupParserBolt; -import com.digitalpebble.stormcrawler.bolt.URLFilterBolt; -import com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt; -import com.digitalpebble.stormcrawler.elasticsearch.persistence.AggregationSpout; -import com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt; -import com.digitalpebble.stormcrawler.indexing.DummyIndexer; -import com.digitalpebble.stormcrawler.spout.FileSpout; -import com.digitalpebble.stormcrawler.util.ConfUtils; -import com.digitalpebble.stormcrawler.util.URLStreamGrouping; -import com.digitalpebble.stormcrawler.warc.WARCHdfsBolt; +import org.apache.stormcrawler.ConfigurableTopology; +import org.apache.stormcrawler.Constants; +import org.apache.stormcrawler.bolt.FetcherBolt; +import org.apache.stormcrawler.bolt.JSoupParserBolt; +import org.apache.stormcrawler.bolt.URLFilterBolt; +import org.apache.stormcrawler.bolt.URLPartitionerBolt; +import org.apache.stormcrawler.opensearch.persistence.AggregationSpout; +import org.apache.stormcrawler.opensearch.persistence.StatusUpdaterBolt; +import org.apache.stormcrawler.indexing.DummyIndexer; +import org.apache.stormcrawler.spout.FileSpout; +import org.apache.stormcrawler.util.ConfUtils; +import org.apache.stormcrawler.util.URLStreamGrouping; +import org.apache.stormcrawler.warc.WARCHdfsBolt; /** * Dummy topology to play with the spouts and bolts on ElasticSearch @@ -114,4 +114,4 @@ protected int run(String[] args) { return submit(conf, builder); } -} \ No newline at end of file 
+} diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java index bc36e64..5707189 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java @@ -18,10 +18,10 @@ import org.slf4j.LoggerFactory; import org.w3c.dom.DocumentFragment; -import com.digitalpebble.stormcrawler.bolt.FeedParserBolt; -import com.digitalpebble.stormcrawler.parse.Outlink; -import com.digitalpebble.stormcrawler.parse.ParseResult; -import com.digitalpebble.stormcrawler.parse.filter.LinkParseFilter; +import org.apache.stormcrawler.bolt.FeedParserBolt; +import org.apache.stormcrawler.parse.Outlink; +import org.apache.stormcrawler.parse.ParseResult; +import org.apache.stormcrawler.parse.filter.LinkParseFilter; /** * ParseFilter which extracts exclusively RSS links via Xpath, all other links @@ -67,4 +67,4 @@ public static void logLinks(ParseResult parse, String URL, String message) { } } -} \ No newline at end of file +} diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/NewsSiteMapDetectorBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/NewsSiteMapDetectorBolt.java index 2b773be..1160201 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/NewsSiteMapDetectorBolt.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/NewsSiteMapDetectorBolt.java @@ -23,14 +23,14 @@ import org.commoncrawl.stormcrawler.news.NewsSiteMapParserBolt; import org.slf4j.LoggerFactory; -import com.digitalpebble.stormcrawler.Constants; -import com.digitalpebble.stormcrawler.Metadata; -import com.digitalpebble.stormcrawler.bolt.SiteMapParserBolt; -import com.digitalpebble.stormcrawler.parse.ParseData; -import com.digitalpebble.stormcrawler.parse.ParseFilter; -import 
com.digitalpebble.stormcrawler.parse.ParseFilters; -import com.digitalpebble.stormcrawler.parse.ParseResult; -import com.digitalpebble.stormcrawler.persistence.Status; +import org.apache.stormcrawler.Constants; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.bolt.SiteMapParserBolt; +import org.apache.stormcrawler.parse.ParseData; +import org.apache.stormcrawler.parse.ParseFilter; +import org.apache.stormcrawler.parse.ParseFilters; +import org.apache.stormcrawler.parse.ParseResult; +import org.apache.stormcrawler.persistence.Status; /** * Detector for Date: Thu, 26 Mar 2026 21:34:02 +0100 Subject: [PATCH 2/8] Sync GitHub CI build workflow with upstream StormCrawler Prepare for renaming of development branch to "main". --- .github/workflows/maven.yml | 67 ++++++++++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 13 deletions(-) diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml index 10acad1..dc52ef3 100644 --- a/.github/workflows/maven.yml +++ b/.github/workflows/maven.yml @@ -1,26 +1,67 @@ -# This workflow will build a Java project with Maven, and cache/restore any dependencies to improve the workflow execution time -# For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. name: Java CI with Maven on: + # Run CI on pushes to "main" or on pull requests targeting "main". push: - branches: [ master ] + branches: + - main pull_request: - branches: [ master ] + branches: + - main jobs: - build: - + rat: runs-on: ubuntu-latest + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 + with: + path: ~/.m2/repository + key: rat-maven-${{ hashFiles('**/pom.xml') }} + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 + with: + distribution: adopt + java-version: 17 + - name: Build with Maven + run: mvn -B --no-transfer-progress -Prat -DskipTests verify -Dskip.format.code=false + build: + needs: rat + runs-on: ${{ matrix.os }} + continue-on-error: ${{ matrix.experimental }} + strategy: + matrix: + os: [ubuntu-latest] + java: [ 17 ] + experimental: [false] steps: - - uses: actions/checkout@v2 - - name: Set up JDK 8 - uses: actions/setup-java@v2 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven- + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: - java-version: '8' - distribution: 'adopt' - cache: maven + distribution: adopt + java-version: ${{ matrix.java }} - name: 
Build with Maven - run: mvn -B package --file pom.xml + run: mvn -B --no-transfer-progress package --file pom.xml -DCI_ENV=true verify From 6bd1e31c8cc1139823611949857e6c60701f1b8c Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 31 Mar 2026 16:44:17 +0200 Subject: [PATCH 3/8] Replace Dockerfile by docker compose config Make the configuration work. --- Dockerfile | 134 +++-------------------- README.md | 102 +++++++++-------- bin/OS_IndexInit.sh | 123 +++------------------ bin/dashboards/OS_ImportDashboards.sh | 29 +++++ bin/dashboards/metrics.ndjson | 10 ++ bin/dashboards/status.ndjson | 5 + bin/dashboards/storm.ndjson | 5 + bin/run-crawler.sh | 43 -------- bin/{es_status => status} | 0 conf/crawler-conf.yaml | 23 +++- conf/crawler.flux | 10 +- conf/es-conf.yaml | 83 -------------- conf/opensearch-conf.yaml | 126 +++++++++++++++++++++ docker-compose.yaml | 122 +++++++++++++++++++++ etc/supervisor/conf.d/elasticsearch.conf | 8 -- etc/supervisor/conf.d/kibana.conf | 7 -- etc/sysctl.d/60-elasticsearch.conf | 7 -- seeds/feeds.txt | 29 +---- src/main/resources/indexer.mapping | 40 +++++++ src/main/resources/metrics.mapping | 40 +++++++ src/main/resources/status.mapping | 39 +++++++ 21 files changed, 535 insertions(+), 450 deletions(-) create mode 100755 bin/dashboards/OS_ImportDashboards.sh create mode 100644 bin/dashboards/metrics.ndjson create mode 100644 bin/dashboards/status.ndjson create mode 100644 bin/dashboards/storm.ndjson delete mode 100755 bin/run-crawler.sh rename bin/{es_status => status} (100%) delete mode 100644 conf/es-conf.yaml create mode 100644 conf/opensearch-conf.yaml create mode 100644 docker-compose.yaml delete mode 100644 etc/supervisor/conf.d/elasticsearch.conf delete mode 100644 etc/supervisor/conf.d/kibana.conf delete mode 100644 etc/sysctl.d/60-elasticsearch.conf create mode 100644 src/main/resources/indexer.mapping create mode 100644 src/main/resources/metrics.mapping create mode 100644 src/main/resources/status.mapping diff --git 
a/Dockerfile b/Dockerfile index eaad382..683df8f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,133 +1,29 @@ -FROM ubuntu:22.04 +FROM storm:2.8.4 RUN apt-get update -qq && \ - apt-get upgrade -yq && \ -# apt-mark hold openjdk-11-jre-headless && \ apt-get install -yq --no-install-recommends \ - apt-transport-https \ - apt-utils \ - ca-certificates \ curl \ - git-core \ - gnupg \ jq \ less \ - maven \ -# openjdk-8-jdk-headless \ - sudo \ - supervisor \ - wget \ - tar \ - vim -# zookeeperd + vim # -# Elasticsearch and Kibana +# news-crawler # -ENV ES_VERSION=7.10.2 -RUN wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch \ - | apt-key add - -RUN echo "deb https://artifacts.elastic.co/packages/7.x/apt stable main" \ - >> /etc/apt/sources.list.d/elasticsearch-7.x.list -RUN apt-get update -qq && \ - apt-get install -yq --no-install-recommends \ - elasticsearch=$ES_VERSION \ - kibana=$ES_VERSION -RUN ln -s /usr/share/elasticsearch/bin/elasticsearch /usr/bin/elasticsearch -RUN ln -s /usr/share/kibana/bin/kibana /usr/bin/kibana -USER root -# system configuration, see https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html -ADD etc/sysctl.d/60-elasticsearch.conf /etc/sysctl.d/60-elasticsearch.conf -ADD etc/supervisor/conf.d/elasticsearch.conf /etc/supervisor/conf.d/elasticsearch.conf -ADD etc/supervisor/conf.d/kibana.conf /etc/supervisor/conf.d/kibana.conf -RUN chmod -R 644 /etc/sysctl.d/60-elasticsearch.conf /etc/supervisor/conf.d/*.conf -ENV ES_HEAP_SIZE=20g -# set Elasticsearch data path -RUN sed -Ei 's@^path\.data: .*@path.data: /data/elasticsearch@' /etc/elasticsearch/elasticsearch.yml -# TODO: enable updates via scripting - - -# Zookeeper - -ENV ZOOKEEPER_VERSION=3.8.3 -RUN wget -q -O - https://downloads.apache.org/zookeeper/zookeeper-$ZOOKEEPER_VERSION/apache-zookeeper-$ZOOKEEPER_VERSION-bin.tar.gz \ - | sudo tar -xzf - -C /opt -ENV ZOOKEEPER_HOME=/opt/apache-zookeeper-$ZOOKEEPER_VERSION-bin -RUN ln -s $ZOOKEEPER_HOME/conf/zoo_sample.cfg 
$ZOOKEEPER_HOME/conf/zoo.cfg -# prevent ZK's admin UI to run on 8080 -RUN echo "admin.enableServer=false" >> $ZOOKEEPER_HOME/conf/zoo.cfg -RUN ln -s $ZOOKEEPER_HOME /usr/share/zookeeper - -# -# Apache Storm -# -ENV STORM_VERSION=2.5.0 -COPY downloads/apache-storm-$STORM_VERSION.tar.gz /tmp/apache-storm-$STORM_VERSION.tar.gz -RUN tar -xzf /tmp/apache-storm-$STORM_VERSION.tar.gz -C /opt -RUN rm /tmp/apache-storm-$STORM_VERSION.tar.gz -ENV STORM_HOME /opt/apache-storm-$STORM_VERSION -RUN groupadd storm && \ - useradd --gid storm --home-dir /home/storm \ - --create-home --shell /bin/bash storm && \ - chown -R storm:storm $STORM_HOME && \ - mkdir /var/log/storm && \ - chown -R storm:storm /var/log/storm -RUN ln -s /var/log/storm $STORM_HOME/logs -RUN ln -s $STORM_HOME/bin/storm /usr/bin/storm - -ADD etc/supervisor/conf.d/storm-*.conf /etc/supervisor/conf.d/ -ADD etc/supervisor/conf.d/zookeeper.conf /etc/supervisor/conf.d/ -RUN chmod -R 644 /etc/supervisor/conf.d/*.conf +ENV CRAWLER_VERSION=3.5.1 - -# -# Storm crawler / news crawler -# -ENV CRAWLER_VERSION=2.10.0 -RUN groupadd ubuntu && \ - useradd --gid ubuntu --home-dir /home/ubuntu \ - --create-home --shell /bin/bash ubuntu && \ - chown -R ubuntu:ubuntu /home/ubuntu -USER ubuntu -WORKDIR /home/ubuntu -RUN mkdir news-crawler/ && \ - mkdir news-crawler/conf/ && \ - mkdir news-crawler/lib/ && \ - mkdir news-crawler/bin/ && \ - mkdir news-crawler/seeds/ && \ - chmod -R a+rx news-crawler/ +RUN mkdir /news-crawler/ && \ + mkdir /news-crawler/conf/ && \ + mkdir /news-crawler/lib/ && \ + mkdir /news-crawler/bin/ && \ + chmod -R a+rx /news-crawler/ # add the news crawler uber-jar -ADD target/crawler-$CRAWLER_VERSION.jar news-crawler/lib/crawler.jar +ADD target/crawler-$CRAWLER_VERSION.jar /news-crawler/lib/crawler.jar # and configuration files -ADD conf/*.* news-crawler/conf/ -ADD seeds/*.txt news-crawler/seeds/ -ADD bin/*.sh news-crawler/bin/ -ADD bin/es_status news-crawler/bin/ - -USER root -RUN chown -R ubuntu:ubuntu 
/home/ubuntu && \ - chmod -R a+r /home/ubuntu && \ - chmod u+x news-crawler/bin/* - - -# Ports: -# 8080 - Storm UI -# 9200 - Elasticsearch http -# 9300 - Elasticsearch java -# 5601 - Kibana -EXPOSE 8080 9200 9300 5601 - -# volumes for persistent data -USER root -RUN mkdir /data -RUN mkdir /data/elasticsearch && chown elasticsearch:elasticsearch /data/elasticsearch -VOLUME ["/data/elasticsearch"] -RUN mkdir /data/warc && chown storm:storm /data/warc -VOLUME ["/data/warc"] - -# start all services -CMD ["/usr/bin/supervisord"] +ADD conf/*.* /news-crawler/conf/ +ADD bin/*.sh /news-crawler/bin/ +ADD bin/status /news-crawler/bin/ -# launch the crawl -# CMD ["/home/ubuntu/news-crawler/bin/run-crawler.sh"] +USER storm +WORKDIR /news-crawler/ diff --git a/README.md b/README.md index 58f0d4d..ad990bf 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,29 @@ -# NEWS-CRAWL +# News Crawler -Crawler for news based on [StormCrawler](https://stormcrawler.net/). Produces WARC files to be stored as part of the [Common Crawl](https://commoncrawl.org/). The data is hosted as [AWS Open Data Set](https://registry.opendata.aws/) – if you want to use the data and not the crawler software please read [the announcement of the news dataset](https://commoncrawl.org/2016/10/news-dataset-available/). +Crawler for news based on [StormCrawler](https://stormcrawler.apache.org/). Produces WARC files to be stored as part of the [Common Crawl](https://commoncrawl.org/). The data is hosted as [AWS Open Data Set](https://registry.opendata.aws/) – if you want to use the data and not the crawler software please read [the announcement of the news dataset](https://commoncrawl.org/2016/10/news-dataset-available/). -Prerequisites -------------- +## Prerequisites -* Install Elasticsearch 7.10.2 (ev. 
also Kibana) -* Install Apache Storm 2.5.0 -* Start Elasticsearch and Storm -* Build ES indices by running `bin/ES_IndexInit.sh` +* Install OpenSearch 2.19.4 +* Install Apache Storm 2.8.4 +* Start OpenSearch and Storm +* Create the OpenSearch indices by running [bin/OS_IndexInit.sh](bin/OS_IndexInit.sh) and import the dashboards by running [bin/dashboards/OS_ImportDashboards.sh](bin/dashboards/OS_ImportDashboards.sh) -Crawler Seeds -------------- +Alternatively, use the Docker Compose setup, see below. -The crawler relies on [RSS](https://en.wikipedia.org/wiki/RSS)/[Atom](https://en.wikipedia.org/wiki/Atom_(Web_standard)) feeds and [news sitemaps](https://en.wikipedia.org/wiki/Sitemaps#Google_News_Sitemaps) to find links to news articles on news sites. A small collection of example seeds (feeds and sitemaps) is provided in [./seeds/](./seeds/). Adding support for news sites which do not provide a news feed or sitemap is an open issue, see [#41](//github.com/commoncrawl/news-crawl/issues/41). +## Crawler Seeds -Configuration -------------- +The crawler relies on [RSS](https://en.wikipedia.org/wiki/RSS)/[Atom](https://en.wikipedia.org/wiki/Atom_(Web_standard)) feeds and [news sitemaps](https://en.wikipedia.org/wiki/Sitemaps#Google_News_Sitemaps) to find links to news articles on news sites. A small collection of example seeds (feeds and sitemaps) is provided in [./seeds/](./seeds/). Adding support for news sites which do not provide a news feed or sitemap is an open issue, see [#41](https://github.com/commoncrawl/news-crawl/issues/41). + + +## Configuration The default configuration should work out-of-the-box. The only thing to do is to configure the user agent properties send in the HTTP request header. Open the file `conf/crawler-conf.yaml` in an editor and fill in the values for `http.agent.name` and all further properties starting with the `http.agent.` prefix. -Run the crawl -------------- +## Run the crawl Generate an uberjar: ``` sh @@ -33,23 +32,23 @@ mvn clean package And run ...
``` sh -storm local target/crawler-2.10.0.jar --local-ttl 60 -- org.commoncrawl.stormcrawler.news.CrawlTopology -conf $PWD/conf/es-conf.yaml -conf $PWD/conf/crawler-conf.yaml $PWD/seeds/ feeds.txt +storm local target/crawler-3.5.1.jar --local-ttl 60 -- org.commoncrawl.stormcrawler.news.CrawlTopology -conf $PWD/conf/opensearch-conf.yaml -conf $PWD/conf/crawler-conf.yaml $PWD/seeds/ feeds.txt ``` This will launch the crawl topology in local mode for 60 seconds. It will also "inject" all URLs found in the file `./seeds/feeds.txt` in the status index. The URLs point to news feeds and sitemaps from which links to news articles are extracted and fetched. The topology will create WARC files in the directory specified in the configuration under the key `warc.dir`. This directory must be created beforehand. Of course, it's also possible to add (or remove) the seeds (feeds and sitemaps) using the Elasticsearch API. In this case, the can topology can be run without the last two arguments. -Alternatively, the topology can be run from the [crawler.flux](./conf/crawler.flux), please see the [Storm Flux documentation](https://storm.apache.org/releases/2.5.0/flux.html). Make sure to adapt the Flux definition to your needs! +Alternatively, the topology can be run from the [crawler.flux](./conf/crawler.flux), please see the [Storm Flux documentation](https://storm.apache.org/releases/2.8.4/flux.html). Make sure to adapt the Flux definition to your needs! In production, you should use `storm jar ...` to run the topology in distributed mode and continuously (no time limit) including the Storm UI and logging. -Monitor the crawl ------------------ -When the topology is running you can check that URLs have been injected and news are getting fetched on [http://localhost:9200/status/_search?pretty]. Or use StormCrawler's Kibana dashboards to monitor the crawling process. 
Please follow the instructions to install the templates for Kibana provided as part of [StormCrawler's Elasticsearch module documentation](//github.com/DigitalPebble/storm-crawler/tree/master/external/elasticsearch). +## Monitor the crawl -There is also a shell script [bin/es_status](./bin/es_status) to get aggregated counts from the status index, and to add, delete or force a re-fetch of URLs. E.g., +When the topology is running you can check that URLs have been injected and news are getting fetched on <http://localhost:9200/status/_search?pretty>. Or use StormCrawler's OpenSearch dashboards to monitor the crawling process on <http://localhost:5601/>. + +There is also a shell script [bin/status](./bin/status) to get aggregated counts from the status index, and to add, delete or force a re-fetch of URLs. E.g., ``` $> bin/es_status aggregate_status @@ -59,15 +58,7 @@ $> bin/es_status aggregate_status ``` -Run Crawl from Docker Container -------------------------------- - -First, download Apache Storm 2.5.0. from the [download page](https://storm.apache.org/downloads.html) and place it in the directory `downloads`: -``` -STORM_VERSION=2.5.0 -mkdir downloads -wget -q -P downloads --timestamping https://downloads.apache.org/storm/apache-storm-$STORM_VERSION/apache-storm-$STORM_VERSION.tar.gz -``` +## Run Crawl with Docker Compose Do not forget to create the uberjar (see above) which is included in the Docker image. Simply run: @@ -75,31 +66,56 @@ Do not forget to create the uberjar (see above) which is included in the Docker mvn clean package ``` -Then build the Docker image from the [Dockerfile](./Dockerfile): +Verify the configuration in the file [docker-compose.yaml](docker-compose.yaml) and [conf/](conf/) is correct: +- Don't forget to adapt the paths to mounted volumes used to persist data (OpenSearch indexes and WARC files). +- Make sure to add the user agent configuration in conf/crawler-conf.yaml. -Note: the uberjar is included in the Docker image and needs to be built first (see above).
+Then download and build the Docker images: ``` -docker build -t newscrawler:2.10.0 . +docker compose -f docker-compose.yaml up --build --renew-anon-volumes --remove-orphans ``` -To launch an interactive container: +Wait until the containers are running, then initialize the OpenSearch index and the dashboards: ``` -docker run --net=host \ - -v $PWD/data/elasticsearch:/data/elasticsearch \ - -v $PWD/data/warc:/data/warc \ - --rm --name newscrawler -i -t newscrawler:2.10.0 /bin/bash +./bin/OS_IndexInit.sh +./bin/dashboards/OS_ImportDashboards.sh ``` -NOTE: don't forget to adapt the paths to mounted volumes used to persist data on the host. Make sure to add the user agent configuration in conf/crawler-conf.yaml. +NOTE: +- This will delete existing indexes! +- Make sure that the OpenSearch port 9200 is not already in use or mapped by a running OpenSearch instance. Otherwise OpenSearch commands may affect the running instance! + + +To launch the topology using [Storm Flux](https://storm.apache.org/releases/2.8.4/flux.html): +``` +docker compose run --rm news-crawler \ + storm jar lib/crawler.jar org.apache.storm.flux.Flux --remote /news-crawler/conf/crawler.flux +``` +Or using the Java topology: +``` +docker compose run --rm news-crawler \ + storm jar lib/crawler.jar -- org.commoncrawl.stormcrawler.news.CrawlTopology \ + /data/seeds '*' -conf conf/opensearch-conf.yaml -conf conf/crawler-conf.yaml +``` -CAVEAT: Make sure that the Elasticsearch port 9200 is not already in use or mapped by a running ES instance. Otherwise Elasticsearch commands may affect the running instance! +After 1-2 minutes if everything is up, connect to OpenSearch on port [9200](http://localhost:9200/) or the OpenSearch dashboards on port [5601](http://localhost:5601/). 
-Once you are logged onto the Docker container, start the services and crawl with +For inspecting the worker log files: +``` +docker exec storm-supervisor /bin/bash -c 'cat /logs/workers-artifacts/*/*/worker.log' +``` +To stop the topology: ``` -/home/ubuntu/news-crawler/bin/run-crawler.sh +docker compose run --rm -ti news-crawler /bin/bash + +$> storm list +Topology_name Status Num_tasks Num_workers Uptime_secs Topology_Id Owner +---------------------------------------------------------------------------------------- +NewsCrawl ACTIVE 48 1 146 NewsCrawl-1-1774977605 storm + +$> storm kill NewsCrawl ``` -After 1-2 minutes if everything is up, connect to Elasticsearch on port [9200](http://127.0.0.1:9200/) or Kibana on port [5601](http://127.0.0.1:5601/). diff --git a/bin/OS_IndexInit.sh b/bin/OS_IndexInit.sh index 60e72ec..81b4066 100755 --- a/bin/OS_IndexInit.sh +++ b/bin/OS_IndexInit.sh @@ -13,121 +13,30 @@ # See the License for the specific language governing permissions and # limitations under the License. -ESHOST="http://localhost:9200" -#ESCREDENTIALS="-u elastic:passwordhere" +#!/bin/bash -# deletes and recreates a status index with a bespoke schema +# set -e -curl $ESCREDENTIALS -s -XDELETE "$ESHOST/status/" > /dev/null +OSHOST=${1:-"http://localhost:9200"} +OSCREDENTIALS=${2:-"-u opensearch:passwordhere"} -echo "Deleted status index" +curl $OSCREDENTIALS -s -XDELETE "$OSHOST/status/" > /dev/null +echo "Deleted 'status' index, now recreating it..." +curl $OSCREDENTIALS -s -XPUT "$OSHOST/status" -H 'Content-Type: application/json' --upload-file src/main/resources/status.mapping -# http://localhost:9200/status/_mapping/status?pretty +echo -echo "Creating status index with mapping" +curl $OSCREDENTIALS -s -XDELETE "$OSHOST/content/" > /dev/null +echo "Deleted 'content' index, now recreating it..." 
+curl $OSCREDENTIALS -s -XPUT "$OSHOST/content" -H 'Content-Type: application/json' --upload-file src/main/resources/indexer.mapping -curl $ESCREDENTIALS -s -XPUT $ESHOST/status -H 'Content-Type: application/json' -d ' -{ - "settings": { - "index": { - "number_of_shards": 16, - "number_of_replicas": 1, - "refresh_interval": "5s" - } - }, - "mappings": { - "dynamic_templates": [{ - "metadata": { - "path_match": "metadata.*", - "match_mapping_type": "string", - "mapping": { - "type": "keyword" - } - } - }], - "_source": { - "enabled": true - }, - "properties": { - "nextFetchDate": { - "type": "date", - "format": "dateOptionalTime" - }, - "status": { - "type": "keyword" - }, - "url": { - "type": "keyword" - } - } - } -}' +echo -# deletes and recreates a status index with a bespoke schema +curl $OSCREDENTIALS -s -XDELETE "$OSHOST/metrics*/" > /dev/null -curl $ESCREDENTIALS -s -XDELETE "$ESHOST/metrics*/" > /dev/null - -echo "" -echo "Deleted metrics index" - -curl $ESCREDENTIALS -s -XPUT $ESHOST/_ilm/policy/14d-deletion_policy -H 'Content-Type:application/json' -d ' -{ - "policy": { - "phases": { - "delete": { - "min_age": "14d", - "actions": { - "delete": {} - } - } - } - } -} -' - -echo "Creating metrics index with mapping" +echo "Deleted 'metrics' index, now recreating it..." 
# http://localhost:9200/metrics/_mapping/status?pretty -curl $ESCREDENTIALS -s -XPOST $ESHOST/_template/storm-metrics-template -H 'Content-Type: application/json' -d ' -{ - "index_patterns": "metrics*", - "settings": { - "index": { - "number_of_shards": 1, - "refresh_interval": "30s" - }, - "number_of_replicas": 0, - "lifecycle.name": "14d-deletion_policy" - }, - "mappings": { - "_source": { "enabled": true }, - "properties": { - "name": { - "type": "keyword" - }, - "stormId": { - "type": "keyword" - }, - "srcComponentId": { - "type": "keyword" - }, - "srcTaskId": { - "type": "short" - }, - "srcWorkerHost": { - "type": "keyword" - }, - "srcWorkerPort": { - "type": "integer" - }, - "timestamp": { - "type": "date", - "format": "dateOptionalTime" - }, - "value": { - "type": "double" - } - } - } -}' +curl $OSCREDENTIALS -s -XPOST "$OSHOST/_template/metrics-template" -H 'Content-Type: application/json' --upload-file src/main/resources/metrics.mapping +echo diff --git a/bin/dashboards/OS_ImportDashboards.sh b/bin/dashboards/OS_ImportDashboards.sh new file mode 100755 index 0000000..561f739 --- /dev/null +++ b/bin/dashboards/OS_ImportDashboards.sh @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#!/bin/sh + +BIN=$(dirname $0) + +echo "Importing status dashboard into OpenSearch Dashboards" +curl -X POST "localhost:5601/api/saved_objects/_import" -H "osd-xsrf: true" --form file=@$BIN/status.ndjson +echo "" + +echo "Importing metrics dashboard into OpenSearch Dashboards" +curl -X POST "localhost:5601/api/saved_objects/_import" -H "osd-xsrf: true" --form file=@$BIN/metrics.ndjson +echo "" + +# Storm internal metrics +# curl -X POST "localhost:5601/api/saved_objects/_import" -H "kbn-xsrf: true" --form file=@$BIN/storm.ndjson diff --git a/bin/dashboards/metrics.ndjson b/bin/dashboards/metrics.ndjson new file mode 100644 index 0000000..20cbb2b --- /dev/null +++ b/bin/dashboards/metrics.ndjson @@ -0,0 +1,10 @@ +{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"name\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcComponentId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcTaskId\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerHost\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":tru
e,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerPort\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"stormId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"timestamp\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"value\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true}]","timeFieldName":"timestamp","title":"metrics"},"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","migrationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:06:58.178Z","version":"WzMsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:activethreads\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : # active threads","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"3\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"2\"}}],\"listeners\":{},\"title\":\"Fetcher : # active 
threads\"}"},"id":"Fetcher-:-#-active-threads","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.178Z","version":"WzksMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:num_queues\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : num queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{},\"spyPerPage\":10},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"3\"}}],\"listeners\":{},\"title\":\"Fetcher : num queues\"}"},"id":"Fetcher-:-num-queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.175Z","version":"WzgsMV0="} 
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_counter.fetched\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : pages fetched","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{},\"spyPerPage\":10},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"sum\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"3\"}}],\"listeners\":{},\"title\":\"Fetcher : pages fetched\"}"},"id":"Fetcher-:-pages-fetched","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.170Z","version":"WzcsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:in_queues\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : URLs waiting in 
queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"addLegend\":false,\"addTimeMarker\":false,\"addTooltip\":true,\"defaultYExtents\":false,\"mode\":\"grouped\",\"scale\":\"linear\",\"setYExtents\":false,\"shareYAxis\":true,\"spyPerPage\":10,\"times\":[],\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"4\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"5\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"4\"}}],\"listeners\":{},\"title\":\"Fetcher : URLs waiting in queues\"}"},"id":"Fetcher-:-URLs-waiting-in-queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.160Z","version":"WzUsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_average_persec.bytes_fetched_perSec\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : average bytes per 
second","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":false,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}}],\"listeners\":{},\"title\":\"Fetcher : average bytes per second\"}"},"id":"Fetcher-:-average-bytes-per-second","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.173Z","version":"WzYsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_average_persec.fetched_perSec\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : average pages per 
second","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":false,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"3\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{},\"title\":\"Fetcher : average pages per second\"}"},"id":"Fetcher-:-average-pages-per-second","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.820Z","version":"WzEwLDFd"} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_counter.bytes_fetched\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Total bytes 
fetched","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"sum\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"m\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{},\"title\":\"Total bytes fetched\"}"},"id":"Total-bytes-fetched","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.823Z","version":"WzExLDFd"} +{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":{\"query_string\":{\"analyze_wildcard\":true,\"query\":\"*\"}},\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelIndex\":\"1\",\"gridData\":{\"x\":24,\"y\":20,\"w\":12,\"h\":12,\"i\":\"1\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_0\"},{\"panelIndex\":\"2\",\"gridData\":{\"x\":12,\"y\":20,\"w\":12,\"h\":12,\"i\":\"2\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_1\"},{\"panelIndex\":\"3\",\"gridData\":{\"x\":0,\"y\":0,\"w\":36,\"h\":12,\"i\":\"3\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_2\"},{\"panelIndex\":\"4\",\"gridData\":{\"x\":0,\"y\":20,\"w\":12,\"h\":12,\"i\":\"4\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_3\"},{\"panelIndex\":\"5\",\"gridData\":{\"x\":0,\"y\":40,\"w\":36,\"h\":8,\"i\":\"5\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_4\"},{\"panelIndex\":\"6\",\"gridData\
":{\"x\":0,\"y\":32,\"w\":36,\"h\":8,\"i\":\"6\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_5\"},{\"panelIndex\":\"7\",\"gridData\":{\"x\":0,\"y\":12,\"w\":36,\"h\":8,\"i\":\"7\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_6\"}]","timeRestore":false,"title":"Crawl metrics","version":1},"id":"Crawl-metrics","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"Fetcher-:-#-active-threads","name":"panel_0","type":"visualization"},{"id":"Fetcher-:-num-queues","name":"panel_1","type":"visualization"},{"id":"Fetcher-:-pages-fetched","name":"panel_2","type":"visualization"},{"id":"Fetcher-:-URLs-waiting-in-queues","name":"panel_3","type":"visualization"},{"id":"Fetcher-:-average-bytes-per-second","name":"panel_4","type":"visualization"},{"id":"Fetcher-:-average-pages-per-second","name":"panel_5","type":"visualization"},{"id":"Total-bytes-fetched","name":"panel_6","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:06:58.830Z","version":"WzQsMV0="} +{"exportedCount":9,"missingRefCount":0,"missingReferences":[]} diff --git a/bin/dashboards/status.ndjson b/bin/dashboards/status.ndjson new file mode 100644 index 0000000..b3d0122 --- /dev/null +++ b/bin/dashboards/status.ndjson @@ -0,0 +1,5 @@ 
+{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"key\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"metadata._redirTo\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.depth\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.error%2Ecause\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.error%2Esource\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.fetch%2Eerror%2Ecount\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.isFeed\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.isSitemap\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"na
me\":\"metadata.url%2Epath\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"nextFetchDate\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"status\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"url\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":true,\"readFromDocValues\":true}]","title":"status"},"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","migrationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:07:47.130Z","version":"WzEzLDFd"} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"status count","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"status\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{},\"title\":\"status count\"}"},"id":"status-count","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:07:48.278Z","version":"WzE1LDFd"} 
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Top Hosts","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"key\",\"size\":50,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{},\"title\":\"Top Hosts\"}"},"id":"Top-Hosts","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:07:48.281Z","version":"WzE2LDFd"} +{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":\"*\",\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelRefName\":\"panel_0\",\"version\":\"7.3.0\",\"panelIndex\":\"37874bbf-6607-435a-a231-94d81e9193e7\",\"gridData\":{\"x\":0,\"y\":0,\"w\":16,\"h\":20,\"i\":\"37874bbf-6607-435a-a231-94d81e9193e7\"},\"embeddableConfig\":{}},{\"panelRefName\":\"panel_1\",\"version\":\"7.3.0\",\"panelIndex\":\"4faa5b74-1660-44f7-9227-89d900c8231e\",\"gridData\":{\"x\":16,\"y\":0,\"w\":16,\"h\":20,\"i\":\"4faa5b74-1660-44f7-9227-89d900c8231e\"},\"embeddableConfig\":{}}]","timeRestore":false,"title":"Crawl 
status","version":1},"id":"Crawl-status","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"status-count","name":"panel_0","type":"visualization"},{"id":"Top-Hosts","name":"panel_1","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:07:47.948Z","version":"WzE0LDFd"} +{"exportedCount":4,"missingRefCount":0,"missingReferences":[]} diff --git a/bin/dashboards/storm.ndjson b/bin/dashboards/storm.ndjson new file mode 100644 index 0000000..880c232 --- /dev/null +++ b/bin/dashboards/storm.ndjson @@ -0,0 +1,5 @@ +{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"name\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcComponentId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcTaskId\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerHost\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerPort\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFro
mDocValues\":true},{\"name\":\"stormId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"timestamp\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"value\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true}]","timeFieldName":"timestamp","title":"metrics"},"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","migrationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:06:58.178Z","version":"WzMsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name: \\\"__receive.population\\\"\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Storm Receive Queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"srcTaskId\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"srcComponentId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{}}"},"id":"Storm-Receive-Queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:09:12.875Z","version":"WzIwLDFd"} 
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"+srcComponentId: \\\"__system\\\" +name: memory\\\\/heap*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Memory Heap","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":true,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":true,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"name\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"split\",\"params\":{\"field\":\"srcWorkerHost\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\",\"row\":true}}],\"listeners\":{}}"},"id":"Memory-Heap","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:09:12.877Z","version":"WzIxLDFd"} 
+{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":\"*\",\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelRefName\":\"panel_0\",\"version\":\"7.3.0\",\"panelIndex\":\"19123ee9-8f49-4621-a4dc-716b5ff9fcaf\",\"gridData\":{\"x\":0,\"y\":0,\"w\":32,\"h\":8,\"i\":\"19123ee9-8f49-4621-a4dc-716b5ff9fcaf\"},\"embeddableConfig\":{}},{\"panelRefName\":\"panel_1\",\"version\":\"7.3.0\",\"panelIndex\":\"5fd83542-b7e6-48e0-8679-2ffcacf453a3\",\"gridData\":{\"x\":0,\"y\":8,\"w\":32,\"h\":16,\"i\":\"5fd83542-b7e6-48e0-8679-2ffcacf453a3\"},\"embeddableConfig\":{}}]","timeRestore":false,"title":"Storm metrics","version":1},"id":"Storm-metrics","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"Storm-Receive-Queues","name":"panel_0","type":"visualization"},{"id":"Memory-Heap","name":"panel_1","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:08:33.810Z","version":"WzE5LDFd"} +{"exportedCount":4,"missingRefCount":0,"missingReferences":[]} \ No newline at end of file diff --git a/bin/run-crawler.sh b/bin/run-crawler.sh deleted file mode 100755 index ca32e47..0000000 --- a/bin/run-crawler.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash - -# in case volumes are on the host need to adjust permissions -chown -R elasticsearch:elasticsearch /data/elasticsearch -chown -R storm:storm /data/warc - -# export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 - -# as root -/usr/bin/supervisord - -# wait until Storm and Elasticsearch are running -sleep 60 - -mkdir /tmp/seeds -cp -rf /home/ubuntu/news-crawler/seeds /tmp/ -chmod -R a+r /tmp/seeds - -# start the news crawler as user ubuntu -sudo -iu ubuntu /bin/bash <<"EOF" - -set -e - -cd $HOME/news-crawler/ - -# initialize Elasticsearch indices -# CAVEAT: this deletes existing indices! 
-bin/ES_IndexInit.sh -sleep 10 - -STORMCRAWLER="storm jar $PWD/lib/crawler.jar" - -# run the crawler -$STORMCRAWLER -- org.commoncrawl.stormcrawler.news.CrawlTopology \ - /tmp/seeds '*' -conf $PWD/conf/es-conf.yaml -conf $PWD/conf/crawler-conf.yaml -# alternatively running the flux -#$STORMCRAWLER org.apache.storm.flux.Flux --remote $PWD/conf/crawler.flux -# suppress warnings about malformed XML in sitemaps -storm set_log_level NewsCrawl \ - -l crawlercommons.sitemaps.SiteMapParser=ERROR - - -EOF diff --git a/bin/es_status b/bin/status similarity index 100% rename from bin/es_status rename to bin/status diff --git a/conf/crawler-conf.yaml b/conf/crawler-conf.yaml index ef0f478..eb2594b 100644 --- a/conf/crawler-conf.yaml +++ b/conf/crawler-conf.yaml @@ -31,7 +31,7 @@ config: topology.metrics.consumer.register: - class: "org.apache.storm.metric.LoggingMetricsConsumer" parallelism.hint: 1 - - class: "org.apache.stormcrawler.elasticsearch.metrics.MetricsConsumer" + - class: "org.apache.stormcrawler.opensearch.metrics.MetricsConsumer" parallelism.hint: 1 # status index and fetcher queues are partitioned by domain @@ -70,6 +70,9 @@ config: # increased network timeout (ms) for news sites from Asia and eastern Europe http.timeout: 30000 + # allowed URL protocols + protocols: "http,https" + # use okhttp http.protocol.implementation: org.apache.stormcrawler.protocol.okhttp.HttpProtocol https.protocol.implementation: org.apache.stormcrawler.protocol.okhttp.HttpProtocol @@ -82,10 +85,26 @@ config: # or transferred protocol metadata must also be prefixed. protocol.md.prefix: "protocol." 
+ # number of instances for each protocol implementation + protocol.instances.num: 8 + # connection pool configuration of OkHttp protocol + okhttp.protocol.connection.pool: + # maximum number of idle connections (in addition to active connections) + max.idle.connections: 256 + # maximum keep-alive time of the connections in seconds + connection.keep.alive: 300 + # See also + # https://square.github.io/okhttp/3.x/okhttp/okhttp3/ConnectionPool.html + # Note that OkHttp's connection pool (v4.9.1) is not optimized for fast + # look-up of connections, the pool size (idle and active connections) + # should not exceed 1000. To allow for efficient pooling in large and + # diverse crawls, it's recommended to increase also the number of protocol + # instances, see `protocol.instances.num`. + # delay between successive requests to the same host/domain # (be defensive, a delay of 5 sec. means about 1000 fetches per hour # which should be enough even for large news sites) - fetcher.server.delay: 6.0 + fetcher.server.delay: 9.0 # generous max.
crawl delay # (fetch content even if the robots.txt specifies a large host-specific crawl delay: diff --git a/conf/crawler.flux b/conf/crawler.flux index 73af7c7..9f48a01 100644 --- a/conf/crawler.flux +++ b/conf/crawler.flux @@ -6,11 +6,11 @@ includes: override: false - resource: false - file: "crawler-conf.yaml" + file: "conf/crawler-conf.yaml" override: true - resource: false - file: "es-conf.yaml" + file: "conf/opensearch-conf.yaml" override: true config: @@ -77,13 +77,13 @@ components: spouts: - id: "spout" - className: "org.apache.stormcrawler.elasticsearch.persistence.AggregationSpout" + className: "org.apache.stormcrawler.opensearch.persistence.AggregationSpout" parallelism: 16 - id: "filespout" className: "org.apache.stormcrawler.spout.FileSpout" parallelism: 1 constructorArgs: - - "/path/to/seeds/" + - "/data/seeds/" - "feeds.txt" - true @@ -129,7 +129,7 @@ bolts: args: - "warc" - id: "status" - className: "org.apache.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt" + className: "org.apache.stormcrawler.opensearch.persistence.StatusUpdaterBolt" parallelism: 1 streams: diff --git a/conf/es-conf.yaml b/conf/es-conf.yaml deleted file mode 100644 index 7ea8b8b..0000000 --- a/conf/es-conf.yaml +++ /dev/null @@ -1,83 +0,0 @@ -# configuration for Elasticsearch resources - -config: - # ES metricsConsumer - es.metrics.addresses: "http://localhost:9200" - es.metrics.index.name: "metrics" - - # ES spout and persistence bolt - es.status.addresses: "http://localhost:9200" - es.status.index.name: "status" - #es.status.user: "USERNAME" - #es.status.password: "PASSWORD" - # the routing is done on the value of 'partition.url.mode' - es.status.routing: true - # stores the value used for grouping the URLs as a separate field - # needed by the spout implementations - # also used for routing if the value above is set to true - es.status.routing.fieldname: "metadata.hostname" - es.status.bulkActions: 500 - es.status.flushInterval: "5s" - es.status.concurrentRequests: 
1 - - ################ - # spout config # - ################ - - # positive or negative filter parsable by the Lucene Query Parser - # es.status.filterQuery: - # - "-(metadata.hostname:stormcrawler.net)" - # - "-(key:digitalpebble.com)" - - # time in secs for which the URLs will be considered for fetching after a ack or fail - # need a high value to avoid duplicates by URLs added multiple times to the fetcher - # queues, should be close to - # fetcher.max.crawl.delay * fetcher.max.queue.size - spout.ttl.purgatory: 1200 - - # Min time (in msecs) to allow between 2 successive queries (per bucket) to ES - spout.min.delay.queries: 30000 - - # Delay since previous query date (in secs) after which the nextFetchDate value will be reset to the current time - # Setting this to -1 or a large value means that the ES will cache the results but also that less and less results - # might be returned. - # - should reset to avoid that a bucket with many URLs blocks incrementing the date to look - # for next fetches for a specific bucket. May happen if a news sitemap adds 1000s of URLs. - spout.reset.fetchdate.after: 240 - - es.status.max.buckets: 200 - # max. URLs per bucket (= domain name): 30 sec. / 5 sec. fetch delay = 6 - # but set to a lower number for domains with longer crawl-delay - # cf. 
also fetcher.max.queue.size and fetcher.max.urls.in.queues - es.status.max.urls.per.bucket: 5 - # field to group the URLs into buckets - es.status.bucket.field: "metadata.hostname" - # fields to sort the URLs within a bucket - es.status.bucket.sort.field: - - "nextFetchDate" - - "url" - # field to sort the buckets - es.status.global.sort.field: "nextFetchDate" - - # CollapsingSpout : limits the deep paging by resetting the start offset for the ES query - es.status.max.start.offset: 500 - - # AggregationSpout : sampling improves the performance on large crawls - es.status.sample: false - - # max allowed duration of a query in sec - es.status.query.timeout: -1 - - # AggregationSpout (expert): adds this value in mins to the latest date returned in the results and - # use it as nextFetchDate - es.status.recentDate.increase: -1 - es.status.recentDate.min.gap: -1 - - topology.metrics.consumer.register: - - class: "org.apache.stormcrawler.elasticsearch.metrics.MetricsConsumer" - parallelism.hint: 1 - #whitelist: - # - "fetcher_counter" - # - "fetcher_average.bytes_fetched" - #blacklist: - # - "__receive.*" diff --git a/conf/opensearch-conf.yaml b/conf/opensearch-conf.yaml new file mode 100644 index 0000000..e6d2025 --- /dev/null +++ b/conf/opensearch-conf.yaml @@ -0,0 +1,126 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# configuration for OpenSearch resources + +config: + + # address to use unless a more specific one has been + # defined for a component + # also accepts a list or multiple values in a single line + # separated by a semi-colon e.g. "opensearch1:9200; opensearch2:9200" + # Note: here the address from inside the docker-compose cluster is required + opensearch.addresses: "http://opensearch-news-crawl:9200" + #opensearch.user: "USERNAME" + #opensearch.password: "PASSWORD" + opensearch.concurrentRequests: 2 + + # Disable TLS validation for connection to OpenSearch + # opensearch.disable.tls.validation: false + + # Indexer bolt + # addresses can be specified as a full URL + # if not we assume that the protocol is http and the port 9200 + opensearch.indexer.addresses: "http://opensearch-news-crawl:9200" + opensearch.indexer.index.name: "content" + # opensearch.indexer.pipeline: "_PIPELINE_" + opensearch.indexer.create: false + opensearch.indexer.bulkActions: 100 + opensearch.indexer.flushInterval: "2s" + opensearch.indexer.concurrentRequests: 1 + opensearch.indexer.sniff: true + + # MetricsConsumer + opensearch.metrics.addresses: "http://opensearch-news-crawl:9200" + opensearch.metrics.index.name: "metrics" + opensearch.metrics.sniff: true + + # Spout and persistence bolt + opensearch.status.addresses: "http://opensearch-news-crawl:9200" + opensearch.status.index.name: "status" + #opensearch.status.user: "USERNAME" + #opensearch.status.password: "PASSWORD" + # the routing is done on the value of 'partition.url.mode' + opensearch.status.routing: true + # stores the value used for grouping the URLs as a separate field + # needed by the spout implementations + # also used for routing if the value above is set to true + opensearch.status.routing.fieldname: "key" + opensearch.status.bulkActions: 500 + opensearch.status.flushInterval: "5s" + 
+ opensearch.status.concurrentRequests: 1 + opensearch.status.sniff: true + + ################ + # spout config # + ################ + + # positive or negative filters parsable by the Lucene Query Parser + # opensearch.status.filterQuery: + # - "-(key:stormcrawler.net)" + # - "-(key:stormcrawler.apache.org)" + + # time in secs for which the URLs will be considered for fetching after an ack or fail + # need a high value to avoid duplicates by URLs added multiple times to the fetcher + # queues, should be close to + # fetcher.max.crawl.delay * fetcher.max.queue.size + spout.ttl.purgatory: 1200 + + # Min time (in msecs) to allow between 2 successive queries to OpenSearch + spout.min.delay.queries: 30000 + + # Max time (in msecs) to allow between 2 successive queries to OpenSearch + spout.max.delay.queries: 60000 + + # Delay since previous query date (in secs) after which the nextFetchDate value will be reset to the current time + # Setting this to -1 or a large value means that OpenSearch will cache the results but also that fewer and fewer results + # might be returned. + # - should reset to avoid that a bucket with many URLs blocks incrementing the date to look + # for next fetches for a specific bucket. May happen if a news sitemap adds 1000s of URLs. + spout.reset.fetchdate.after: 240 + + opensearch.status.max.buckets: 200 + # max. URLs per bucket (= domain name): 30 sec. / 5 sec. fetch delay = 6 + # but set to a lower number for domains with longer crawl-delay + # cf.
also fetcher.max.queue.size and fetcher.max.urls.in.queues + opensearch.status.max.urls.per.bucket: 5 + # field to group the URLs into buckets + opensearch.status.bucket.field: "key" + # fields to sort the URLs within a bucket + opensearch.status.bucket.sort.field: + - "nextFetchDate" + - "url" + # field to sort the buckets + opensearch.status.global.sort.field: "nextFetchDate" + + # AggregationSpout : sampling improves the performance on large crawls + opensearch.status.sample: false + + # max allowed duration of a query in sec + opensearch.status.query.timeout: -1 + + # AggregationSpout (expert): adds this value in mins to the latest date returned in the results and + # use it as nextFetchDate + opensearch.status.recentDate.increase: -1 + opensearch.status.recentDate.min.gap: -1 + + topology.metrics.consumer.register: + - class: "org.apache.stormcrawler.opensearch.metrics.MetricsConsumer" + parallelism.hint: 1 + #whitelist: + # - "fetcher_counter" + # - "fetcher_average.bytes_fetched" + #blacklist: + # - "__receive.*" diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..24d048c --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,122 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+services: + + # Apache Storm components + # - Zookeeper coordinates the communication between Nimbus and the Supervisors + zookeeper: + image: zookeeper:${ZOOKEEPER_VERSION:-3.9.3} + container_name: zookeeper + restart: always + + # - the daemon Nimbus runs on the master node + storm-nimbus: + image: storm:${STORM_VERSION:-2.8.4} + container_name: storm-nimbus + hostname: nimbus + command: storm nimbus + depends_on: + - zookeeper + links: + - zookeeper + ports: + - 6627:6627 + restart: always + + # - the Supervisors run on the worker nodes + storm-supervisor: + image: storm:${STORM_VERSION:-2.8.4} + container_name: storm-supervisor + command: storm supervisor -c worker.childopts=-Xmx%HEAP-MEM%m + depends_on: + - zookeeper + - storm-nimbus + links: + - zookeeper + - storm-nimbus:nimbus + # supervisor launches the worker processes + # which need to be able to access + # - (in case a indexing topology is run) the + # OpenSearch (http://opensearch:9200/) and + - opensearch-news-crawl + # - the WARC output folder + # - and the seed folder + volumes: + - ${WARCOUTPUT:-./warcdata}:/data/warc + - ${SEEDDIR:-./seeds}:/data/seeds + restart: always + + # - the Storm UI provides diagnostics about the Storm cluster + storm-ui: + image: storm:${STORM_VERSION:-2.8.4} + container_name: storm-ui + command: storm ui + depends_on: + - storm-nimbus + links: + - storm-nimbus:nimbus + ports: + - "127.0.0.1:8080:8080" + restart: always + + opensearch-news-crawl: + image: opensearchproject/opensearch:${OPENSEARCH_VERSION:-2.19.4} + container_name: opensearch-news-crawl + environment: + - cluster.name=opensearch-news-crawl-cluster + - node.name=opensearch-news-crawl + - discovery.type=single-node + - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping + - "OPENSEARCH_JAVA_OPTS=-Xms4G -Xmx4G" + - plugins.security.disabled=true + - "DISABLE_INSTALL_DEMO_CONFIG=true" + volumes: + - ${OPENSEARCHDATA:-./opensearchdata}:/usr/share/opensearch/data + ulimits: + 
memlock: + soft: -1 + hard: -1 + nofile: + soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems + hard: 65536 + ports: + - "127.0.0.1:9200:9200" # REST API + + opensearch-dashboard-news-crawl: + image: opensearchproject/opensearch-dashboards:${OPENSEARCH_VERSION:-2.19.4} + container_name: opensearch-dashboard-news-crawl + ports: + - "127.0.0.1:5601:5601" + expose: + - "5601" + environment: + - 'OPENSEARCH_HOSTS=["http://opensearch-news-crawl:9200"]' + - "DISABLE_SECURITY_DASHBOARDS_PLUGIN=true" # disables security dashboards plugin in OpenSearch Dashboards + + # - to launch a topology + # - will exit on startup + news-crawler: + build: . + container_name: news-crawler + command: /bin/bash + depends_on: + - storm-nimbus + links: + - storm-nimbus:nimbus + volumes: + - ${WARCOUTPUT:-./warcdata}:/data/warc + - ${SEEDDIR:-./seeds}:/data/seeds + restart: "no" + diff --git a/etc/supervisor/conf.d/elasticsearch.conf b/etc/supervisor/conf.d/elasticsearch.conf deleted file mode 100644 index a4f0020..0000000 --- a/etc/supervisor/conf.d/elasticsearch.conf +++ /dev/null @@ -1,8 +0,0 @@ -[program:elasticsearch] -command=/usr/share/elasticsearch/bin/elasticsearch -Enetwork.host=127.0.0.1 -Ehttp.port=9200 -Etransport.tcp.port=9300 -numprocs=1 -autostart=true -autorestart=true -user=elasticsearch -echo environment=ES_HEAP_SIZE="20g" -environment=ES_PATH_CONF=/etc/elasticsearch \ No newline at end of file diff --git a/etc/supervisor/conf.d/kibana.conf b/etc/supervisor/conf.d/kibana.conf deleted file mode 100644 index 45e1812..0000000 --- a/etc/supervisor/conf.d/kibana.conf +++ /dev/null @@ -1,7 +0,0 @@ -[program:kibana] -command=/usr/share/kibana/bin/kibana -c /etc/kibana/kibana.yml -numprocs=1 -autostart=true -autorestart=true -user=kibana -directory=/usr/share/kibana/ diff --git a/etc/sysctl.d/60-elasticsearch.conf b/etc/sysctl.d/60-elasticsearch.conf deleted file mode 100644 index ae43f01..0000000 --- 
a/etc/sysctl.d/60-elasticsearch.conf +++ /dev/null @@ -1,7 +0,0 @@ - -# Elasticsearch settings -# see -# https://www.elastic.co/guide/en/elasticsearch/reference/current/setup-configuration-memory.html#swappiness -# https://www.elastic.co/guide/en/elasticsearch/reference/current/vm-max-map-count.html -vm.swappiness=1 -vm.max_map_count=262144 diff --git a/seeds/feeds.txt b/seeds/feeds.txt index b74ad0e..88468a8 100644 --- a/seeds/feeds.txt +++ b/seeds/feeds.txt @@ -1,26 +1,3 @@ -https://www.usatoday.com/news-sitemap.xml isSitemapNews=true -https://www.theguardian.com/sitemaps/news.xml isSitemapNews=true -https://www.theguardian.com/international/rss isFeed=true -https://www.theguardian.com/world/rss isFeed=true -https://www.theguardian.com/uk/rss isFeed=true -https://www.theguardian.com/us/rss isFeed=true -https://www.theguardian.com/world/eu/rss isFeed=true -https://www.theguardian.com/politics/rss isFeed=true -https://www.theguardian.com/science/rss isFeed=true -https://www.theguardian.com/education/rss isFeed=true -https://www.theguardian.com/football/rss isFeed=true -https://www.elwatannews.com/home/rssfeeds isFeed=true -https://www.corriere.it/rss/sitemap_v2.xml isSitemapIndex=true -https://www.repubblica.it/rss/homepage/rss2.0.xml isFeed=true -https://www.repubblica.it/rss/economia/rss2.0.xml isFeed=true -https://www.repubblica.it/rss/politica/rss2.0.xml isFeed=true -https://www.lemonde.fr/sitemap_news.xml isSitemapNews=true -https://www.lemonde.fr/economie/rss_full.xml isFeed=true -https://www.lemonde.fr/rss/une.xml isFeed=true -https://www.lemonde.fr/international/rss_full.xml isFeed=true -https://www.lemonde.fr/politique/rss_full.xml isFeed=true -https://www.lemonde.fr/livres/rss_full.xml isFeed=true -https://www.lemonde.fr/afrique/rss_full.xml isFeed=true -https://www.lemonde.fr/ameriques/rss_full.xml isFeed=true -https://www.cnn.com/sitemaps/cnn/news.xml isSitemapNews=true -https://www.bbc.com/sitemaps/https-index-com-news.xml isSitemapNews=true 
+https://commoncrawl.org/blog/rss.xml + +# Please, add your news feeds and sitemaps below - one line, one URL. \ No newline at end of file diff --git a/src/main/resources/indexer.mapping b/src/main/resources/indexer.mapping new file mode 100644 index 0000000..b788e6b --- /dev/null +++ b/src/main/resources/indexer.mapping @@ -0,0 +1,40 @@ +{ + "settings": { + "index": { + "number_of_shards": 5, + "number_of_replicas": 1, + "refresh_interval": "60s" + } + }, + "mappings": { + "_source": { + "enabled": true + }, + "properties": { + "content": { + "type": "text" + }, + "description": { + "type": "text" + }, + "domain": { + "type": "keyword" + }, + "format": { + "type": "keyword" + }, + "keywords": { + "type": "keyword" + }, + "host": { + "type": "keyword" + }, + "title": { + "type": "text" + }, + "url": { + "type": "keyword" + } + } + } +} \ No newline at end of file diff --git a/src/main/resources/metrics.mapping b/src/main/resources/metrics.mapping new file mode 100644 index 0000000..5b2ac15 --- /dev/null +++ b/src/main/resources/metrics.mapping @@ -0,0 +1,40 @@ +{ + "index_patterns": "metrics*", + "settings": { + "index": { + "number_of_shards": 1, + "refresh_interval": "30s" + }, + "number_of_replicas": 0 + }, + "mappings": { + "_source": { "enabled": true }, + "properties": { + "name": { + "type": "keyword" + }, + "stormId": { + "type": "keyword" + }, + "srcComponentId": { + "type": "keyword" + }, + "srcTaskId": { + "type": "short" + }, + "srcWorkerHost": { + "type": "keyword" + }, + "srcWorkerPort": { + "type": "integer" + }, + "timestamp": { + "type": "date", + "format": "date_optional_time" + }, + "value": { + "type": "double" + } + } + } +} \ No newline at end of file diff --git a/src/main/resources/status.mapping b/src/main/resources/status.mapping new file mode 100644 index 0000000..e5b14fe --- /dev/null +++ b/src/main/resources/status.mapping @@ -0,0 +1,39 @@ +{ + "settings": { + "index": { + "number_of_shards": 10, + "number_of_replicas": 1, + 
"refresh_interval": "5s" + } + }, + "mappings": { + "dynamic_templates": [{ + "metadata": { + "path_match": "metadata.*", + "match_mapping_type": "string", + "mapping": { + "type": "keyword" + } + } + }], + "_source": { + "enabled": true + }, + "properties": { + "key": { + "type": "keyword", + "index": true + }, + "nextFetchDate": { + "type": "date", + "format": "date_optional_time" + }, + "status": { + "type": "keyword" + }, + "url": { + "type": "keyword" + } + } + } +} From 12a5b503b2ba35beb6180674cb1554f77c226c3c Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 31 Mar 2026 19:12:03 +0200 Subject: [PATCH 4/8] Pin version of Jackson: needs to be same as used by Storm --- pom.xml | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index c470e83..9deb934 100644 --- a/pom.xml +++ b/pom.xml @@ -42,7 +42,7 @@ under the License. 3.5.1 2.8.4 1.12.797 - 2.11.1 + 2.18.1 1.6 5.23.0 3.0.1 @@ -171,6 +171,23 @@ under the License. ${aws.version} + + + com.fasterxml.jackson.core + jackson-annotations + ${jackson.version} + + + com.fasterxml.jackson.core + jackson-core + ${jackson.version} + + + com.fasterxml.jackson.core + jackson-databind + ${jackson.version} + + org.apache.stormcrawler From 0d68ae50016c595104992fe85d5d931534ac084b Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 11 Jun 2026 19:24:11 +0200 Subject: [PATCH 5/8] Upgrade to StormCrawler 3.6.0, Storm 2.8.8 --- Dockerfile | 4 ++-- README.md | 12 ++++++------ bin/status | 8 ++++---- conf/crawler-conf.yaml | 7 +++++++ conf/crawler.flux | 2 +- docker-compose.yaml | 12 ++++++------ pom.xml | 19 ++++--------------- .../stormcrawler/news/CrawlTopology.java | 2 +- 8 files changed, 31 insertions(+), 35 deletions(-) diff --git a/Dockerfile b/Dockerfile index 683df8f..6b469b3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM storm:2.8.4 +FROM storm:2.8.8 RUN apt-get update -qq && \ apt-get install -yq --no-install-recommends \ @@ -10,7 +10,7 @@ 
RUN apt-get update -qq && \ # # news-crawler # -ENV CRAWLER_VERSION=3.5.1 +ENV CRAWLER_VERSION=3.6.0 RUN mkdir /news-crawler/ && \ mkdir /news-crawler/conf/ && \ diff --git a/README.md b/README.md index ad990bf..2d18792 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,8 @@ Crawler for news based on [StormCrawler](https://stormcrawler.apache.org/). Prod ## Prerequisites -* Install OpenSearch 2.19.4 -* Install Apache Storm 2.8.4 +* Install OpenSearch 2.19.5 +* Install Apache Storm 2.8.8 * Start OpenSearch and Storm * Create the OpenSearch indices by running [bin/OS_IndexInit.sh](bin/OS_IndexInit.sh) and the dashboards by [OS_ImportDashboards.sh](bin/OS_ImportDashboards.sh) @@ -32,14 +32,14 @@ mvn clean package And run ... ``` sh -storm local target/crawler-3.5.1.jar --local-ttl 60 -- org.commoncrawl.stormcrawler.news.CrawlTopology -conf $PWD/conf/opensearch-conf.yaml -conf $PWD/conf/crawler-conf.yaml $PWD/seeds/ feeds.txt +storm local target/crawler-3.6.0.jar --local-ttl 60 -- org.commoncrawl.stormcrawler.news.CrawlTopology -conf $PWD/conf/opensearch-conf.yaml -conf $PWD/conf/crawler-conf.yaml $PWD/seeds/ feeds.txt ``` This will launch the crawl topology in local mode for 60 seconds. It will also "inject" all URLs found in the file `./seeds/feeds.txt` in the status index. The URLs point to news feeds and sitemaps from which links to news articles are extracted and fetched. The topology will create WARC files in the directory specified in the configuration under the key `warc.dir`. This directory must be created beforehand. -Of course, it's also possible to add (or remove) the seeds (feeds and sitemaps) using the Elasticsearch API. In this case, the can topology can be run without the last two arguments. +Of course, it's also possible to add (or remove) the seeds (feeds and sitemaps) using the OpenSearch API. In this case, the can topology can be run without the last two arguments. 
-Alternatively, the topology can be run from the [crawler.flux](./conf/crawler.flux), please see the [Storm Flux documentation](https://storm.apache.org/releases/2.8.4/flux.html). Make sure to adapt the Flux definition to your needs! +Alternatively, the topology can be run from the [crawler.flux](./conf/crawler.flux), please see the [Storm Flux documentation](https://storm.apache.org/releases/2.8.8/flux.html). Make sure to adapt the Flux definition to your needs! In production, you should use `storm jar ...` to run the topology in distributed mode and continuously (no time limit) including the Storm UI and logging. @@ -88,7 +88,7 @@ NOTE: - Make sure that the OpenSearch port 9200 is not already in use or mapped by a running OpenSearch instance. Otherwise OpenSearch commands may affect the running instance! -To launch the topology using [Storm Flux](https://storm.apache.org/releases/2.8.4/flux.html): +To launch the topology using [Storm Flux](https://storm.apache.org/releases/2.8.8/flux.html): ``` docker compose run --rm news-crawler \ storm jar lib/crawler.jar org.apache.storm.flux.Flux --remote /news-crawler/conf/crawler.flux diff --git a/bin/status b/bin/status index 410c4d1..978fb5a 100755 --- a/bin/status +++ b/bin/status @@ -5,14 +5,14 @@ __ES_STATUS_URL_DEFAULT='http://localhost:9200/status' function ____show_help() { echo "$0 [-v|-V] [-C] []" echo - echo "Query StormCrawler's Elasticsearch status index" + echo "Query StormCrawler's Elasticsearch or OpenSearch status index" echo " with help of curl, jq and bash" echo echo "Global options" echo " -h show detailed help" echo " -v verbose, print commands before execution" echo " -V very verbose" - echo " -D dry run, do not execute request to ES (use in combination with -v)" + echo " -D dry run, do not execute request (use in combination with -v)" echo " -C colorize JSON output" echo echo "Commands" @@ -134,12 +134,12 @@ ES_STATUS_URL=${ES_STATUS_URL:-$__ES_STATUS_URL_DEFAULT} set -e -# current time in 
Elasticsearch date format +# current time in Elasticsearch/OpenSearch date format function ____now () { date -u '+%Y-%m-%dT%H:%M:%S.000Z' } -# given date in Elasticsearch date format +# given date in Elasticsearch/OpenSearch date format function ____date () { date -d"$1" -u '+%Y-%m-%dT%H:%M:%S.000Z' } diff --git a/conf/crawler-conf.yaml b/conf/crawler-conf.yaml index eb2594b..6bbd52c 100644 --- a/conf/crawler-conf.yaml +++ b/conf/crawler-conf.yaml @@ -77,6 +77,13 @@ config: http.protocol.implementation: org.apache.stormcrawler.protocol.okhttp.HttpProtocol https.protocol.implementation: org.apache.stormcrawler.protocol.okhttp.HttpProtocol + # the http/https protocol versions to use, in order of preference + # - the WARC writer handles HTTP/1.1 and HTTP/2 (cf. storm-crawler#1010) + # - okhttp does not support HTTP/1.0 requests (it supports responses however) + # http.protocol.versions: + # - "h2" + # - "http/1.1" + # do not fail on unknown SSL certificates http.trust.everything: true diff --git a/conf/crawler.flux b/conf/crawler.flux index 9f48a01..4d390a5 100644 --- a/conf/crawler.flux +++ b/conf/crawler.flux @@ -45,7 +45,7 @@ components: - name: "put" args: - "software" - - "StormCrawler 2.10 https://stormcrawler.net/" + - "StormCrawler 3.6.0 https://stormcrawler.apache.org/" - name: "put" args: - "description" diff --git a/docker-compose.yaml b/docker-compose.yaml index 24d048c..8d8d1fd 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -17,13 +17,13 @@ services: # Apache Storm components # - Zookeeper coordinates the communication between Nimbus and the Supervisors zookeeper: - image: zookeeper:${ZOOKEEPER_VERSION:-3.9.3} + image: zookeeper:${ZOOKEEPER_VERSION:-3.9.4} container_name: zookeeper restart: always # - the daemon Nimbus runs on the master node storm-nimbus: - image: storm:${STORM_VERSION:-2.8.4} + image: storm:${STORM_VERSION:-2.8.8} container_name: storm-nimbus hostname: nimbus command: storm nimbus @@ -37,7 +37,7 @@ services: # - the 
Supervisors run on the worker nodes storm-supervisor: - image: storm:${STORM_VERSION:-2.8.4} + image: storm:${STORM_VERSION:-2.8.8} container_name: storm-supervisor command: storm supervisor -c worker.childopts=-Xmx%HEAP-MEM%m depends_on: @@ -60,7 +60,7 @@ services: # - the Storm UI provides diagnostics about the Storm cluster storm-ui: - image: storm:${STORM_VERSION:-2.8.4} + image: storm:${STORM_VERSION:-2.8.8} container_name: storm-ui command: storm ui depends_on: @@ -72,7 +72,7 @@ services: restart: always opensearch-news-crawl: - image: opensearchproject/opensearch:${OPENSEARCH_VERSION:-2.19.4} + image: opensearchproject/opensearch:${OPENSEARCH_VERSION:-2.19.5} container_name: opensearch-news-crawl environment: - cluster.name=opensearch-news-crawl-cluster @@ -95,7 +95,7 @@ services: - "127.0.0.1:9200:9200" # REST API opensearch-dashboard-news-crawl: - image: opensearchproject/opensearch-dashboards:${OPENSEARCH_VERSION:-2.19.4} + image: opensearchproject/opensearch-dashboards:${OPENSEARCH_VERSION:-2.19.5} container_name: opensearch-dashboard-news-crawl ports: - "127.0.0.1:5601:5601" diff --git a/pom.xml b/pom.xml index 9deb934..0508210 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ under the License. 4.0.0 org.commoncrawl.stormcrawler.news crawler - 3.5.1 + 3.6.0 jar @@ -39,10 +39,10 @@ under the License. UTF-8 - 3.5.1 - 2.8.4 + 3.6.0 + 2.8.8 1.12.797 - 2.18.1 + 2.21.3 1.6 5.23.0 3.0.1 @@ -171,17 +171,6 @@ under the License. 
${aws.version} - - - com.fasterxml.jackson.core - jackson-annotations - ${jackson.version} - - - com.fasterxml.jackson.core - jackson-core - ${jackson.version} - com.fasterxml.jackson.core jackson-databind diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java index 54c69eb..3a3c018 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java @@ -44,7 +44,7 @@ import org.slf4j.LoggerFactory; /** - * Dummy topology to play with the spouts and bolts on ElasticSearch + * Dummy topology to play with the spouts and bolts on OpenSearch */ public class CrawlTopology extends ConfigurableTopology { From 98e531a7b1b64057dce9bbbd8ada5e86fc6be02d Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 12 Jun 2026 09:18:41 +0200 Subject: [PATCH 6/8] Consistently name components and containers in docker compose configuration Add suffix "-news-crawl" to all Storm / Zookeeper container names to avoid name collisions with other StormCrawler setups running on the same system. 
--- README.md | 2 +- docker-compose.yaml | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 2d18792..4efbf08 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ After 1-2 minutes if everything is up, connect to OpenSearch on port [9200](http For inspecting the worker log files: ``` -docker exec storm-supervisor /bin/bash -c 'cat /logs/workers-artifacts/*/*/worker.log' +docker exec storm-supervisor-news-crawl /bin/bash -c 'cat /logs/workers-artifacts/*/*/worker.log' ``` To stop the topology: diff --git a/docker-compose.yaml b/docker-compose.yaml index 8d8d1fd..31bbc05 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -18,13 +18,13 @@ services: # - Zookeeper coordinates the communication between Nimbus and the Supervisors zookeeper: image: zookeeper:${ZOOKEEPER_VERSION:-3.9.4} - container_name: zookeeper + container_name: zookeeper-news-crawl restart: always # - the daemon Nimbus runs on the master node storm-nimbus: image: storm:${STORM_VERSION:-2.8.8} - container_name: storm-nimbus + container_name: storm-nimbus-news-crawl hostname: nimbus command: storm nimbus depends_on: @@ -38,7 +38,7 @@ services: # - the Supervisors run on the worker nodes storm-supervisor: image: storm:${STORM_VERSION:-2.8.8} - container_name: storm-supervisor + container_name: storm-supervisor-news-crawl command: storm supervisor -c worker.childopts=-Xmx%HEAP-MEM%m depends_on: - zookeeper @@ -50,7 +50,7 @@ services: # which need to be able to access # - (in case a indexing topology is run) the # OpenSearch (http://opensearch:9200/) and - - opensearch-news-crawl + - opensearch # - the WARC output folder # - and the seed folder volumes: @@ -61,7 +61,7 @@ services: # - the Storm UI provides diagnostics about the Storm cluster storm-ui: image: storm:${STORM_VERSION:-2.8.8} - container_name: storm-ui + container_name: storm-ui-news-crawl command: storm ui depends_on: - storm-nimbus @@ -71,7 +71,7 @@ services: - 
"127.0.0.1:8080:8080" restart: always - opensearch-news-crawl: + opensearch: image: opensearchproject/opensearch:${OPENSEARCH_VERSION:-2.19.5} container_name: opensearch-news-crawl environment: @@ -94,7 +94,7 @@ services: ports: - "127.0.0.1:9200:9200" # REST API - opensearch-dashboard-news-crawl: + opensearch-dashboard: image: opensearchproject/opensearch-dashboards:${OPENSEARCH_VERSION:-2.19.5} container_name: opensearch-dashboard-news-crawl ports: From f57b83d39d417845e7ac73b5fe139db16d4a5c87 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 12 Jun 2026 12:18:14 +0200 Subject: [PATCH 7/8] Add GitHub Dependabot configuration file --- .github/dependabot.yml | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..4205e52 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,31 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +version: 2 +updates: + - package-ecosystem: maven + directory: "/" + schedule: + interval: weekly + open-pull-requests-limit: 5 + ignore: + # Jackson libs must be in sync with the version required by Storm + - dependency-name: "com.fasterxml.jackson*" + + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: weekly From e6e4d7466bbb1c75034569fb3822a86b55a4b6c7 Mon Sep 17 00:00:00 2001 From: Luca <15426+lfoppiano@users.noreply.github.com> Date: Tue, 16 Jun 2026 19:37:36 +0200 Subject: [PATCH 8/8] Add gitignore and automatic code formatting (#71) - Added Cosium formatter - Added .gitignore - Fixed Licence headers (start with /* instead of /**) --- .github/workflows/maven.yml | 2 + .gitignore | 5 + .mvn/jvm.config | 8 + README.md | 17 +- pom.xml | 217 +++--- .../stormcrawler/filter/FastURLFilter.java | 629 +++++++++--------- .../stormcrawler/news/ContentDetector.java | 31 +- .../stormcrawler/news/CrawlTopology.java | 182 ++--- .../stormcrawler/news/FeedDetectorBolt.java | 34 +- .../news/NewsSiteMapParserBolt.java | 235 ++++--- .../stormcrawler/news/PreFilterBolt.java | 136 ++-- .../news/PunycodeURLNormalizer.java | 14 +- .../news/bootstrap/BootstrapTopology.java | 43 +- .../news/bootstrap/FeedLinkParseFilter.java | 26 +- .../bootstrap/NewsSiteMapDetectorBolt.java | 43 +- .../stormcrawler/FastURLFilterTest.java | 84 ++- .../news/NewsSiteMapParserTest.java | 115 ++-- 17 files changed, 954 insertions(+), 867 deletions(-) create mode 100644 .gitignore create mode 100644 .mvn/jvm.config diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml index dc52ef3..5c79a4c 100644 --- a/.github/workflows/maven.yml +++ b/.github/workflows/maven.yml @@ -63,5 +63,7 @@ jobs: with: distribution: adopt java-version: ${{ matrix.java }} + - name: Check code formatting + run: mvn -B --no-transfer-progress com.cosium.code:git-code-format-maven-plugin:validate-code-format -Dskip.format.code=false - name: Build with Maven run: mvn -B 
--no-transfer-progress package --file pom.xml -DCI_ENV=true verify diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..37effdb --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.idea +target +opensearchdata +warcdata +.java-version \ No newline at end of file diff --git a/.mvn/jvm.config b/.mvn/jvm.config new file mode 100644 index 0000000..87ae20c --- /dev/null +++ b/.mvn/jvm.config @@ -0,0 +1,8 @@ +--add-exports jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED +--add-exports jdk.compiler/com.sun.tools.javac.file=ALL-UNNAMED +--add-exports jdk.compiler/com.sun.tools.javac.main=ALL-UNNAMED +--add-exports jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED +--add-exports jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED +--add-exports jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED +--add-opens jdk.compiler/com.sun.tools.javac.code=ALL-UNNAMED +--add-opens jdk.compiler/com.sun.tools.javac.comp=ALL-UNNAMED diff --git a/README.md b/README.md index 4efbf08..874da21 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Crawler for news based on [StormCrawler](https://stormcrawler.apache.org/). Prod ## Prerequisites - +* JVM 17 or higher * Install OpenSearch 2.19.5 * Install Apache Storm 2.8.8 * Start OpenSearch and Storm @@ -119,3 +119,18 @@ NewsCrawl ACTIVE 48 1 146 NewsCrawl-1 $> storm kill NewsCrawl ``` +## Note for developers + +Please format your code before submitting a PR with + +``` +mvn git-code-format:format-code -Dgcf.globPattern="**/*" -Dskip.format.code=false +``` + +You can enable pre-commit format hooks by running: + +``` +mvn clean install -Dskip.format.code=false +``` + + diff --git a/pom.xml b/pom.xml index 0508210..69b7487 100644 --- a/pom.xml +++ b/pom.xml @@ -1,5 +1,4 @@ - - - + 4.0.0 org.commoncrawl.stormcrawler.news crawler 3.6.0 jar + + https://github.com/commoncrawl/news-crawl Apache License, Version 2.0 @@ -35,8 +36,6 @@ under the License. 
- https://github.com/commoncrawl/news-crawl - UTF-8 3.6.0 @@ -47,89 +46,10 @@ under the License. 5.23.0 3.0.1 4.13.2 + 5.4 + true - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.15.0 - - 17 - 17 - - - - org.codehaus.mojo - exec-maven-plugin - 3.6.3 - - - - exec - - - - - java - true - false - compile - - - - org.apache.maven.plugins - maven-shade-plugin - 3.6.2 - - - package - - shade - - - false - - - - org.apache.storm.flux.Flux - - - - - - - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - org.apache.storm:flux-core - - org/apache/commons/** - org/apache/http/** - org/yaml/** - - - - - - - - - - org.apache.stormcrawler @@ -207,4 +127,125 @@ under the License. test + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.15.0 + + 17 + 17 + + + + org.codehaus.mojo + exec-maven-plugin + 3.6.3 + + java + true + false + compile + + + + + exec + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.6.2 + + + + shade + + package + + false + + + + org.apache.storm.flux.Flux + + + + + + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + org.apache.storm:flux-core + + org/apache/commons/** + org/apache/http/** + org/yaml/** + + + + + + + + + com.cosium.code + git-code-format-maven-plugin + ${git-code-format-maven-plugin.version} + + + + install-formatter-hook + + install-hooks + + + + + validate-code-format + + validate-code-format + + + + + + + com.cosium.code + google-java-format + ${git-code-format-maven-plugin.version} + + + + ${skip.format.code} + + true + false + false + false + + + + + diff --git a/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java b/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java index 7c52716..9b2ee32 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java +++ b/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under 
one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -16,6 +16,16 @@ */ package org.commoncrawl.stormcrawler.filter; +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.AmazonS3ClientBuilder; +import com.amazonaws.services.s3.model.GetObjectRequest; +import com.amazonaws.services.s3.model.ObjectMetadata; +import com.amazonaws.services.s3.model.S3Object; +import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.databind.JsonMappingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.collect.LinkedHashMultimap; +import com.google.common.collect.Multimap; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.IOException; @@ -29,7 +39,6 @@ import java.util.TimerTask; import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; - import org.apache.commons.lang3.StringUtils; import org.apache.stormcrawler.JSONResource; import org.apache.stormcrawler.Metadata; @@ -38,28 +47,15 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.AmazonS3ClientBuilder; -import com.amazonaws.services.s3.model.GetObjectRequest; -import com.amazonaws.services.s3.model.ObjectMetadata; -import com.amazonaws.services.s3.model.S3Object; -import com.fasterxml.jackson.core.JsonParseException; -import com.fasterxml.jackson.databind.JsonMappingException; -import com.fasterxml.jackson.databind.JsonNode; -import com.google.common.collect.LinkedHashMultimap; -import com.google.common.collect.Multimap; - /** - * Version of the FastURLFilter that can load from a text representation instead - * of the JSON that the SC version handles. Can also reload periodically and get - * its content from S3. - * - * Filters URLs based on a file of regular expressions using host/domains - * matching first. 
The default policy is to accept a URL if no matches are - * found. + * Version of the FastURLFilter that can load from a text representation instead of the JSON that + * the SC version handles. Can also reload periodically and get its content from S3. + * + *

Filters URLs based on a file of regular expressions using host/domains matching first. The + * default policy is to accept a URL if no matches are found. + * + *

Rule Format: * - * Rule Format: - * *

  * Host www.example.org
  *   DenyPath /path/to/be/excluded
@@ -72,48 +68,45 @@
  * Domain example.org
  *   DenyPathQuery /resource/.*?action=exclude
  * 
- * - * Host rules are evaluated before Domain rules. For - * Host rules the entire host name of a URL must match while the - * domain names in Domain rules are considered as matches if the - * domain is a suffix of the host name (consisting of complete host name parts). - * Shorter domain suffixes are checked first, a single dot - * "." as "domain name" can be used to specify - * global rules applied to every URL. - * - * E.g., for "www.example.com" the rules given above are looked up in the - * following order: + * + * Host rules are evaluated before Domain rules. For Host + * rules the entire host name of a URL must match while the domain names in Domain + * rules are considered as matches if the domain is a suffix of the host name (consisting of + * complete host name parts). Shorter domain suffixes are checked first, a single dot ". + * " as "domain name" can be used to specify global rules applied to every + * URL. + * + *

E.g., for "www.example.com" the rules given above are looked up in the following order: + * *

    - *
  1. check "www.example.com" whether host-based rules exist and whether one of - * them matches
  2. - *
  3. check "www.example.com" for domain-based rules
  4. - *
  5. check "example.com" for domain-based rules
  6. - *
  7. check "com" for domain-based rules
  8. - *
  9. check for global rules ("Domain .")
  10. + *
  11. check "www.example.com" whether host-based rules exist and whether one of them matches + *
  12. check "www.example.com" for domain-based rules + *
  13. check "example.com" for domain-based rules + *
  14. check "com" for domain-based rules + *
  15. check for global rules ("Domain .") *
- * The first matching rule will reject the URL and no further rules are checked. - * If no rule matches the URL is accepted. URLs without a host name (e.g., - * file:/path/file.txt are checked for global rules only. URLs - * which fail to be parsed as {@link java.net.URL} are always rejected. - * - * For rules either the URL path (DenyPath) or path and query - * (DenyPathQuery) are checked whether the given - * {@link java.util.regex Java Regular expression} is found (see - * {@link java.util.regex.Matcher#find()}) in the URL path (and query). - * - * Rules are applied in the order of their definition. For better performance, - * regular expressions which are simpler/faster or match more URLs should be - * defined earlier. - * - * Comments in the rule file start with the # character and reach - * until the end of the line. - * - * The rules file is defined via the property urlfilter.fast.file, - * the default name is fast-urlfilter.txt. + * + * The first matching rule will reject the URL and no further rules are checked. If no rule matches + * the URL is accepted. URLs without a host name (e.g., file:/path/file.txt are checked + * for global rules only. URLs which fail to be parsed as {@link java.net.URL} are always rejected. + * + *

For rules either the URL path (DenyPath) or path and query (DenyPathQuery + * ) are checked whether the given {@link java.util.regex Java Regular expression} is found + * (see {@link java.util.regex.Matcher#find()}) in the URL path (and query). + * + *

Rules are applied in the order of their definition. For better performance, regular + * expressions which are simpler/faster or match more URLs should be defined earlier. + * + *

Comments in the rule file start with the # character and reach until the end of + * the line. + * + *

The rules file is defined via the property urlfilter.fast.file, the default name + * is fast-urlfilter.txt. */ -public class FastURLFilter extends URLFilter implements JSONResource { +public class FastURLFilter extends URLFilter implements JSONResource { - protected static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + protected static final Logger LOG = + LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); public static final String URLFILTER_FAST_FILE = "urlfilter.fast.file"; private Multimap hostRules = LinkedHashMultimap.create(); @@ -121,302 +114,320 @@ public class FastURLFilter extends URLFilter implements JSONResource { private String resourceFile; - private static final Pattern CATCH_ALL_RULE = Pattern.compile("^\\s*DenyPath(?:Query)?\\s+\\.[*?]\\s*$"); + private static final Pattern CATCH_ALL_RULE = + Pattern.compile("^\\s*DenyPath(?:Query)?\\s+\\.[*?]\\s*$"); private String resourceETAG; public void configure(@SuppressWarnings("rawtypes") Map stormConf, JsonNode filterParams) { - // read from conf first - int refreshRate = ConfUtils.getInt(stormConf, "fast.urlfilter.refresh", -1); - this.resourceFile = ConfUtils.getString(stormConf, "fast.urlfilter.file", null); - - // then from the param file (which needs recompiling in case of change) - if (filterParams != null) { - JsonNode node = filterParams.get("file"); - if (node != null && node.isTextual() && this.resourceFile == null) { - this.resourceFile = node.asText(); - } - node = filterParams.get("refresh"); - if (node != null && node.isInt() && refreshRate == -1) { - refreshRate = node.asInt(); - } - } - - try { - loadJSONResources(); - } catch (Exception e) { - LOG.error("Exception while loading resources", e); - } - - if (refreshRate != -1) { - LOG.info("Filter set to reload from {} every {} sec", getResourceFile(), refreshRate); - new Timer().schedule(new TimerTask() { - public void run() { - LOG.info("Reloading resources"); - try { - loadJSONResources(); 
- } catch (Exception e) { - LOG.error("Can't load resources", e); - } - } - }, refreshRate * 1000, refreshRate * 1000); - } + // read from conf first + int refreshRate = ConfUtils.getInt(stormConf, "fast.urlfilter.refresh", -1); + this.resourceFile = ConfUtils.getString(stormConf, "fast.urlfilter.file", null); + + // then from the param file (which needs recompiling in case of change) + if (filterParams != null) { + JsonNode node = filterParams.get("file"); + if (node != null && node.isTextual() && this.resourceFile == null) { + this.resourceFile = node.asText(); + } + node = filterParams.get("refresh"); + if (node != null && node.isInt() && refreshRate == -1) { + refreshRate = node.asInt(); + } + } + + try { + loadJSONResources(); + } catch (Exception e) { + LOG.error("Exception while loading resources", e); + } + + if (refreshRate != -1) { + LOG.info("Filter set to reload from {} every {} sec", getResourceFile(), refreshRate); + new Timer() + .schedule( + new TimerTask() { + public void run() { + LOG.info("Reloading resources"); + try { + loadJSONResources(); + } catch (Exception e) { + LOG.error("Can't load resources", e); + } + } + }, + refreshRate * 1000, + refreshRate * 1000); + } } /** * Load the resources from the JSON file in the uber jar or from S3 - * + * * @throws Exception - **/ + */ @Override public void loadJSONResources() throws Exception { - InputStream inputStream = null; - AmazonS3 s3client = null; - try { - if (getResourceFile().startsWith("s3://")) { - // try loading from S3 - s3client = AmazonS3ClientBuilder.standard().build(); - java.net.URI uri = new java.net.URI(getResourceFile()); - - String bucketName = uri.getHost(); - // remove the first "/" - String path = uri.getPath().substring(1); - - // optimisation - avoid a full reload if the resource has not changed - ObjectMetadata metadata = s3client.getObjectMetadata(bucketName, path); - final String ETAG = metadata.getETag(); - if (ETAG != null && ETAG.equals(resourceETAG)) { - 
LOG.info("Unchanged ETAG for {} - skipping reload", getResourceFile()); - return; - } else { - resourceETAG = ETAG; - } - - final S3Object object = s3client.getObject(new GetObjectRequest(bucketName, path)); - inputStream = object.getObjectContent(); - } else { - inputStream = getClass().getClassLoader().getResourceAsStream(getResourceFile()); - if (inputStream == null) { - LOG.error("Can't load conf from {}", getResourceFile()); - return; - } - } - if (getResourceFile().endsWith(".gz")) { - inputStream = new GZIPInputStream(inputStream); - } - - loadJSONResources(new BufferedInputStream(inputStream)); - } finally { - if (inputStream != null) { - inputStream.close(); - } - if (s3client != null) { - s3client.shutdown(); - } - } + InputStream inputStream = null; + AmazonS3 s3client = null; + try { + if (getResourceFile().startsWith("s3://")) { + // try loading from S3 + s3client = AmazonS3ClientBuilder.standard().build(); + java.net.URI uri = new java.net.URI(getResourceFile()); + + String bucketName = uri.getHost(); + // remove the first "/" + String path = uri.getPath().substring(1); + + // optimisation - avoid a full reload if the resource has not changed + ObjectMetadata metadata = s3client.getObjectMetadata(bucketName, path); + final String ETAG = metadata.getETag(); + if (ETAG != null && ETAG.equals(resourceETAG)) { + LOG.info("Unchanged ETAG for {} - skipping reload", getResourceFile()); + return; + } else { + resourceETAG = ETAG; + } + + final S3Object object = s3client.getObject(new GetObjectRequest(bucketName, path)); + inputStream = object.getObjectContent(); + } else { + inputStream = getClass().getClassLoader().getResourceAsStream(getResourceFile()); + if (inputStream == null) { + LOG.error("Can't load conf from {}", getResourceFile()); + return; + } + } + if (getResourceFile().endsWith(".gz")) { + inputStream = new GZIPInputStream(inputStream); + } + + loadJSONResources(new BufferedInputStream(inputStream)); + } finally { + if (inputStream != null) { + 
inputStream.close(); + } + if (s3client != null) { + s3client.shutdown(); + } + } } @Override public void loadJSONResources(InputStream inputStream) - throws JsonParseException, JsonMappingException, IOException { - long start = System.currentTimeMillis(); - - try (Reader r = new InputStreamReader(inputStream)) { - reloadRules(r); - } - - long end = System.currentTimeMillis(); - LOG.info("Loaded {} hostrules and {} domain rules in {} msec from {}", hostRules.size(), domainRules.size(), - (end - start), resourceFile); + throws JsonParseException, JsonMappingException, IOException { + long start = System.currentTimeMillis(); + + try (Reader r = new InputStreamReader(inputStream)) { + reloadRules(r); + } + + long end = System.currentTimeMillis(); + LOG.info( + "Loaded {} hostrules and {} domain rules in {} msec from {}", + hostRules.size(), + domainRules.size(), + (end - start), + resourceFile); } @Override public String getResourceFile() { - return resourceFile; + return resourceFile; } @Override public String filter(URL sourceUrl, Metadata sourceMetadata, String urlToFilter) { - synchronized (this) { - URL u; - - try { - u = new URL(urlToFilter); - } catch (Exception e) { - LOG.debug("Rejected {} because failed to parse as URL: {}", urlToFilter, e.getMessage()); - return null; - } - - String hostname = u.getHost(); - - // first check for host-specific rules - for (Rule rule : hostRules.get(hostname)) { - if (rule.match(u)) { - return null; - } - } - - // also look up domain rules for host name - for (Rule rule : domainRules.get(hostname)) { - if (rule.match(u)) { - return null; - } - } - - // check suffixes of host name from longer to shorter: - // subdomains, domain, top-level domain - int start = 0; - int pos; - while ((pos = hostname.indexOf('.', start)) != -1) { - start = pos + 1; - String domain = hostname.substring(start); - for (Rule rule : domainRules.get(domain)) { - if (rule.match(u)) { - return null; - } - } - } - - // finally check "global" rules defined 
for `Domain .` - for (Rule rule : domainRules.get(".")) { - if (rule.match(u)) { - return null; - } - } - - // no reject rules found - return urlToFilter; - } + synchronized (this) { + URL u; + + try { + u = new URL(urlToFilter); + } catch (Exception e) { + LOG.debug( + "Rejected {} because failed to parse as URL: {}", + urlToFilter, + e.getMessage()); + return null; + } + + String hostname = u.getHost(); + + // first check for host-specific rules + for (Rule rule : hostRules.get(hostname)) { + if (rule.match(u)) { + return null; + } + } + + // also look up domain rules for host name + for (Rule rule : domainRules.get(hostname)) { + if (rule.match(u)) { + return null; + } + } + + // check suffixes of host name from longer to shorter: + // subdomains, domain, top-level domain + int start = 0; + int pos; + while ((pos = hostname.indexOf('.', start)) != -1) { + start = pos + 1; + String domain = hostname.substring(start); + for (Rule rule : domainRules.get(domain)) { + if (rule.match(u)) { + return null; + } + } + } + + // finally check "global" rules defined for `Domain .` + for (Rule rule : domainRules.get(".")) { + if (rule.match(u)) { + return null; + } + } + + // no reject rules found + return urlToFilter; + } } private void reloadRules(Reader rules) throws IOException { - synchronized (this) { - domainRules.clear(); - hostRules.clear(); - - BufferedReader reader = new BufferedReader(rules); - - String current = null; - boolean host = false; - int lineno = 0; - - String line; - try { - while ((line = reader.readLine()) != null) { - lineno++; - line = line.trim(); - - if (line.indexOf("#") != -1) { - // strip comments - line = line.substring(0, line.indexOf("#")).trim(); - } - - if (StringUtils.isBlank(line)) { - continue; - } - - if (line.startsWith("Host")) { - host = true; - current = line.split("\\s+")[1]; - } else if (line.startsWith("Domain")) { - host = false; - current = line.split("\\s+")[1]; - } else { - if (current == null) { - continue; - } - - Rule 
rule = null; - try { - if (CATCH_ALL_RULE.matcher(line).matches()) { - rule = DenyAllRule.getInstance(); - } else if (line.startsWith("DenyPathQuery")) { - rule = new DenyPathQueryRule(line.split("\\s+")[1]); - } else if (line.startsWith("DenyPath")) { - rule = new DenyPathRule(line.split("\\s+")[1]); - } else { - LOG.warn("Problem reading rule on line {}: {}", lineno, line); - continue; - } - } catch (Exception e) { - LOG.warn("Problem reading rule on line {}: {} - {}", lineno, line, e.getMessage()); - continue; - } - - if (host) { - LOG.trace("Adding host rule [{}] [{}]", current, rule); - hostRules.put(current, rule); - } else { - LOG.trace("Adding domain rule [{}] [{}]", current, rule); - domainRules.put(current, rule); - } - } - } - - } catch (IOException e) { - LOG.warn("Caught exception while reading rules file at line {}: {}", lineno, e.getMessage()); - throw e; - } - } + synchronized (this) { + domainRules.clear(); + hostRules.clear(); + + BufferedReader reader = new BufferedReader(rules); + + String current = null; + boolean host = false; + int lineno = 0; + + String line; + try { + while ((line = reader.readLine()) != null) { + lineno++; + line = line.trim(); + + if (line.indexOf("#") != -1) { + // strip comments + line = line.substring(0, line.indexOf("#")).trim(); + } + + if (StringUtils.isBlank(line)) { + continue; + } + + if (line.startsWith("Host")) { + host = true; + current = line.split("\\s+")[1]; + } else if (line.startsWith("Domain")) { + host = false; + current = line.split("\\s+")[1]; + } else { + if (current == null) { + continue; + } + + Rule rule = null; + try { + if (CATCH_ALL_RULE.matcher(line).matches()) { + rule = DenyAllRule.getInstance(); + } else if (line.startsWith("DenyPathQuery")) { + rule = new DenyPathQueryRule(line.split("\\s+")[1]); + } else if (line.startsWith("DenyPath")) { + rule = new DenyPathRule(line.split("\\s+")[1]); + } else { + LOG.warn("Problem reading rule on line {}: {}", lineno, line); + continue; + } + } catch 
(Exception e) { + LOG.warn( + "Problem reading rule on line {}: {} - {}", + lineno, + line, + e.getMessage()); + continue; + } + + if (host) { + LOG.trace("Adding host rule [{}] [{}]", current, rule); + hostRules.put(current, rule); + } else { + LOG.trace("Adding domain rule [{}] [{}]", current, rule); + domainRules.put(current, rule); + } + } + } + + } catch (IOException e) { + LOG.warn( + "Caught exception while reading rules file at line {}: {}", + lineno, + e.getMessage()); + throw e; + } + } } public static class Rule { - protected Pattern pattern; + protected Pattern pattern; - Rule() { - } + Rule() {} - public Rule(String regex) { - pattern = Pattern.compile(regex); - } + public Rule(String regex) { + pattern = Pattern.compile(regex); + } - public boolean match(URL url) { - return pattern.matcher(url.toString()).find(); - } + public boolean match(URL url) { + return pattern.matcher(url.toString()).find(); + } - public String toString() { - return pattern.toString(); - } + public String toString() { + return pattern.toString(); + } } public static class DenyPathRule extends Rule { - public DenyPathRule(String regex) { - super(regex); - } - - public boolean match(URL url) { - String haystack = url.getPath(); - return pattern.matcher(haystack).find(); - } + public DenyPathRule(String regex) { + super(regex); + } + + public boolean match(URL url) { + String haystack = url.getPath(); + return pattern.matcher(haystack).find(); + } } /** Rule for DenyPath .* or DenyPath .? 
*/ public static class DenyAllRule extends Rule { - private static Rule instance = new DenyAllRule("."); + private static Rule instance = new DenyAllRule("."); - private DenyAllRule(String regex) { - super(regex); - } + private DenyAllRule(String regex) { + super(regex); + } - public static Rule getInstance() { - return instance; - } + public static Rule getInstance() { + return instance; + } - public boolean match(URL url) { - return true; - } + public boolean match(URL url) { + return true; + } } public static class DenyPathQueryRule extends Rule { - public DenyPathQueryRule(String regex) { - super(regex); - } - - public boolean match(URL url) { - String haystack = url.getFile(); - return pattern.matcher(haystack).find(); - } + public DenyPathQueryRule(String regex) { + super(regex); + } + + public boolean match(URL url) { + String haystack = url.getFile(); + return pattern.matcher(haystack).find(); + } } } diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/ContentDetector.java b/src/main/java/org/commoncrawl/stormcrawler/news/ContentDetector.java index 1c3a4a5..9979781 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/ContentDetector.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/ContentDetector.java @@ -13,33 +13,28 @@ */ package org.commoncrawl.stormcrawler.news; +import com.google.common.primitives.Bytes; import java.nio.charset.StandardCharsets; import java.util.Arrays; -import com.google.common.primitives.Bytes; - public class ContentDetector { protected byte[][][] clues; protected int maxOffset; /** - * Set up detector to detect content sniffing for a set of clue strings in a - * prefix of the binary content. + * Set up detector to detect content sniffing for a set of clue strings in a prefix of the + * binary content. * - * @param clues - * nested list of literal clues. Outer list defines an OR-group, - * inner list contained ANDed clues required to match all, e.g. 
- * the following definition would match if either - * "clue1" and "and_clue2" are matched, or - * alternatively "or_clue3" is found + * @param clues nested list of literal clues. Outer list defines an OR-group, inner list + * contained ANDed clues required to match all, e.g. the following definition would match if + * either "clue1" and "and_clue2" are matched, or alternatively + * "or_clue3" is found + *

+     *                  { { clue1, and_clue2 }, { or_clue3 } }
+     *                  
* - *
-     *            { { clue1, and_clue2 }, { or_clue3 } }
-     *            
- * - * @param maxOffset - * max. offset of content prefix checked for clues + * @param maxOffset max. offset of content prefix checked for clues */ public ContentDetector(String[][] clues, int maxOffset) { this.maxOffset = maxOffset; @@ -60,8 +55,7 @@ public int getFirstMatch(byte[] content) { for (int i = 0; i < clues.length; i++) { byte[][] group = clues[i]; for (byte[] clue : group) { - if (Bytes.indexOf(beginning, clue) == -1) - continue OR; + if (Bytes.indexOf(beginning, clue) == -1) continue OR; } // success, all members of one group matched return i; @@ -72,5 +66,4 @@ public int getFirstMatch(byte[] content) { public boolean matches(byte[] content) { return (getFirstMatch(content) >= 0); } - } diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java index 3a3c018..cc2d85a 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -14,13 +14,11 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.commoncrawl.stormcrawler.news; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; - import org.apache.storm.topology.BoltDeclarer; import org.apache.storm.topology.TopologyBuilder; import org.apache.storm.tuple.Fields; @@ -43,116 +41,126 @@ import org.apache.stormcrawler.warc.WARCHdfsBolt; import org.slf4j.LoggerFactory; -/** - * Dummy topology to play with the spouts and bolts on OpenSearch - */ +/** Dummy topology to play with the spouts and bolts on OpenSearch */ public class CrawlTopology extends ConfigurableTopology { private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(CrawlTopology.class); public static void main(String[] args) throws Exception { - ConfigurableTopology.start(new CrawlTopology(), args); + ConfigurableTopology.start(new CrawlTopology(), args); } @Override protected int run(String[] args) { - TopologyBuilder builder = new TopologyBuilder(); + TopologyBuilder builder = new TopologyBuilder(); - int numWorkers = ConfUtils.getInt(getConf(), "topology.workers", 1); + int numWorkers = ConfUtils.getInt(getConf(), "topology.workers", 1); - // set to the real number of shards ONLY if es.status.routing is set to - // true in the configuration - int numShards = 16; + // set to the real number of shards ONLY if es.status.routing is set to + // true in the configuration + int numShards = 16; - if (args.length >= 2) { - // arguments include seed directory and file pattern - LOG.info("Injecting seeds from {} by pattern {}", args[0], args[1]); - builder.setSpout("filespout", new FileSpout(args[0], args[1], true)); - Fields key = new Fields("url"); + if (args.length >= 2) { + // arguments include seed directory and file pattern + LOG.info("Injecting seeds from {} by pattern {}", args[0], args[1]); + builder.setSpout("filespout", new FileSpout(args[0], args[1], true)); + Fields key = new Fields("url"); - builder.setBolt("filter", new URLFilterBolt()).fieldsGrouping("filespout", 
Constants.StatusStreamName, key); - } + builder.setBolt("filter", new URLFilterBolt()) + .fieldsGrouping("filespout", Constants.StatusStreamName, key); + } - builder.setSpout("spout", new AggregationSpout(), numShards); + builder.setSpout("spout", new AggregationSpout(), numShards); - builder.setBolt("prefilter", new PreFilterBolt("pre-urlfilters.json"), numWorkers).shuffleGrouping("spout"); + builder.setBolt("prefilter", new PreFilterBolt("pre-urlfilters.json"), numWorkers) + .shuffleGrouping("spout"); - builder.setBolt("partitioner", new URLPartitionerBolt(), numWorkers).shuffleGrouping("prefilter"); + builder.setBolt("partitioner", new URLPartitionerBolt(), numWorkers) + .shuffleGrouping("prefilter"); - builder.setBolt("fetch", new FetcherBolt(), numWorkers).fieldsGrouping("partitioner", new Fields("key")); + builder.setBolt("fetch", new FetcherBolt(), numWorkers) + .fieldsGrouping("partitioner", new Fields("key")); - builder.setBolt("sitemap", new NewsSiteMapParserBolt(), numWorkers).setNumTasks(2) - .localOrShuffleGrouping("fetch"); + builder.setBolt("sitemap", new NewsSiteMapParserBolt(), numWorkers) + .setNumTasks(2) + .localOrShuffleGrouping("fetch"); - builder.setBolt("feed", new FeedParserBolt(), numWorkers).setNumTasks(4).localOrShuffleGrouping("sitemap"); + builder.setBolt("feed", new FeedParserBolt(), numWorkers) + .setNumTasks(4) + .localOrShuffleGrouping("sitemap"); - // don't need to parse the pages but need to update their status - builder.setBolt("ssb", new DummyIndexer(), numWorkers).localOrShuffleGrouping("feed"); + // don't need to parse the pages but need to update their status + builder.setBolt("ssb", new DummyIndexer(), numWorkers).localOrShuffleGrouping("feed"); - WARCHdfsBolt warcbolt = getWarcBolt("CC-NEWS"); + WARCHdfsBolt warcbolt = getWarcBolt("CC-NEWS"); - // take it from feed default output so that the feed files themselves - // don't get included - unless we want them too of course! 
- builder.setBolt("warc", warcbolt, numWorkers).localOrShuffleGrouping("feed"); - - final Fields furl = new Fields("url"); + // take it from feed default output so that the feed files themselves + // don't get included - unless we want them too of course! + builder.setBolt("warc", warcbolt, numWorkers).localOrShuffleGrouping("feed"); - BoltDeclarer statusBolt = builder.setBolt("status", new StatusUpdaterBolt(), numWorkers) - .fieldsGrouping("fetch", Constants.StatusStreamName, furl) - .fieldsGrouping("sitemap", Constants.StatusStreamName, furl) - .fieldsGrouping("feed", Constants.StatusStreamName, furl) - .fieldsGrouping("ssb", Constants.StatusStreamName, furl) - .fieldsGrouping("prefilter", Constants.StatusStreamName, furl); - - if (args.length >= 2) { - statusBolt.customGrouping("filter", Constants.StatusStreamName, new URLStreamGrouping()); - } - statusBolt.setNumTasks(numShards); + final Fields furl = new Fields("url"); - return submit(conf, builder); + BoltDeclarer statusBolt = + builder.setBolt("status", new StatusUpdaterBolt(), numWorkers) + .fieldsGrouping("fetch", Constants.StatusStreamName, furl) + .fieldsGrouping("sitemap", Constants.StatusStreamName, furl) + .fieldsGrouping("feed", Constants.StatusStreamName, furl) + .fieldsGrouping("ssb", Constants.StatusStreamName, furl) + .fieldsGrouping("prefilter", Constants.StatusStreamName, furl); + + if (args.length >= 2) { + statusBolt.customGrouping( + "filter", Constants.StatusStreamName, new URLStreamGrouping()); + } + statusBolt.setNumTasks(numShards); + + return submit(conf, builder); } protected WARCHdfsBolt getWarcBolt(String filePrefix) { - // path is absolute - String warcFilePath = ConfUtils.getString(getConf(), "warc.dir", "/data/warc"); - - WARCFileNameFormat fileNameFormat = new WARCFileNameFormat(); - fileNameFormat.withPath(warcFilePath); - fileNameFormat.withPrefix(filePrefix); - - Map fields = new LinkedHashMap<>(); - fields.put("software", "StormCrawler 2.10 https://stormcrawler.net/"); - 
fields.put("description", "News crawl for Common Crawl"); - String userAgent = AbstractHttpProtocol.getAgentString(getConf()); - fields.put("http-header-user-agent", userAgent); - fields.put("http-header-from", ConfUtils.getString(getConf(), "http.agent.email")); - String robotsTxtParser = "checked by crawler-commons " + crawlercommons.CrawlerCommons.getVersion() - + " (https://github.com/crawler-commons/crawler-commons)"; - fields.put("robots", robotsTxtParser); - fields.put("format", "WARC File Format 1.1"); - fields.put("conformsTo", "https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/"); - - WARCHdfsBolt warcbolt = (WARCHdfsBolt) new WARCHdfsBolt(); - warcbolt.withConfigKey("warc"); - warcbolt.withFileNameFormat(fileNameFormat); - warcbolt.withHeader(fields); - warcbolt.withRequestRecords(); - - // use RawLocalFileSystem (instead of ChecksumFileSystem) to avoid that - // WARC files are truncated if the topology is stopped because of a - // delayed sync of the default ChecksumFileSystem - Map hdfsConf = new HashMap<>(); - hdfsConf.put("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem"); - getConf().put("warc", hdfsConf); - - // will rotate if reaches size or time limit - int maxMB = ConfUtils.getInt(getConf(), "warc.rotation.policy.max-mb", 1024); - int maxMinutes = ConfUtils.getInt(getConf(), "warc.rotation.policy.max-minutes", 1440); - FileTimeSizeRotationPolicy rotpol = new FileTimeSizeRotationPolicy(maxMB, Units.MB); - rotpol.setTimeRotationInterval(maxMinutes, FileTimeSizeRotationPolicy.TimeUnit.MINUTES); - warcbolt.withRotationPolicy(rotpol); - - return warcbolt; + // path is absolute + String warcFilePath = ConfUtils.getString(getConf(), "warc.dir", "/data/warc"); + + WARCFileNameFormat fileNameFormat = new WARCFileNameFormat(); + fileNameFormat.withPath(warcFilePath); + fileNameFormat.withPrefix(filePrefix); + + Map fields = new LinkedHashMap<>(); + fields.put("software", "StormCrawler 2.10 https://stormcrawler.net/"); 
+ fields.put("description", "News crawl for Common Crawl"); + String userAgent = AbstractHttpProtocol.getAgentString(getConf()); + fields.put("http-header-user-agent", userAgent); + fields.put("http-header-from", ConfUtils.getString(getConf(), "http.agent.email")); + String robotsTxtParser = + "checked by crawler-commons " + + crawlercommons.CrawlerCommons.getVersion() + + " (https://github.com/crawler-commons/crawler-commons)"; + fields.put("robots", robotsTxtParser); + fields.put("format", "WARC File Format 1.1"); + fields.put( + "conformsTo", + "https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/"); + + WARCHdfsBolt warcbolt = (WARCHdfsBolt) new WARCHdfsBolt(); + warcbolt.withConfigKey("warc"); + warcbolt.withFileNameFormat(fileNameFormat); + warcbolt.withHeader(fields); + warcbolt.withRequestRecords(); + + // use RawLocalFileSystem (instead of ChecksumFileSystem) to avoid that + // WARC files are truncated if the topology is stopped because of a + // delayed sync of the default ChecksumFileSystem + Map hdfsConf = new HashMap<>(); + hdfsConf.put("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem"); + getConf().put("warc", hdfsConf); + + // will rotate if reaches size or time limit + int maxMB = ConfUtils.getInt(getConf(), "warc.rotation.policy.max-mb", 1024); + int maxMinutes = ConfUtils.getInt(getConf(), "warc.rotation.policy.max-minutes", 1440); + FileTimeSizeRotationPolicy rotpol = new FileTimeSizeRotationPolicy(maxMB, Units.MB); + rotpol.setTimeRotationInterval(maxMinutes, FileTimeSizeRotationPolicy.TimeUnit.MINUTES); + warcbolt.withRotationPolicy(rotpol); + + return warcbolt; } - } diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java index c365525..e607dc2 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java @@ -14,13 
+14,11 @@ package org.commoncrawl.stormcrawler.news; import java.util.Map; - +import org.apache.http.HttpHeaders; import org.apache.storm.task.OutputCollector; import org.apache.storm.task.TopologyContext; import org.apache.storm.tuple.Tuple; import org.apache.storm.tuple.Values; -import org.slf4j.LoggerFactory; - import org.apache.stormcrawler.Constants; import org.apache.stormcrawler.Metadata; import org.apache.stormcrawler.bolt.FeedParserBolt; @@ -29,28 +27,23 @@ import org.apache.stormcrawler.parse.ParseFilters; import org.apache.stormcrawler.parse.ParseResult; import org.apache.stormcrawler.persistence.Status; -import org.apache.http.HttpHeaders; +import org.slf4j.LoggerFactory; /** Detect RSS and Atom feeds, but do not parse and extract links */ @SuppressWarnings("serial") public class FeedDetectorBolt extends FeedParserBolt { - private static final org.slf4j.Logger LOG = LoggerFactory - .getLogger(FeedDetectorBolt.class); + private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(FeedDetectorBolt.class); - public static final String[] mimeTypeClues = { - "rss+xml", "atom+xml", "text/rss" - }; + public static final String[] mimeTypeClues = {"rss+xml", "atom+xml", "text/rss"}; - public static String[][] contentClues = { { "<{}> for {}", - ct, url); + LOG.info("Feed detected from content type <{}> for {}", ct, url); break; } } @@ -90,8 +82,8 @@ public void execute(Tuple tuple) { parseData.setMetadata(metadata); parseFilters.filter(url, content, null, parse); // emit status - collector.emit(Constants.StatusStreamName, tuple, - new Values(url, metadata, Status.FETCHED)); + collector.emit( + Constants.StatusStreamName, tuple, new Values(url, metadata, Status.FETCHED)); } else { // pass on collector.emit(tuple, tuple.getValues()); @@ -100,11 +92,9 @@ public void execute(Tuple tuple) { } @Override - @SuppressWarnings({ "rawtypes" }) - public void prepare(Map stormConf, TopologyContext context, - OutputCollector collect) { + @SuppressWarnings({"rawtypes"}) 
+ public void prepare(Map stormConf, TopologyContext context, OutputCollector collect) { super.prepare(stormConf, context, collect); parseFilters = ParseFilters.fromConf(stormConf); } - } diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java index 3c4cf55..187d086 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java @@ -15,6 +15,18 @@ import static org.apache.stormcrawler.Constants.StatusStreamName; +import crawlercommons.sitemaps.AbstractSiteMap; +import crawlercommons.sitemaps.Namespace; +import crawlercommons.sitemaps.SiteMap; +import crawlercommons.sitemaps.SiteMapIndex; +import crawlercommons.sitemaps.SiteMapParser; +import crawlercommons.sitemaps.SiteMapURL; +import crawlercommons.sitemaps.SiteMapURL.ChangeFrequency; +import crawlercommons.sitemaps.UnknownFormatException; +import crawlercommons.sitemaps.extension.Extension; +import crawlercommons.sitemaps.extension.ExtensionMetadata; +import crawlercommons.sitemaps.extension.LinkAttributes; +import crawlercommons.sitemaps.extension.NewsAttributes; import java.io.IOException; import java.net.URL; import java.util.ArrayList; @@ -24,7 +36,6 @@ import java.util.Iterator; import java.util.List; import java.util.Map; - import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpHeaders; import org.apache.storm.metric.api.MeanReducer; @@ -46,56 +57,45 @@ import org.apache.stormcrawler.util.ConfUtils; import org.slf4j.LoggerFactory; -import crawlercommons.sitemaps.AbstractSiteMap; -import crawlercommons.sitemaps.Namespace; -import crawlercommons.sitemaps.SiteMap; -import crawlercommons.sitemaps.SiteMapIndex; -import crawlercommons.sitemaps.SiteMapParser; -import crawlercommons.sitemaps.SiteMapURL; -import crawlercommons.sitemaps.SiteMapURL.ChangeFrequency; -import 
crawlercommons.sitemaps.UnknownFormatException; -import crawlercommons.sitemaps.extension.Extension; -import crawlercommons.sitemaps.extension.ExtensionMetadata; -import crawlercommons.sitemaps.extension.LinkAttributes; -import crawlercommons.sitemaps.extension.NewsAttributes; - - /** - * ParserBolt for news + * ParserBolt for news * sitemaps. */ @SuppressWarnings("serial") public class NewsSiteMapParserBolt extends SiteMapParserBolt { // TODO: - // this is a modified copy of c.d.s.bolt.SiteMapParserBolt - // - make parent class extensible and overridable - // modifications: - // - detect and process only Google news sitemaps - // - or a sitemapindex because some subsitemaps may - // be news sitemaps - // - pass "isSitemapNews" to status metadata + // this is a modified copy of c.d.s.bolt.SiteMapParserBolt + // - make parent class extensible and overridable + // modifications: + // - detect and process only Google news sitemaps + // - or a sitemapindex because some subsitemaps may + // be news sitemaps + // - pass "isSitemapNews" to status metadata public static enum SitemapType { - NEWS, INDEX, SITEMAP + NEWS, + INDEX, + SITEMAP } public static final String isSitemapNewsKey = "isSitemapNews"; public static final String isSitemapIndexKey = "isSitemapIndex"; + /** - * A sitemap (not necessarily a news sitemap) which is verified to contain - * links to news articles. Necessary to crawl news sites which provide a - * sitemap but neither a news feed or sitemap. + * A sitemap (not necessarily a news sitemap) which is verified to contain links to news + * articles. Necessary to crawl news sites which provide a sitemap but neither a news feed or + * sitemap. 
*/ public static final String isSitemapVerifiedKey = "isSitemapVerified"; - private static final org.slf4j.Logger LOG = LoggerFactory - .getLogger(NewsSiteMapParserBolt.class); + private static final org.slf4j.Logger LOG = + LoggerFactory.getLogger(NewsSiteMapParserBolt.class); /* content clues for news sitemaps, sitemap indexes or any sitemaps */ public static String[][] contentClues; public static int contentCluesSitemapNewsMatchUpTo = -1; public static int contentCluesSitemapIndexMatchUpTo = -1; + static { int cluesSize = Namespace.NEWS.length + 1 + 1 + Namespace.SITEMAP_LEGACY.length; contentClues = new String[cluesSize][1]; @@ -129,7 +129,7 @@ public static enum SitemapType { private ReducedMetric averagedMetrics; - /** Delay in minutes used for scheduling sub-sitemaps **/ + /** Delay in minutes used for scheduling sub-sitemaps * */ private int scheduleSitemapsWithDelay = -1; @Override @@ -140,14 +140,10 @@ public void execute(Tuple tuple) { byte[] content = tuple.getBinaryByField("content"); String url = tuple.getStringByField("url"); - boolean isSitemap = Boolean.valueOf( - metadata.getFirstValue(SiteMapParserBolt.isSitemapKey)); - boolean isNewsSitemap = Boolean - .valueOf(metadata.getFirstValue(isSitemapNewsKey)); - boolean isSitemapIndex = Boolean - .valueOf(metadata.getFirstValue(isSitemapIndexKey)); - boolean isSitemapVerified = Boolean - .valueOf(metadata.getFirstValue(isSitemapVerifiedKey)); + boolean isSitemap = Boolean.valueOf(metadata.getFirstValue(SiteMapParserBolt.isSitemapKey)); + boolean isNewsSitemap = Boolean.valueOf(metadata.getFirstValue(isSitemapNewsKey)); + boolean isSitemapIndex = Boolean.valueOf(metadata.getFirstValue(isSitemapIndexKey)); + boolean isSitemapVerified = Boolean.valueOf(metadata.getFirstValue(isSitemapVerifiedKey)); if (sniffContent) { SitemapType type = detectContent(url, content); @@ -183,14 +179,16 @@ public void execute(Tuple tuple) { if (isNewsSitemap || isSitemapIndex || isSitemapVerified) { /* - * remove the 
isSitemap key from metadata to avoid that the default - * sitemap fetch interval is applied to news sitemaps, sitemap - * indexes and verified sitemaps + * remove the isSitemap key from metadata to avoid that the default sitemap + * fetch interval is applied to news sitemaps, sitemap indexes and verified + * sitemaps */ metadata.remove(isSitemapKey); } else { if (isSitemap) { - collector.emit(Constants.StatusStreamName, tuple, + collector.emit( + Constants.StatusStreamName, + tuple, new Values(url, metadata, Status.FETCHED)); } else { // not a sitemap, just pass it on @@ -217,8 +215,8 @@ public void execute(Tuple tuple) { metadata.setValue(Constants.STATUS_ERROR_SOURCE, "sitemap parsing"); metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage); metadata.remove("numLinks"); - collector.emit(Constants.StatusStreamName, tuple, new Values(url, - metadata, Status.ERROR)); + collector.emit( + Constants.StatusStreamName, tuple, new Values(url, metadata, Status.ERROR)); collector.ack(tuple); return; } @@ -232,15 +230,12 @@ public void execute(Tuple tuple) { parseFilters.filter(url, content, null, parse); } catch (RuntimeException e) { - String errorMessage = "Exception while running parse filters on " - + url + ": " + e; + String errorMessage = "Exception while running parse filters on " + url + ": " + e; LOG.error(errorMessage); - metadata.setValue(Constants.STATUS_ERROR_SOURCE, - "content filtering"); + metadata.setValue(Constants.STATUS_ERROR_SOURCE, "content filtering"); metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage); metadata.remove("numLinks"); - collector.emit(StatusStreamName, tuple, new Values(url, metadata, - Status.ERROR)); + collector.emit(StatusStreamName, tuple, new Values(url, metadata, Status.ERROR)); collector.ack(tuple); return; } @@ -263,8 +258,7 @@ public void execute(Tuple tuple) { ol.getMetadata().setValue(isSitemapVerifiedKey, "true"); } } - Values v = new Values(ol.getTargetURL(), ol.getMetadata(), - Status.DISCOVERED); + 
Values v = new Values(ol.getTargetURL(), ol.getMetadata(), Status.DISCOVERED); collector.emit(Constants.StatusStreamName, tuple, v); } @@ -272,8 +266,8 @@ public void execute(Tuple tuple) { metadata.setValue("numLinks", String.valueOf(outlinks.size())); // marking the main URL as successfully fetched - collector.emit(Constants.StatusStreamName, tuple, new Values(url, - metadata, Status.FETCHED)); + collector.emit( + Constants.StatusStreamName, tuple, new Values(url, metadata, Status.FETCHED)); collector.ack(tuple); } @@ -291,12 +285,10 @@ public SitemapType detectContent(String url, byte[] content) { if (match >= 0) { // a sitemap, need to detect type of sitemap if (match <= contentCluesSitemapNewsMatchUpTo) { - LOG.info("{} detected as news sitemap based on content", - url); + LOG.info("{} detected as news sitemap based on content", url); return SitemapType.NEWS; } else if (match <= contentCluesSitemapIndexMatchUpTo) { - LOG.info("{} detected as sitemap index based on content", - url); + LOG.info("{} detected as sitemap index based on content", url); return SitemapType.INDEX; } else { return SitemapType.SITEMAP; @@ -317,12 +309,15 @@ private boolean recentlyModified(Date lastModified) { return true; } - protected AbstractSiteMap parseSiteMap(String url, byte[] content, - String contentType, Metadata parentMetadata, List links) + protected AbstractSiteMap parseSiteMap( + String url, + byte[] content, + String contentType, + Metadata parentMetadata, + List links) throws UnknownFormatException, IOException { - SiteMapParser parser = new SiteMapParser(strictModeSitemaps, - allowPartialSitemaps); + SiteMapParser parser = new SiteMapParser(strictModeSitemaps, allowPartialSitemaps); parser.setStrictNamespace(true); parser.addAcceptedNamespace(Namespace.SITEMAP_LEGACY); parser.addAcceptedNamespace(Namespace.EMPTY); @@ -334,8 +329,7 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content, long start = System.currentTimeMillis(); AbstractSiteMap siteMap; // let 
the parser guess what the mimetype is - if (StringUtils.isBlank(contentType) - || contentType.contains("octet-stream")) { + if (StringUtils.isBlank(contentType) || contentType.contains("octet-stream")) { siteMap = parser.parseSiteMap(content, sURL); } else { siteMap = parser.parseSiteMap(contentType, content, sURL); @@ -351,8 +345,8 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content, Collection subsitemaps = smi.getSitemaps(); int delay = 0; /* - * keep the subsitemaps as outlinks they will be fetched and parsed - * in the following steps + * keep the subsitemaps as outlinks they will be fetched and parsed in the + * following steps */ Iterator iter = subsitemaps.iterator(); while (iter.hasNext()) { @@ -365,13 +359,21 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content, linksSkippedNotRecentlyModified++; LOG.debug( "{} has a modified date {} which is more than {} hours old", - target, lastModified.toString(), + target, + lastModified.toString(), filterHoursSinceModified); continue; } - Outlink ol = filterOutlink(sURL, target, parentMetadata, - isSitemapKey, "true", isSitemapNewsKey, "false"); + Outlink ol = + filterOutlink( + sURL, + target, + parentMetadata, + isSitemapKey, + "true", + isSitemapNewsKey, + "false"); if (ol == null) { continue; } @@ -379,9 +381,8 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content, // add a delay if (this.scheduleSitemapsWithDelay > 0) { if (delay > 0) { - ol.getMetadata().setValue( - DefaultScheduler.DELAY_METADATA, - Integer.toString(delay)); + ol.getMetadata() + .setValue(DefaultScheduler.DELAY_METADATA, Integer.toString(delay)); } delay += this.scheduleSitemapsWithDelay; } @@ -389,15 +390,19 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content, links.add(ol); LOG.debug("{} : [sitemap] {}", url, target); } - LOG.info("Sitemap index (found {} sitemaps, {} skipped): {}", - linksFound, linksSkippedNotRecentlyModified, url); + LOG.info( + "Sitemap index (found {} 
sitemaps, {} skipped): {}", + linksFound, + linksSkippedNotRecentlyModified, + url); } // sitemap files else { SiteMap sm = (SiteMap) siteMap; Collection sitemapURLs = sm.getSiteMapUrls(); Iterator iter = sitemapURLs.iterator(); - sitemap_urls: while (iter.hasNext()) { + sitemap_urls: + while (iter.hasNext()) { linksFound++; SiteMapURL smurl = iter.next(); // TODO handle priority in metadata @@ -414,11 +419,12 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content, linksSkippedNotRecentlyModified++; LOG.debug( "{} has a modified date {} which is more than {} hours old", - target, lastModified, filterHoursSinceModified); + target, + lastModified, + filterHoursSinceModified); continue; } - ExtensionMetadata[] newsAttrs = smurl - .getAttributesForExtension(Extension.NEWS); + ExtensionMetadata[] newsAttrs = smurl.getAttributesForExtension(Extension.NEWS); if (newsAttrs != null) { // filter based on news publication date // 2008-12-23 @@ -429,7 +435,9 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content, linksSkippedNotRecentlyModified++; LOG.debug( "{} has a news publication date {} which is more than {} hours old", - target, pubDate, filterHoursSinceModified); + target, + pubDate, + filterHoursSinceModified); continue sitemap_urls; } } @@ -437,8 +445,7 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content, } // add alternative language links - ExtensionMetadata[] linkAttrs = smurl - .getAttributesForExtension(Extension.LINKS); + ExtensionMetadata[] linkAttrs = smurl.getAttributesForExtension(Extension.LINKS); if (linkAttrs != null) { for (ExtensionMetadata attr : linkAttrs) { LinkAttributes linkAttr = (LinkAttributes) attr; @@ -451,17 +458,30 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content, // skip href links duplicating sitemap URL continue; } - Outlink ol = filterOutlink(sURL, hrefStr, - parentMetadata, isSitemapKey, "false", - isSitemapNewsKey, "false"); + Outlink ol = + filterOutlink( + sURL, + 
hrefStr, + parentMetadata, + isSitemapKey, + "false", + isSitemapNewsKey, + "false"); if (ol != null) { links.add(ol); } } } - Outlink ol = filterOutlink(sURL, target, parentMetadata, - isSitemapKey, "false", isSitemapNewsKey, "false"); + Outlink ol = + filterOutlink( + sURL, + target, + parentMetadata, + isSitemapKey, + "false", + isSitemapNewsKey, + "false"); if (ol == null) { continue; } @@ -469,34 +489,33 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content, links.add(ol); LOG.debug("{} : [sitemap] {}", url, target); } - LOG.info("Sitemap (found {} links, {} skipped): {}", linksFound, - linksSkippedNotRecentlyModified, url); + LOG.info( + "Sitemap (found {} links, {} skipped): {}", + linksFound, + linksSkippedNotRecentlyModified, + url); } return siteMap; } @Override - @SuppressWarnings({ "rawtypes", "unchecked" }) - public void prepare(Map stormConf, TopologyContext context, - OutputCollector collector) { + @SuppressWarnings({"rawtypes", "unchecked"}) + public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { super.prepare(stormConf, context, collector); - sniffContent = ConfUtils.getBoolean(stormConf, - "sitemap.sniffContent", false); - filterHoursSinceModified = ConfUtils.getInt(stormConf, - "sitemap.filter.hours.since.modified", -1); + sniffContent = ConfUtils.getBoolean(stormConf, "sitemap.sniffContent", false); + filterHoursSinceModified = + ConfUtils.getInt(stormConf, "sitemap.filter.hours.since.modified", -1); parseFilters = ParseFilters.fromConf(stormConf); - int maxOffsetGuess = ConfUtils.getInt(stormConf, "sitemap.offset.guess", - 1024); - contentDetector = new ContentDetector( - NewsSiteMapParserBolt.contentClues, maxOffsetGuess); - rssContentDetector = new ContentDetector( - FeedDetectorBolt.contentClues, maxOffsetGuess); - averagedMetrics = context.registerMetric( - "news_sitemap_average_processing_time", - new ReducedMetric(new MeanReducer()), 30); - scheduleSitemapsWithDelay = 
ConfUtils.getInt(stormConf, - "sitemap.schedule.delay", scheduleSitemapsWithDelay); + int maxOffsetGuess = ConfUtils.getInt(stormConf, "sitemap.offset.guess", 1024); + contentDetector = new ContentDetector(NewsSiteMapParserBolt.contentClues, maxOffsetGuess); + rssContentDetector = new ContentDetector(FeedDetectorBolt.contentClues, maxOffsetGuess); + averagedMetrics = + context.registerMetric( + "news_sitemap_average_processing_time", + new ReducedMetric(new MeanReducer()), + 30); + scheduleSitemapsWithDelay = + ConfUtils.getInt(stormConf, "sitemap.schedule.delay", scheduleSitemapsWithDelay); } - } diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java index b986506..18106c3 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java @@ -1,9 +1,22 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.commoncrawl.stormcrawler.news; import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.Map; - import org.apache.commons.lang3.StringUtils; import org.apache.storm.task.OutputCollector; import org.apache.storm.task.TopologyContext; @@ -19,68 +32,67 @@ import org.slf4j.LoggerFactory; /** - * Variant of the URLFilterBolt to go upstream of the fetching to catch anything - * before it goes further into the topology. 
If filtered, a URL gets an ERROR - * status. + * Variant of the URLFilterBolt to go upstream of the fetching to catch anything before it goes + * further into the topology. If filtered, a URL gets an ERROR status. */ public class PreFilterBolt extends BaseRichBolt { - protected static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - private URLFilters urlFilters; - - protected OutputCollector collector; - - private final String filterConfigFile; - - private static final String _s = org.apache.stormcrawler.Constants.StatusStreamName; - - public PreFilterBolt(String filterConfigFile) { - this.filterConfigFile = filterConfigFile; - } - - @Override - public void execute(Tuple input) { - - // must have at least a URL and metadata - String urlString = input.getStringByField("url"); - Metadata metadata = (Metadata) input.getValueByField("metadata"); - - String filtered = urlFilters.filter(null, null, urlString); - if (StringUtils.isBlank(filtered)) { - LOG.debug("URL rejected: {}", urlString); - // emit with an error to the status stream - metadata.addValue("error.cause", "Filtered"); - Values v = new Values(urlString, metadata, Status.ERROR); - collector.emit(_s, input, v); - collector.ack(input); - return; - } - - // pass to std out - Values v = new Values(urlString, metadata); - collector.emit(input, v); - collector.ack(input); - } - - @Override - public void declareOutputFields(OutputFieldsDeclarer declarer) { - declarer.declareStream(_s, new Fields("url", "metadata", "status")); - declarer.declare(new Fields("url", "metadata")); - } - - @Override - public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { - this.collector = collector; - if (filterConfigFile != null) { - try { - urlFilters = new URLFilters(stormConf, filterConfigFile); - } catch (IOException e) { - throw new RuntimeException("Can't load filters from " + filterConfigFile); - } - } else { - urlFilters = URLFilters.fromConf(stormConf); - 
} - } - + protected static final Logger LOG = + LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private URLFilters urlFilters; + + protected OutputCollector collector; + + private final String filterConfigFile; + + private static final String _s = org.apache.stormcrawler.Constants.StatusStreamName; + + public PreFilterBolt(String filterConfigFile) { + this.filterConfigFile = filterConfigFile; + } + + @Override + public void execute(Tuple input) { + + // must have at least a URL and metadata + String urlString = input.getStringByField("url"); + Metadata metadata = (Metadata) input.getValueByField("metadata"); + + String filtered = urlFilters.filter(null, null, urlString); + if (StringUtils.isBlank(filtered)) { + LOG.debug("URL rejected: {}", urlString); + // emit with an error to the status stream + metadata.addValue("error.cause", "Filtered"); + Values v = new Values(urlString, metadata, Status.ERROR); + collector.emit(_s, input, v); + collector.ack(input); + return; + } + + // pass to std out + Values v = new Values(urlString, metadata); + collector.emit(input, v); + collector.ack(input); + } + + @Override + public void declareOutputFields(OutputFieldsDeclarer declarer) { + declarer.declareStream(_s, new Fields("url", "metadata", "status")); + declarer.declare(new Fields("url", "metadata")); + } + + @Override + public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { + this.collector = collector; + if (filterConfigFile != null) { + try { + urlFilters = new URLFilters(stormConf, filterConfigFile); + } catch (IOException e) { + throw new RuntimeException("Can't load filters from " + filterConfigFile); + } + } else { + urlFilters = URLFilters.fromConf(stormConf); + } + } } diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java b/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java index 4adf03d..114477c 100644 --- 
a/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java @@ -13,20 +13,18 @@ */ package org.commoncrawl.stormcrawler.news; +import com.fasterxml.jackson.databind.JsonNode; import java.net.IDN; import java.net.MalformedURLException; import java.net.URL; import java.util.Map; - import org.apache.stormcrawler.Metadata; import org.apache.stormcrawler.filtering.URLFilter; -import com.fasterxml.jackson.databind.JsonNode; public class PunycodeURLNormalizer extends URLFilter { @Override - public void configure(Map stormConf, JsonNode filterParams) { - } + public void configure(Map stormConf, JsonNode filterParams) {} private boolean isAscii(String str) { char[] chars = str.toCharArray(); @@ -39,8 +37,7 @@ private boolean isAscii(String str) { } @Override - public String filter(URL sourceUrl, Metadata sourceMetadata, - String urlToFilter) { + public String filter(URL sourceUrl, Metadata sourceMetadata, String urlToFilter) { try { URL url = new URL(urlToFilter); String hostName = url.getHost(); @@ -51,12 +48,11 @@ public String filter(URL sourceUrl, Metadata sourceMetadata, if (hostName.equals(url.getHost())) { return urlToFilter; } - urlToFilter = new URL(url.getProtocol(), hostName, url.getPort(), - url.getFile()).toString(); + urlToFilter = + new URL(url.getProtocol(), hostName, url.getPort(), url.getFile()).toString(); } catch (MalformedURLException e) { return null; } return urlToFilter; } - } diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java index 821551e..14f45ff 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to DigitalPebble Ltd under one or more * contributor license 
agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -19,31 +19,27 @@ import org.apache.storm.topology.TopologyBuilder; import org.apache.storm.tuple.Fields; -import org.commoncrawl.stormcrawler.news.CrawlTopology; -import org.commoncrawl.stormcrawler.news.FeedDetectorBolt; -import org.slf4j.LoggerFactory; - import org.apache.stormcrawler.ConfigurableTopology; import org.apache.stormcrawler.Constants; import org.apache.stormcrawler.bolt.FetcherBolt; import org.apache.stormcrawler.bolt.JSoupParserBolt; import org.apache.stormcrawler.bolt.URLFilterBolt; import org.apache.stormcrawler.bolt.URLPartitionerBolt; +import org.apache.stormcrawler.indexing.DummyIndexer; import org.apache.stormcrawler.opensearch.persistence.AggregationSpout; import org.apache.stormcrawler.opensearch.persistence.StatusUpdaterBolt; -import org.apache.stormcrawler.indexing.DummyIndexer; import org.apache.stormcrawler.spout.FileSpout; import org.apache.stormcrawler.util.ConfUtils; import org.apache.stormcrawler.util.URLStreamGrouping; import org.apache.stormcrawler.warc.WARCHdfsBolt; +import org.commoncrawl.stormcrawler.news.CrawlTopology; +import org.commoncrawl.stormcrawler.news.FeedDetectorBolt; +import org.slf4j.LoggerFactory; -/** - * Dummy topology to play with the spouts and bolts on ElasticSearch - */ +/** Dummy topology to play with the spouts and bolts on ElasticSearch */ public class BootstrapTopology extends CrawlTopology { - private static final org.slf4j.Logger LOG = LoggerFactory - .getLogger(BootstrapTopology.class); + private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(BootstrapTopology.class); public static void main(String[] args) throws Exception { ConfigurableTopology.start(new BootstrapTopology(), args); @@ -53,11 +49,14 @@ public static void main(String[] args) throws Exception { protected int run(String[] args) { TopologyBuilder builder = new TopologyBuilder(); - 
LOG.debug("sitemap.sniffContent: {}", + LOG.debug( + "sitemap.sniffContent: {}", ConfUtils.getBoolean(getConf(), "sitemap.sniffContent", false)); - LOG.info("sitemap.sniffContent: {}", + LOG.info( + "sitemap.sniffContent: {}", ConfUtils.getBoolean(getConf(), "sitemap.sniffContent", false)); - LOG.warn("sitemap.sniffContent: {}", + LOG.warn( + "sitemap.sniffContent: {}", ConfUtils.getBoolean(getConf(), "sitemap.sniffContent", false)); int numWorkers = ConfUtils.getInt(getConf(), "topology.workers", 1); @@ -69,12 +68,11 @@ protected int run(String[] args) { if (args.length >= 2) { // arguments include seed directory and file pattern LOG.info("Injecting seeds from {} by pattern {}", args[0], args[1]); - builder.setSpout("filespout", - new FileSpout(args[0], args[1], true)); + builder.setSpout("filespout", new FileSpout(args[0], args[1], true)); Fields key = new Fields("url"); - builder.setBolt("filter", new URLFilterBolt()).fieldsGrouping( - "filespout", Constants.StatusStreamName, key); + builder.setBolt("filter", new URLFilterBolt()) + .fieldsGrouping("filespout", Constants.StatusStreamName, key); } builder.setSpout("spout", new AggregationSpout(), numShards); @@ -91,12 +89,10 @@ protected int run(String[] args) { builder.setBolt("feed", new FeedDetectorBolt(), numWorkers) .localOrShuffleGrouping("sitemap"); - builder.setBolt("parse", new JSoupParserBolt()) - .localOrShuffleGrouping("feed"); + builder.setBolt("parse", new JSoupParserBolt()).localOrShuffleGrouping("feed"); // don't need to parse the pages but need to update their status - builder.setBolt("ssb", new DummyIndexer(), numWorkers) - .localOrShuffleGrouping("parse"); + builder.setBolt("ssb", new DummyIndexer(), numWorkers).localOrShuffleGrouping("parse"); WARCHdfsBolt warcbolt = getWarcBolt("CC-NEWS-BOOTSTRAP"); @@ -109,8 +105,7 @@ protected int run(String[] args) { .localOrShuffleGrouping("parse", Constants.StatusStreamName) .localOrShuffleGrouping("ssb", Constants.StatusStreamName) 
.setNumTasks(numShards) - .customGrouping("filter", Constants.StatusStreamName, - new URLStreamGrouping()); + .customGrouping("filter", Constants.StatusStreamName, new URLStreamGrouping()); return submit(conf, builder); } diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java index 5707189..bb50291 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java @@ -14,19 +14,18 @@ package org.commoncrawl.stormcrawler.news.bootstrap; import java.util.ArrayList; - -import org.slf4j.LoggerFactory; -import org.w3c.dom.DocumentFragment; - import org.apache.stormcrawler.bolt.FeedParserBolt; import org.apache.stormcrawler.parse.Outlink; import org.apache.stormcrawler.parse.ParseResult; import org.apache.stormcrawler.parse.filter.LinkParseFilter; +import org.slf4j.LoggerFactory; +import org.w3c.dom.DocumentFragment; /** - * ParseFilter which extracts exclusively RSS links via Xpath, all other links - * are skipped. See {@link LinkParseFilter} how to register and configure in - * parsefilters.json. A configuration snippet: + * ParseFilter which extracts exclusively RSS links via Xpath, all other links are skipped. See + * {@link LinkParseFilter} how to register and configure in parsefilters.json. A configuration + * snippet: + * *
  *     {
  *      "class": "org.commoncrawl.stormcrawler.news.bootstrap.FeedLinkParseFilter",
@@ -41,12 +40,10 @@
  */
 public class FeedLinkParseFilter extends LinkParseFilter {
 
-    private static final org.slf4j.Logger LOG = LoggerFactory
-            .getLogger(FeedLinkParseFilter.class);
+    private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(FeedLinkParseFilter.class);
 
     @Override
-    public void filter(String URL, byte[] content, DocumentFragment doc,
-            ParseResult parse) {
+    public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) {
 
         // skip existing links
         logLinks(parse, URL, "Skipped links");
@@ -60,11 +57,8 @@ public void filter(String URL, byte[] content, DocumentFragment doc,
 
     public static void logLinks(ParseResult parse, String URL, String message) {
         if (LOG.isDebugEnabled() && parse.getOutlinks().size() > 0) {
-            if (!message.isEmpty())
-                LOG.debug("{} for {}:", message, URL);
-            for (Outlink outlink : parse.getOutlinks())
-                LOG.debug(outlink.getTargetURL());
+            if (!message.isEmpty()) LOG.debug("{} for {}:", message, URL);
+            for (Outlink outlink : parse.getOutlinks()) LOG.debug(outlink.getTargetURL());
         }
     }
-
 }
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/NewsSiteMapDetectorBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/NewsSiteMapDetectorBolt.java
index 1160201..f0af134 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/NewsSiteMapDetectorBolt.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/NewsSiteMapDetectorBolt.java
@@ -14,15 +14,10 @@
 package org.commoncrawl.stormcrawler.news.bootstrap;
 
 import java.util.Map;
-
 import org.apache.storm.task.OutputCollector;
 import org.apache.storm.task.TopologyContext;
 import org.apache.storm.tuple.Tuple;
 import org.apache.storm.tuple.Values;
-import org.commoncrawl.stormcrawler.news.ContentDetector;
-import org.commoncrawl.stormcrawler.news.NewsSiteMapParserBolt;
-import org.slf4j.LoggerFactory;
-
 import org.apache.stormcrawler.Constants;
 import org.apache.stormcrawler.Metadata;
 import org.apache.stormcrawler.bolt.SiteMapParserBolt;
@@ -31,25 +26,26 @@
 import org.apache.stormcrawler.parse.ParseFilters;
 import org.apache.stormcrawler.parse.ParseResult;
 import org.apache.stormcrawler.persistence.Status;
+import org.commoncrawl.stormcrawler.news.ContentDetector;
+import org.commoncrawl.stormcrawler.news.NewsSiteMapParserBolt;
+import org.slf4j.LoggerFactory;
 
 /**
- * Detector for news
+ * Detector for news
  * sitemaps and also sitemaps.
  */
 @SuppressWarnings("serial")
 public class NewsSiteMapDetectorBolt extends SiteMapParserBolt {
 
-    private static final org.slf4j.Logger LOG = LoggerFactory
-            .getLogger(NewsSiteMapDetectorBolt.class);
+    private static final org.slf4j.Logger LOG =
+            LoggerFactory.getLogger(NewsSiteMapDetectorBolt.class);
 
     protected static final int maxOffsetContentGuess = 1024;
-    private static ContentDetector contentDetector = new ContentDetector(
-            NewsSiteMapParserBolt.contentClues, maxOffsetContentGuess);
+    private static ContentDetector contentDetector =
+            new ContentDetector(NewsSiteMapParserBolt.contentClues, maxOffsetContentGuess);
 
     private ParseFilter parseFilters;
 
-
     @Override
     public void execute(Tuple tuple) {
         Metadata metadata = (Metadata) tuple.getValueByField("metadata");
@@ -57,10 +53,9 @@ public void execute(Tuple tuple) {
         byte[] content = tuple.getBinaryByField("content");
         String url = tuple.getStringByField("url");
 
-        boolean isSitemap = Boolean.valueOf(
-                metadata.getFirstValue(SiteMapParserBolt.isSitemapKey));
-        boolean isNewsSitemap = Boolean.valueOf(
-                metadata.getFirstValue(NewsSiteMapParserBolt.isSitemapNewsKey));
+        boolean isSitemap = Boolean.valueOf(metadata.getFirstValue(SiteMapParserBolt.isSitemapKey));
+        boolean isNewsSitemap =
+                Boolean.valueOf(metadata.getFirstValue(NewsSiteMapParserBolt.isSitemapNewsKey));
 
         if (!isNewsSitemap || !isSitemap) {
             int match = contentDetector.getFirstMatch(content);
@@ -70,10 +65,8 @@ public void execute(Tuple tuple) {
                 metadata.setValue(SiteMapParserBolt.isSitemapKey, "true");
                 if (match <= NewsSiteMapParserBolt.contentCluesSitemapNewsMatchUpTo) {
                     isNewsSitemap = true;
-                    LOG.info("{} detected as news sitemap based on content",
-                            url);
-                    metadata.setValue(NewsSiteMapParserBolt.isSitemapNewsKey,
-                            "true");
+                    LOG.info("{} detected as news sitemap based on content", url);
+                    metadata.setValue(NewsSiteMapParserBolt.isSitemapNewsKey, "true");
                 }
             }
         }
@@ -85,8 +78,8 @@ public void execute(Tuple tuple) {
             parseData.setMetadata(metadata);
             parseFilters.filter(url, content, null, parse);
             // emit status
-            collector.emit(Constants.StatusStreamName, tuple,
-                    new Values(url, metadata, Status.FETCHED));
+            collector.emit(
+                    Constants.StatusStreamName, tuple, new Values(url, metadata, Status.FETCHED));
         } else {
             // pass on
             collector.emit(tuple, tuple.getValues());
@@ -95,11 +88,9 @@ public void execute(Tuple tuple) {
     }
 
     @Override
-    @SuppressWarnings({ "rawtypes", "unchecked" })
-    public void prepare(Map stormConf, TopologyContext context,
-            OutputCollector collect) {
+    @SuppressWarnings({"rawtypes", "unchecked"})
+    public void prepare(Map stormConf, TopologyContext context, OutputCollector collect) {
         super.prepare(stormConf, context, collect);
         parseFilters = ParseFilters.fromConf(stormConf);
     }
-
 }
diff --git a/src/test/java/org/commoncrawl/stormcrawler/FastURLFilterTest.java b/src/test/java/org/commoncrawl/stormcrawler/FastURLFilterTest.java
index 64df0ba..58aaa04 100644
--- a/src/test/java/org/commoncrawl/stormcrawler/FastURLFilterTest.java
+++ b/src/test/java/org/commoncrawl/stormcrawler/FastURLFilterTest.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to DigitalPebble Ltd under one or more contributor license agreements. See the NOTICE
  * file distributed with this work for additional information regarding copyright ownership.
  * DigitalPebble licenses this file to You under the Apache License, Version 2.0 (the "License");
@@ -14,74 +14,72 @@
  */
 package org.commoncrawl.stormcrawler;
 
+import com.fasterxml.jackson.databind.node.JsonNodeFactory;
+import com.fasterxml.jackson.databind.node.ObjectNode;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.HashMap;
 import java.util.Map;
-
+import org.apache.stormcrawler.Metadata;
+import org.apache.stormcrawler.filtering.URLFilter;
 import org.commoncrawl.stormcrawler.filter.FastURLFilter;
 import org.junit.Assert;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
-import org.apache.stormcrawler.Metadata;
-import org.apache.stormcrawler.filtering.URLFilter;
-import com.fasterxml.jackson.databind.node.JsonNodeFactory;
-import com.fasterxml.jackson.databind.node.ObjectNode;
-
 public class FastURLFilterTest {
 
     protected static URLFilter filter;
 
     @BeforeClass
     public static void init() {
-	filter = createFilter("fast-urlfilter.txt");
+        filter = createFilter("fast-urlfilter.txt");
     }
 
     public static FastURLFilter createFilter(String fileName) {
-	ObjectNode filterParams = new ObjectNode(JsonNodeFactory.instance);
-	filterParams.put("file", fileName);
-	FastURLFilter filter = new FastURLFilter();
-	Map conf = new HashMap<>();
-	conf.put("fast.urlfilter.refresh", 10);
-	filter.configure(conf, filterParams);
-	return filter;
+        ObjectNode filterParams = new ObjectNode(JsonNodeFactory.instance);
+        filterParams.put("file", fileName);
+        FastURLFilter filter = new FastURLFilter();
+        Map conf = new HashMap<>();
+        conf.put("fast.urlfilter.refresh", 10);
+        filter.configure(conf, filterParams);
+        return filter;
     }
 
     @Test
     public void testHostFilter() throws MalformedURLException {
-	URL url = new URL("http://may.go.com/image.jpg");
-	Metadata metadata = new Metadata();
-	String filterResult = filter.filter(url, metadata, url.toExternalForm());
-	Assert.assertEquals(url.toString(), filterResult);
-	
-	url = new URL("http://no.go.com/");
-	filterResult = filter.filter(url, metadata, url.toExternalForm());
-	Assert.assertEquals(null, filterResult);
+        URL url = new URL("http://may.go.com/image.jpg");
+        Metadata metadata = new Metadata();
+        String filterResult = filter.filter(url, metadata, url.toExternalForm());
+        Assert.assertEquals(url.toString(), filterResult);
+
+        url = new URL("http://no.go.com/");
+        filterResult = filter.filter(url, metadata, url.toExternalForm());
+        Assert.assertEquals(null, filterResult);
     }
 
     @Test
     public void testDomainNotAllowed() throws MalformedURLException {
-	URL url = new URL("http://domainnotallowed.com/forum/search.php");
-	Metadata metadata = new Metadata();
-	String filterResult = filter.filter(url, metadata, url.toExternalForm());
-	Assert.assertEquals(null, filterResult);
-	
-	url = new URL("http://domainnotallowed.com/");
-	filterResult = filter.filter(url, metadata, url.toExternalForm());
-	Assert.assertEquals(null, filterResult);
-	
-	url = new URL("http://partiallyallowed.com/");
-	filterResult = filter.filter(url, metadata, url.toExternalForm());
-	Assert.assertEquals(url.toString(), filterResult);
-	
-	url = new URL("http://partiallyallowed.com/verbotten");
-	filterResult = filter.filter(url, metadata, url.toExternalForm());
-	Assert.assertEquals(null, filterResult);
+        URL url = new URL("http://domainnotallowed.com/forum/search.php");
+        Metadata metadata = new Metadata();
+        String filterResult = filter.filter(url, metadata, url.toExternalForm());
+        Assert.assertEquals(null, filterResult);
+
+        url = new URL("http://domainnotallowed.com/");
+        filterResult = filter.filter(url, metadata, url.toExternalForm());
+        Assert.assertEquals(null, filterResult);
+
+        url = new URL("http://partiallyallowed.com/");
+        filterResult = filter.filter(url, metadata, url.toExternalForm());
+        Assert.assertEquals(url.toString(), filterResult);
+
+        url = new URL("http://partiallyallowed.com/verbotten");
+        filterResult = filter.filter(url, metadata, url.toExternalForm());
+        Assert.assertEquals(null, filterResult);
 
-	// allowed
-	url = new URL("http://digitalpebble.com/");
-	filterResult = filter.filter(url, metadata, url.toExternalForm());
-	Assert.assertEquals(url.toString(), filterResult);
+        // allowed
+        url = new URL("http://digitalpebble.com/");
+        filterResult = filter.filter(url, metadata, url.toExternalForm());
+        Assert.assertEquals(url.toString(), filterResult);
     }
 }
diff --git a/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java b/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java
index db73d67..b0a0d5a 100644
--- a/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java
+++ b/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java
@@ -16,6 +16,7 @@
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotEquals;
 
+import crawlercommons.sitemaps.UnknownFormatException;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
@@ -25,75 +26,83 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-
 import org.apache.commons.io.IOUtils;
-import org.commoncrawl.stormcrawler.news.NewsSiteMapParserBolt.SitemapType;
-import org.junit.Before;
-import org.junit.Test;
-
 import org.apache.stormcrawler.Metadata;
 import org.apache.stormcrawler.parse.Outlink;
 import org.apache.stormcrawler.parse.ParsingTester;
-
-import crawlercommons.sitemaps.UnknownFormatException;
+import org.commoncrawl.stormcrawler.news.NewsSiteMapParserBolt.SitemapType;
+import org.junit.Before;
+import org.junit.Test;
 
 public class NewsSiteMapParserTest extends ParsingTester {
 
     @Before
     public void setupParserBolt() {
-	setupParserBolt(new NewsSiteMapParserBolt());
-	Map config = new HashMap<>();
-	config.put("sitemap.sniffContent", true);
-	// allow items published during the last week
-	config.put("sitemap.filter.hours.since.modified", 168);
-	prepareParserBolt("test.parsefilters.json", config);
+        setupParserBolt(new NewsSiteMapParserBolt());
+        Map config = new HashMap<>();
+        config.put("sitemap.sniffContent", true);
+        // allow items published during the last week
+        config.put("sitemap.filter.hours.since.modified", 168);
+        prepareParserBolt("test.parsefilters.json", config);
     }
 
     @Test
     public void testSiteMapParser() throws IOException, UnknownFormatException {
-	String url = "https://example.org/sitemap-news.xml";
-	byte[] content = readContent("sitemap-news.xml");
-	String contentType = "";
-	Metadata parentMetadata = new Metadata();
-	List links = new ArrayList<>();
-
-	SitemapType type = ((NewsSiteMapParserBolt) bolt).detectContent(url, content);
-	assertEquals(SitemapType.NEWS, type);
-
-	((NewsSiteMapParserBolt) bolt).parseSiteMap(url, content, contentType, parentMetadata, links);
-
-	// unmodified sitemap:
-	// - publication date is far in the past, link should be skipped
-	// 2008-12-23
-	assertEquals("Outdated link not skipped", 0, links.size());
-
-	// now set the publication date to yesterday
-	LocalDateTime yesterday = LocalDateTime.now().minusDays(1);
-	content = (new String(content, StandardCharsets.UTF_8))
-		.replace("2008-12-23", ""
-			+ yesterday.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + "")
-		.getBytes(StandardCharsets.UTF_8);
-	((NewsSiteMapParserBolt) bolt).parseSiteMap(url, content, contentType, parentMetadata, links);
-
-	assertEquals("Expected one  and one additional  link - image links are ignored", 2,
-		links.size());
+        String url = "https://example.org/sitemap-news.xml";
+        byte[] content = readContent("sitemap-news.xml");
+        String contentType = "";
+        Metadata parentMetadata = new Metadata();
+        List links = new ArrayList<>();
+
+        SitemapType type = ((NewsSiteMapParserBolt) bolt).detectContent(url, content);
+        assertEquals(SitemapType.NEWS, type);
+
+        ((NewsSiteMapParserBolt) bolt)
+                .parseSiteMap(url, content, contentType, parentMetadata, links);
+
+        // unmodified sitemap:
+        // - publication date is far in the past, link should be skipped
+        // 2008-12-23
+        assertEquals("Outdated link not skipped", 0, links.size());
+
+        // now set the publication date to yesterday
+        LocalDateTime yesterday = LocalDateTime.now().minusDays(1);
+        content =
+                (new String(content, StandardCharsets.UTF_8))
+                        .replace(
+                                "2008-12-23",
+                                ""
+                                        + yesterday.format(
+                                                DateTimeFormatter.ofPattern("yyyy-MM-dd"))
+                                        + "")
+                        .getBytes(StandardCharsets.UTF_8);
+        ((NewsSiteMapParserBolt) bolt)
+                .parseSiteMap(url, content, contentType, parentMetadata, links);
+
+        assertEquals(
+                "Expected one  and one additional  link - image links are ignored",
+                2,
+                links.size());
     }
 
     protected byte[] readContent(String filename) throws IOException {
-	ByteArrayOutputStream baos = new ByteArrayOutputStream();
-	IOUtils.copy(getClass().getClassLoader().getResourceAsStream(filename), baos);
-	return baos.toByteArray();
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        IOUtils.copy(getClass().getClassLoader().getResourceAsStream(filename), baos);
+        return baos.toByteArray();
     }
 
-	@Test
-     public void testFeedWithSitemapNamespace() throws IOException, UnknownFormatException {
-         String url = "https://example.org/feed.xml";
-		byte[] content = readContent("feed-with-sitemap-namespace.xml");
-         SitemapType type = ((NewsSiteMapParserBolt) bolt).detectContent(url, content);
-         assertNotEquals("RSS feed with sitemap namespace should not be detected as sitemap",
-                 SitemapType.NEWS, type);
-         assertNotEquals("RSS feed with sitemap namespace should not be detected as sitemap",
-                 SitemapType.SITEMAP, type);
-     }
-
+    @Test
+    public void testFeedWithSitemapNamespace() throws IOException, UnknownFormatException {
+        String url = "https://example.org/feed.xml";
+        byte[] content = readContent("feed-with-sitemap-namespace.xml");
+        SitemapType type = ((NewsSiteMapParserBolt) bolt).detectContent(url, content);
+        assertNotEquals(
+                "RSS feed with sitemap namespace should not be detected as sitemap",
+                SitemapType.NEWS,
+                type);
+        assertNotEquals(
+                "RSS feed with sitemap namespace should not be detected as sitemap",
+                SitemapType.SITEMAP,
+                type);
+    }
 }